sublim-phase4-combo-06 / trainer_state.json
eac123's picture
Upload final checkpoint (checkpoint-804)
0a6039b verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 804,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.1324340403079987,
"epoch": 0.0037313432835820895,
"grad_norm": 1.6067556142807007,
"learning_rate": 0.0002,
"loss": 2.4804701805114746,
"mean_token_accuracy": 0.5353229343891144,
"num_tokens": 16370.0,
"step": 1
},
{
"entropy": 1.2276706099510193,
"epoch": 0.007462686567164179,
"grad_norm": 1.4987447261810303,
"learning_rate": 0.0002,
"loss": 2.135417938232422,
"mean_token_accuracy": 0.5693617165088654,
"num_tokens": 33043.0,
"step": 2
},
{
"entropy": 1.4045527577400208,
"epoch": 0.011194029850746268,
"grad_norm": 1.1359604597091675,
"learning_rate": 0.0002,
"loss": 1.72599196434021,
"mean_token_accuracy": 0.5919849574565887,
"num_tokens": 49458.0,
"step": 3
},
{
"entropy": 1.3863026201725006,
"epoch": 0.014925373134328358,
"grad_norm": 0.9200887084007263,
"learning_rate": 0.0002,
"loss": 1.4096770286560059,
"mean_token_accuracy": 0.6369052678346634,
"num_tokens": 65795.0,
"step": 4
},
{
"entropy": 1.331774890422821,
"epoch": 0.018656716417910446,
"grad_norm": 1.2737244367599487,
"learning_rate": 0.0002,
"loss": 1.2862391471862793,
"mean_token_accuracy": 0.6422256380319595,
"num_tokens": 82033.0,
"step": 5
},
{
"entropy": 1.2540993690490723,
"epoch": 0.022388059701492536,
"grad_norm": 0.6736201643943787,
"learning_rate": 0.0002,
"loss": 1.1756055355072021,
"mean_token_accuracy": 0.6605449765920639,
"num_tokens": 97997.0,
"step": 6
},
{
"entropy": 1.169641524553299,
"epoch": 0.026119402985074626,
"grad_norm": 0.3927549719810486,
"learning_rate": 0.0002,
"loss": 1.1019014120101929,
"mean_token_accuracy": 0.6672378480434418,
"num_tokens": 114186.0,
"step": 7
},
{
"entropy": 1.0887874066829681,
"epoch": 0.029850746268656716,
"grad_norm": 0.4364261329174042,
"learning_rate": 0.0002,
"loss": 1.0323972702026367,
"mean_token_accuracy": 0.6782350987195969,
"num_tokens": 130751.0,
"step": 8
},
{
"entropy": 1.0042430609464645,
"epoch": 0.033582089552238806,
"grad_norm": 0.5108282566070557,
"learning_rate": 0.0002,
"loss": 0.9582932591438293,
"mean_token_accuracy": 0.692020371556282,
"num_tokens": 147264.0,
"step": 9
},
{
"entropy": 0.9632741063833237,
"epoch": 0.03731343283582089,
"grad_norm": 0.4669722616672516,
"learning_rate": 0.0002,
"loss": 0.8919203877449036,
"mean_token_accuracy": 0.7046539932489395,
"num_tokens": 163507.0,
"step": 10
},
{
"entropy": 0.9305494576692581,
"epoch": 0.041044776119402986,
"grad_norm": 0.4794766902923584,
"learning_rate": 0.0002,
"loss": 0.8569780588150024,
"mean_token_accuracy": 0.7103458344936371,
"num_tokens": 179680.0,
"step": 11
},
{
"entropy": 0.8464002013206482,
"epoch": 0.04477611940298507,
"grad_norm": 0.396366685628891,
"learning_rate": 0.0002,
"loss": 0.7772667407989502,
"mean_token_accuracy": 0.7248742878437042,
"num_tokens": 196084.0,
"step": 12
},
{
"entropy": 0.8053079694509506,
"epoch": 0.048507462686567165,
"grad_norm": 3.4283485412597656,
"learning_rate": 0.0002,
"loss": 0.7701212763786316,
"mean_token_accuracy": 0.7237996459007263,
"num_tokens": 212421.0,
"step": 13
},
{
"entropy": 0.7701881229877472,
"epoch": 0.05223880597014925,
"grad_norm": 0.4621308147907257,
"learning_rate": 0.0002,
"loss": 0.7581663727760315,
"mean_token_accuracy": 0.725386381149292,
"num_tokens": 228835.0,
"step": 14
},
{
"entropy": 0.7058936208486557,
"epoch": 0.055970149253731345,
"grad_norm": 0.45394617319107056,
"learning_rate": 0.0002,
"loss": 0.7281949520111084,
"mean_token_accuracy": 0.731869712471962,
"num_tokens": 245106.0,
"step": 15
},
{
"entropy": 0.7007950246334076,
"epoch": 0.05970149253731343,
"grad_norm": 0.38048553466796875,
"learning_rate": 0.0002,
"loss": 0.6906558871269226,
"mean_token_accuracy": 0.7422550022602081,
"num_tokens": 261510.0,
"step": 16
},
{
"entropy": 0.6775622367858887,
"epoch": 0.06343283582089553,
"grad_norm": 0.3588451147079468,
"learning_rate": 0.0002,
"loss": 0.6660153865814209,
"mean_token_accuracy": 0.7494668215513229,
"num_tokens": 278002.0,
"step": 17
},
{
"entropy": 0.6844813376665115,
"epoch": 0.06716417910447761,
"grad_norm": 0.34310266375541687,
"learning_rate": 0.0002,
"loss": 0.6606006026268005,
"mean_token_accuracy": 0.745672732591629,
"num_tokens": 294482.0,
"step": 18
},
{
"entropy": 0.6752376109361649,
"epoch": 0.0708955223880597,
"grad_norm": 0.3563651740550995,
"learning_rate": 0.0002,
"loss": 0.6529812216758728,
"mean_token_accuracy": 0.7467419356107712,
"num_tokens": 310804.0,
"step": 19
},
{
"entropy": 0.655072346329689,
"epoch": 0.07462686567164178,
"grad_norm": 0.30358463525772095,
"learning_rate": 0.0002,
"loss": 0.6404100656509399,
"mean_token_accuracy": 0.7505071759223938,
"num_tokens": 327252.0,
"step": 20
},
{
"entropy": 0.6286358386278152,
"epoch": 0.07835820895522388,
"grad_norm": 0.30567091703414917,
"learning_rate": 0.0002,
"loss": 0.6207510232925415,
"mean_token_accuracy": 0.7580177336931229,
"num_tokens": 343737.0,
"step": 21
},
{
"entropy": 0.6086345314979553,
"epoch": 0.08208955223880597,
"grad_norm": 0.27747389674186707,
"learning_rate": 0.0002,
"loss": 0.6111672520637512,
"mean_token_accuracy": 0.760840117931366,
"num_tokens": 359961.0,
"step": 22
},
{
"entropy": 0.5925645977258682,
"epoch": 0.08582089552238806,
"grad_norm": 0.25484028458595276,
"learning_rate": 0.0002,
"loss": 0.5915433168411255,
"mean_token_accuracy": 0.7686687558889389,
"num_tokens": 376034.0,
"step": 23
},
{
"entropy": 0.6192648261785507,
"epoch": 0.08955223880597014,
"grad_norm": 0.2309548258781433,
"learning_rate": 0.0002,
"loss": 0.6154056787490845,
"mean_token_accuracy": 0.7575328648090363,
"num_tokens": 392454.0,
"step": 24
},
{
"entropy": 0.6046310663223267,
"epoch": 0.09328358208955224,
"grad_norm": 0.24919550120830536,
"learning_rate": 0.0002,
"loss": 0.5856317281723022,
"mean_token_accuracy": 0.769055038690567,
"num_tokens": 408673.0,
"step": 25
},
{
"entropy": 0.6073041707277298,
"epoch": 0.09701492537313433,
"grad_norm": 0.22897422313690186,
"learning_rate": 0.0002,
"loss": 0.6000080108642578,
"mean_token_accuracy": 0.7657780200242996,
"num_tokens": 425147.0,
"step": 26
},
{
"entropy": 0.5694791227579117,
"epoch": 0.10074626865671642,
"grad_norm": 0.26130226254463196,
"learning_rate": 0.0002,
"loss": 0.5651018619537354,
"mean_token_accuracy": 0.7780718505382538,
"num_tokens": 441676.0,
"step": 27
},
{
"entropy": 0.5705035477876663,
"epoch": 0.1044776119402985,
"grad_norm": 0.2569018304347992,
"learning_rate": 0.0002,
"loss": 0.5736910700798035,
"mean_token_accuracy": 0.7736188471317291,
"num_tokens": 457862.0,
"step": 28
},
{
"entropy": 0.5686106830835342,
"epoch": 0.10820895522388059,
"grad_norm": 0.24455995857715607,
"learning_rate": 0.0002,
"loss": 0.5789230465888977,
"mean_token_accuracy": 0.7694863677024841,
"num_tokens": 473929.0,
"step": 29
},
{
"entropy": 0.5674358904361725,
"epoch": 0.11194029850746269,
"grad_norm": 0.2457604557275772,
"learning_rate": 0.0002,
"loss": 0.581587553024292,
"mean_token_accuracy": 0.7700542360544205,
"num_tokens": 490261.0,
"step": 30
},
{
"entropy": 0.5924967974424362,
"epoch": 0.11567164179104478,
"grad_norm": 0.24704386293888092,
"learning_rate": 0.0002,
"loss": 0.5963209271430969,
"mean_token_accuracy": 0.7627938687801361,
"num_tokens": 506614.0,
"step": 31
},
{
"entropy": 0.5728770643472672,
"epoch": 0.11940298507462686,
"grad_norm": 0.24360406398773193,
"learning_rate": 0.0002,
"loss": 0.570555567741394,
"mean_token_accuracy": 0.7713408023118973,
"num_tokens": 523175.0,
"step": 32
},
{
"entropy": 0.5846883952617645,
"epoch": 0.12313432835820895,
"grad_norm": 0.20197518169879913,
"learning_rate": 0.0002,
"loss": 0.5723189115524292,
"mean_token_accuracy": 0.7742884606122971,
"num_tokens": 539383.0,
"step": 33
},
{
"entropy": 0.5598815232515335,
"epoch": 0.12686567164179105,
"grad_norm": 0.25282159447669983,
"learning_rate": 0.0002,
"loss": 0.5645520687103271,
"mean_token_accuracy": 0.7759677618741989,
"num_tokens": 555484.0,
"step": 34
},
{
"entropy": 0.5746279805898666,
"epoch": 0.13059701492537312,
"grad_norm": 0.20525087416172028,
"learning_rate": 0.0002,
"loss": 0.5774482488632202,
"mean_token_accuracy": 0.7711690366268158,
"num_tokens": 572050.0,
"step": 35
},
{
"entropy": 0.5689367800951004,
"epoch": 0.13432835820895522,
"grad_norm": 0.2016289383172989,
"learning_rate": 0.0002,
"loss": 0.5688468217849731,
"mean_token_accuracy": 0.7752531915903091,
"num_tokens": 588229.0,
"step": 36
},
{
"entropy": 0.5673371106386185,
"epoch": 0.13805970149253732,
"grad_norm": 0.20251700282096863,
"learning_rate": 0.0002,
"loss": 0.5676092505455017,
"mean_token_accuracy": 0.7740599513053894,
"num_tokens": 604842.0,
"step": 37
},
{
"entropy": 0.5538036525249481,
"epoch": 0.1417910447761194,
"grad_norm": 0.18855363130569458,
"learning_rate": 0.0002,
"loss": 0.5636182427406311,
"mean_token_accuracy": 0.7732492536306381,
"num_tokens": 621334.0,
"step": 38
},
{
"entropy": 0.5772293359041214,
"epoch": 0.1455223880597015,
"grad_norm": 0.1829119771718979,
"learning_rate": 0.0002,
"loss": 0.5749870538711548,
"mean_token_accuracy": 0.7699291855096817,
"num_tokens": 637861.0,
"step": 39
},
{
"entropy": 0.5583464652299881,
"epoch": 0.14925373134328357,
"grad_norm": 0.16470657289028168,
"learning_rate": 0.0002,
"loss": 0.5537322163581848,
"mean_token_accuracy": 0.7790806740522385,
"num_tokens": 653894.0,
"step": 40
},
{
"entropy": 0.5681058615446091,
"epoch": 0.15298507462686567,
"grad_norm": 0.17573200166225433,
"learning_rate": 0.0002,
"loss": 0.5643278360366821,
"mean_token_accuracy": 0.7733141183853149,
"num_tokens": 670015.0,
"step": 41
},
{
"entropy": 0.566686749458313,
"epoch": 0.15671641791044777,
"grad_norm": 0.16218754649162292,
"learning_rate": 0.0002,
"loss": 0.5597659945487976,
"mean_token_accuracy": 0.7758253067731857,
"num_tokens": 686056.0,
"step": 42
},
{
"entropy": 0.5558898448944092,
"epoch": 0.16044776119402984,
"grad_norm": 0.18278591334819794,
"learning_rate": 0.0002,
"loss": 0.558386504650116,
"mean_token_accuracy": 0.7759624123573303,
"num_tokens": 702659.0,
"step": 43
},
{
"entropy": 0.5585661381483078,
"epoch": 0.16417910447761194,
"grad_norm": 0.17696230113506317,
"learning_rate": 0.0002,
"loss": 0.5635029673576355,
"mean_token_accuracy": 0.7751695066690445,
"num_tokens": 718850.0,
"step": 44
},
{
"entropy": 0.5506571680307388,
"epoch": 0.16791044776119404,
"grad_norm": 0.1652524471282959,
"learning_rate": 0.0002,
"loss": 0.5565558671951294,
"mean_token_accuracy": 0.7778312116861343,
"num_tokens": 735246.0,
"step": 45
},
{
"entropy": 0.5514795780181885,
"epoch": 0.17164179104477612,
"grad_norm": 0.18487824499607086,
"learning_rate": 0.0002,
"loss": 0.5487773418426514,
"mean_token_accuracy": 0.7793762385845184,
"num_tokens": 751565.0,
"step": 46
},
{
"entropy": 0.5588273853063583,
"epoch": 0.17537313432835822,
"grad_norm": 0.19246406853199005,
"learning_rate": 0.0002,
"loss": 0.5596141219139099,
"mean_token_accuracy": 0.7778225541114807,
"num_tokens": 767932.0,
"step": 47
},
{
"entropy": 0.5591737627983093,
"epoch": 0.1791044776119403,
"grad_norm": 0.15891006588935852,
"learning_rate": 0.0002,
"loss": 0.5638841390609741,
"mean_token_accuracy": 0.7727467268705368,
"num_tokens": 784014.0,
"step": 48
},
{
"entropy": 0.5501811355352402,
"epoch": 0.1828358208955224,
"grad_norm": 0.16706983745098114,
"learning_rate": 0.0002,
"loss": 0.5501376986503601,
"mean_token_accuracy": 0.7761423140764236,
"num_tokens": 800374.0,
"step": 49
},
{
"entropy": 0.5606948286294937,
"epoch": 0.1865671641791045,
"grad_norm": 0.17230357229709625,
"learning_rate": 0.0002,
"loss": 0.5634580850601196,
"mean_token_accuracy": 0.7727725654840469,
"num_tokens": 816520.0,
"step": 50
},
{
"entropy": 0.5541675686836243,
"epoch": 0.19029850746268656,
"grad_norm": 0.1744348555803299,
"learning_rate": 0.0002,
"loss": 0.5664834380149841,
"mean_token_accuracy": 0.7722806632518768,
"num_tokens": 832574.0,
"step": 51
},
{
"entropy": 0.5447754859924316,
"epoch": 0.19402985074626866,
"grad_norm": 0.1993291825056076,
"learning_rate": 0.0002,
"loss": 0.5500599145889282,
"mean_token_accuracy": 0.7812339067459106,
"num_tokens": 848524.0,
"step": 52
},
{
"entropy": 0.55513696372509,
"epoch": 0.19776119402985073,
"grad_norm": 0.18667836487293243,
"learning_rate": 0.0002,
"loss": 0.5566352605819702,
"mean_token_accuracy": 0.7776180505752563,
"num_tokens": 864701.0,
"step": 53
},
{
"entropy": 0.5591137707233429,
"epoch": 0.20149253731343283,
"grad_norm": 0.1556427925825119,
"learning_rate": 0.0002,
"loss": 0.5615472197532654,
"mean_token_accuracy": 0.7761439085006714,
"num_tokens": 881019.0,
"step": 54
},
{
"entropy": 0.5678103417158127,
"epoch": 0.20522388059701493,
"grad_norm": 0.176001638174057,
"learning_rate": 0.0002,
"loss": 0.5604614615440369,
"mean_token_accuracy": 0.7737350314855576,
"num_tokens": 897731.0,
"step": 55
},
{
"entropy": 0.5736003369092941,
"epoch": 0.208955223880597,
"grad_norm": 0.17963656783103943,
"learning_rate": 0.0002,
"loss": 0.5741879940032959,
"mean_token_accuracy": 0.7709980905056,
"num_tokens": 914031.0,
"step": 56
},
{
"entropy": 0.5704395622014999,
"epoch": 0.2126865671641791,
"grad_norm": 0.15910783410072327,
"learning_rate": 0.0002,
"loss": 0.571160078048706,
"mean_token_accuracy": 0.7722027599811554,
"num_tokens": 930606.0,
"step": 57
},
{
"entropy": 0.5746669173240662,
"epoch": 0.21641791044776118,
"grad_norm": 0.15874247252941132,
"learning_rate": 0.0002,
"loss": 0.5674406886100769,
"mean_token_accuracy": 0.7708650529384613,
"num_tokens": 947244.0,
"step": 58
},
{
"entropy": 0.5582200437784195,
"epoch": 0.22014925373134328,
"grad_norm": 0.16829723119735718,
"learning_rate": 0.0002,
"loss": 0.5581406950950623,
"mean_token_accuracy": 0.7757681459188461,
"num_tokens": 963619.0,
"step": 59
},
{
"entropy": 0.5504408478736877,
"epoch": 0.22388059701492538,
"grad_norm": 0.14540037512779236,
"learning_rate": 0.0002,
"loss": 0.5557159781455994,
"mean_token_accuracy": 0.776930645108223,
"num_tokens": 980040.0,
"step": 60
},
{
"entropy": 0.5402641594409943,
"epoch": 0.22761194029850745,
"grad_norm": 0.14897902309894562,
"learning_rate": 0.0002,
"loss": 0.5523658394813538,
"mean_token_accuracy": 0.7773705869913101,
"num_tokens": 996383.0,
"step": 61
},
{
"entropy": 0.5391396135091782,
"epoch": 0.23134328358208955,
"grad_norm": 0.16873425245285034,
"learning_rate": 0.0002,
"loss": 0.5509910583496094,
"mean_token_accuracy": 0.7777218073606491,
"num_tokens": 1012664.0,
"step": 62
},
{
"entropy": 0.5582114011049271,
"epoch": 0.23507462686567165,
"grad_norm": 0.1502108871936798,
"learning_rate": 0.0002,
"loss": 0.5559942126274109,
"mean_token_accuracy": 0.7745993584394455,
"num_tokens": 1029022.0,
"step": 63
},
{
"entropy": 0.5812249481678009,
"epoch": 0.23880597014925373,
"grad_norm": 0.13852274417877197,
"learning_rate": 0.0002,
"loss": 0.5768259167671204,
"mean_token_accuracy": 0.766035184264183,
"num_tokens": 1045337.0,
"step": 64
},
{
"entropy": 0.555647611618042,
"epoch": 0.24253731343283583,
"grad_norm": 0.1643349826335907,
"learning_rate": 0.0002,
"loss": 0.5524765849113464,
"mean_token_accuracy": 0.7790125608444214,
"num_tokens": 1061843.0,
"step": 65
},
{
"entropy": 0.5712831914424896,
"epoch": 0.2462686567164179,
"grad_norm": 0.1458103060722351,
"learning_rate": 0.0002,
"loss": 0.5671954154968262,
"mean_token_accuracy": 0.7726651430130005,
"num_tokens": 1078313.0,
"step": 66
},
{
"entropy": 0.548685610294342,
"epoch": 0.25,
"grad_norm": 0.13704419136047363,
"learning_rate": 0.0002,
"loss": 0.5478826761245728,
"mean_token_accuracy": 0.7788915038108826,
"num_tokens": 1094803.0,
"step": 67
},
{
"entropy": 0.5427667200565338,
"epoch": 0.2537313432835821,
"grad_norm": 0.16616535186767578,
"learning_rate": 0.0002,
"loss": 0.5495492815971375,
"mean_token_accuracy": 0.7795749753713608,
"num_tokens": 1111058.0,
"step": 68
},
{
"entropy": 0.5463619232177734,
"epoch": 0.2574626865671642,
"grad_norm": 0.1541680544614792,
"learning_rate": 0.0002,
"loss": 0.5557973980903625,
"mean_token_accuracy": 0.7797737270593643,
"num_tokens": 1127187.0,
"step": 69
},
{
"entropy": 0.5503609925508499,
"epoch": 0.26119402985074625,
"grad_norm": 0.16344738006591797,
"learning_rate": 0.0002,
"loss": 0.5560310482978821,
"mean_token_accuracy": 0.7764633148908615,
"num_tokens": 1143517.0,
"step": 70
},
{
"entropy": 0.564177006483078,
"epoch": 0.26492537313432835,
"grad_norm": 0.1369864046573639,
"learning_rate": 0.0002,
"loss": 0.5619618892669678,
"mean_token_accuracy": 0.774873822927475,
"num_tokens": 1160191.0,
"step": 71
},
{
"entropy": 0.5624472498893738,
"epoch": 0.26865671641791045,
"grad_norm": 0.16099311411380768,
"learning_rate": 0.0002,
"loss": 0.5546153783798218,
"mean_token_accuracy": 0.7775298207998276,
"num_tokens": 1176379.0,
"step": 72
},
{
"entropy": 0.5442378669977188,
"epoch": 0.27238805970149255,
"grad_norm": 0.18382063508033752,
"learning_rate": 0.0002,
"loss": 0.5439026951789856,
"mean_token_accuracy": 0.7808986604213715,
"num_tokens": 1192611.0,
"step": 73
},
{
"entropy": 0.5539779812097549,
"epoch": 0.27611940298507465,
"grad_norm": 0.14527475833892822,
"learning_rate": 0.0002,
"loss": 0.5488794445991516,
"mean_token_accuracy": 0.7770136892795563,
"num_tokens": 1209218.0,
"step": 74
},
{
"entropy": 0.5399174243211746,
"epoch": 0.2798507462686567,
"grad_norm": 0.16744667291641235,
"learning_rate": 0.0002,
"loss": 0.5474289059638977,
"mean_token_accuracy": 0.7779674381017685,
"num_tokens": 1225760.0,
"step": 75
},
{
"entropy": 0.5410275682806969,
"epoch": 0.2835820895522388,
"grad_norm": 0.1709633320569992,
"learning_rate": 0.0002,
"loss": 0.548405110836029,
"mean_token_accuracy": 0.7785314917564392,
"num_tokens": 1242263.0,
"step": 76
},
{
"entropy": 0.5613621175289154,
"epoch": 0.2873134328358209,
"grad_norm": 0.13462653756141663,
"learning_rate": 0.0002,
"loss": 0.5592188835144043,
"mean_token_accuracy": 0.7736580222845078,
"num_tokens": 1258802.0,
"step": 77
},
{
"entropy": 0.5370856672525406,
"epoch": 0.291044776119403,
"grad_norm": 0.14010556042194366,
"learning_rate": 0.0002,
"loss": 0.5362333655357361,
"mean_token_accuracy": 0.7829223275184631,
"num_tokens": 1274985.0,
"step": 78
},
{
"entropy": 0.5476308465003967,
"epoch": 0.2947761194029851,
"grad_norm": 0.14489887654781342,
"learning_rate": 0.0002,
"loss": 0.549788236618042,
"mean_token_accuracy": 0.7797223776578903,
"num_tokens": 1291341.0,
"step": 79
},
{
"entropy": 0.5441256165504456,
"epoch": 0.29850746268656714,
"grad_norm": 0.14331087470054626,
"learning_rate": 0.0002,
"loss": 0.5457456111907959,
"mean_token_accuracy": 0.7812238931655884,
"num_tokens": 1307441.0,
"step": 80
},
{
"entropy": 0.5347439795732498,
"epoch": 0.30223880597014924,
"grad_norm": 0.13690398633480072,
"learning_rate": 0.0002,
"loss": 0.5451613068580627,
"mean_token_accuracy": 0.7763567119836807,
"num_tokens": 1323409.0,
"step": 81
},
{
"entropy": 0.5473417937755585,
"epoch": 0.30597014925373134,
"grad_norm": 0.16063734889030457,
"learning_rate": 0.0002,
"loss": 0.5565767288208008,
"mean_token_accuracy": 0.7768999934196472,
"num_tokens": 1339750.0,
"step": 82
},
{
"entropy": 0.5419514924287796,
"epoch": 0.30970149253731344,
"grad_norm": 0.16186301410198212,
"learning_rate": 0.0002,
"loss": 0.5480918288230896,
"mean_token_accuracy": 0.7810427248477936,
"num_tokens": 1355977.0,
"step": 83
},
{
"entropy": 0.5665269196033478,
"epoch": 0.31343283582089554,
"grad_norm": 0.14284147322177887,
"learning_rate": 0.0002,
"loss": 0.5600348711013794,
"mean_token_accuracy": 0.7740004658699036,
"num_tokens": 1372396.0,
"step": 84
},
{
"entropy": 0.5530648082494736,
"epoch": 0.31716417910447764,
"grad_norm": 0.1373152732849121,
"learning_rate": 0.0002,
"loss": 0.547944962978363,
"mean_token_accuracy": 0.7793020755052567,
"num_tokens": 1388474.0,
"step": 85
},
{
"entropy": 0.5625097453594208,
"epoch": 0.3208955223880597,
"grad_norm": 0.1248691976070404,
"learning_rate": 0.0002,
"loss": 0.5582663416862488,
"mean_token_accuracy": 0.7758172750473022,
"num_tokens": 1404880.0,
"step": 86
},
{
"entropy": 0.5460606664419174,
"epoch": 0.3246268656716418,
"grad_norm": 0.16231709718704224,
"learning_rate": 0.0002,
"loss": 0.5510202646255493,
"mean_token_accuracy": 0.7779169529676437,
"num_tokens": 1421168.0,
"step": 87
},
{
"entropy": 0.5403235554695129,
"epoch": 0.3283582089552239,
"grad_norm": 0.15352240204811096,
"learning_rate": 0.0002,
"loss": 0.5474361181259155,
"mean_token_accuracy": 0.7786824256181717,
"num_tokens": 1437433.0,
"step": 88
},
{
"entropy": 0.550665482878685,
"epoch": 0.332089552238806,
"grad_norm": 0.17033375799655914,
"learning_rate": 0.0002,
"loss": 0.5535221695899963,
"mean_token_accuracy": 0.7792181968688965,
"num_tokens": 1453476.0,
"step": 89
},
{
"entropy": 0.563551127910614,
"epoch": 0.3358208955223881,
"grad_norm": 0.13113154470920563,
"learning_rate": 0.0002,
"loss": 0.5608611106872559,
"mean_token_accuracy": 0.7760418206453323,
"num_tokens": 1469909.0,
"step": 90
},
{
"entropy": 0.5737572461366653,
"epoch": 0.33955223880597013,
"grad_norm": 0.12551374733448029,
"learning_rate": 0.0002,
"loss": 0.5643397569656372,
"mean_token_accuracy": 0.7728746980428696,
"num_tokens": 1486426.0,
"step": 91
},
{
"entropy": 0.5659501850605011,
"epoch": 0.34328358208955223,
"grad_norm": 0.15791846811771393,
"learning_rate": 0.0002,
"loss": 0.5704576969146729,
"mean_token_accuracy": 0.7684866786003113,
"num_tokens": 1502522.0,
"step": 92
},
{
"entropy": 0.5568918883800507,
"epoch": 0.34701492537313433,
"grad_norm": 0.14071005582809448,
"learning_rate": 0.0002,
"loss": 0.559943437576294,
"mean_token_accuracy": 0.7734934538602829,
"num_tokens": 1518718.0,
"step": 93
},
{
"entropy": 0.5584161728620529,
"epoch": 0.35074626865671643,
"grad_norm": 0.14257407188415527,
"learning_rate": 0.0002,
"loss": 0.5574990510940552,
"mean_token_accuracy": 0.7743052095174789,
"num_tokens": 1534997.0,
"step": 94
},
{
"entropy": 0.5583510845899582,
"epoch": 0.35447761194029853,
"grad_norm": 0.13653768599033356,
"learning_rate": 0.0002,
"loss": 0.5597235560417175,
"mean_token_accuracy": 0.7758298218250275,
"num_tokens": 1551457.0,
"step": 95
},
{
"entropy": 0.5537077486515045,
"epoch": 0.3582089552238806,
"grad_norm": 0.14674222469329834,
"learning_rate": 0.0002,
"loss": 0.5539477467536926,
"mean_token_accuracy": 0.7744529694318771,
"num_tokens": 1567731.0,
"step": 96
},
{
"entropy": 0.5472210198640823,
"epoch": 0.3619402985074627,
"grad_norm": 0.1276751160621643,
"learning_rate": 0.0002,
"loss": 0.5464935898780823,
"mean_token_accuracy": 0.7826344817876816,
"num_tokens": 1584021.0,
"step": 97
},
{
"entropy": 0.5479029715061188,
"epoch": 0.3656716417910448,
"grad_norm": 0.16119465231895447,
"learning_rate": 0.0002,
"loss": 0.5547060966491699,
"mean_token_accuracy": 0.7760697901248932,
"num_tokens": 1600533.0,
"step": 98
},
{
"entropy": 0.5536443293094635,
"epoch": 0.3694029850746269,
"grad_norm": 0.12991106510162354,
"learning_rate": 0.0002,
"loss": 0.5573412775993347,
"mean_token_accuracy": 0.7744511961936951,
"num_tokens": 1616690.0,
"step": 99
},
{
"entropy": 0.5505102574825287,
"epoch": 0.373134328358209,
"grad_norm": 0.1364317238330841,
"learning_rate": 0.0002,
"loss": 0.5571202635765076,
"mean_token_accuracy": 0.7761907130479813,
"num_tokens": 1632957.0,
"step": 100
},
{
"entropy": 0.5503265261650085,
"epoch": 0.376865671641791,
"grad_norm": 0.14918965101242065,
"learning_rate": 0.0002,
"loss": 0.5452536344528198,
"mean_token_accuracy": 0.7773023992776871,
"num_tokens": 1649397.0,
"step": 101
},
{
"entropy": 0.5523863285779953,
"epoch": 0.3805970149253731,
"grad_norm": 0.14225420355796814,
"learning_rate": 0.0002,
"loss": 0.5425117611885071,
"mean_token_accuracy": 0.7800490856170654,
"num_tokens": 1665876.0,
"step": 102
},
{
"entropy": 0.5518430918455124,
"epoch": 0.3843283582089552,
"grad_norm": 0.12764710187911987,
"learning_rate": 0.0002,
"loss": 0.5529345870018005,
"mean_token_accuracy": 0.7768139094114304,
"num_tokens": 1682296.0,
"step": 103
},
{
"entropy": 0.5581493228673935,
"epoch": 0.3880597014925373,
"grad_norm": 0.16170883178710938,
"learning_rate": 0.0002,
"loss": 0.5702566504478455,
"mean_token_accuracy": 0.7671579420566559,
"num_tokens": 1698550.0,
"step": 104
},
{
"entropy": 0.558798760175705,
"epoch": 0.3917910447761194,
"grad_norm": 0.14736565947532654,
"learning_rate": 0.0002,
"loss": 0.5634024143218994,
"mean_token_accuracy": 0.7718724012374878,
"num_tokens": 1714882.0,
"step": 105
},
{
"entropy": 0.5496668964624405,
"epoch": 0.39552238805970147,
"grad_norm": 0.150962695479393,
"learning_rate": 0.0002,
"loss": 0.5452749133110046,
"mean_token_accuracy": 0.7789688110351562,
"num_tokens": 1731436.0,
"step": 106
},
{
"entropy": 0.5397633910179138,
"epoch": 0.39925373134328357,
"grad_norm": 0.12951846420764923,
"learning_rate": 0.0002,
"loss": 0.5374678373336792,
"mean_token_accuracy": 0.7823840379714966,
"num_tokens": 1747667.0,
"step": 107
},
{
"entropy": 0.5504965782165527,
"epoch": 0.40298507462686567,
"grad_norm": 0.1469883769750595,
"learning_rate": 0.0002,
"loss": 0.5489968061447144,
"mean_token_accuracy": 0.7779988348484039,
"num_tokens": 1763956.0,
"step": 108
},
{
"entropy": 0.5401955544948578,
"epoch": 0.40671641791044777,
"grad_norm": 0.14114412665367126,
"learning_rate": 0.0002,
"loss": 0.5469740033149719,
"mean_token_accuracy": 0.7791216820478439,
"num_tokens": 1780050.0,
"step": 109
},
{
"entropy": 0.5623095035552979,
"epoch": 0.41044776119402987,
"grad_norm": 0.12923510372638702,
"learning_rate": 0.0002,
"loss": 0.5578881502151489,
"mean_token_accuracy": 0.7777072787284851,
"num_tokens": 1796820.0,
"step": 110
},
{
"entropy": 0.5413771942257881,
"epoch": 0.4141791044776119,
"grad_norm": 0.1528160274028778,
"learning_rate": 0.0002,
"loss": 0.5452436208724976,
"mean_token_accuracy": 0.7776108086109161,
"num_tokens": 1813232.0,
"step": 111
},
{
"entropy": 0.5609131902456284,
"epoch": 0.417910447761194,
"grad_norm": 0.12400584667921066,
"learning_rate": 0.0002,
"loss": 0.5644053816795349,
"mean_token_accuracy": 0.7719212174415588,
"num_tokens": 1829542.0,
"step": 112
},
{
"entropy": 0.543258398771286,
"epoch": 0.4216417910447761,
"grad_norm": 0.11892957985401154,
"learning_rate": 0.0002,
"loss": 0.5409727692604065,
"mean_token_accuracy": 0.7800008654594421,
"num_tokens": 1845855.0,
"step": 113
},
{
"entropy": 0.5490185469388962,
"epoch": 0.4253731343283582,
"grad_norm": 0.1497296690940857,
"learning_rate": 0.0002,
"loss": 0.5536864995956421,
"mean_token_accuracy": 0.7792476564645767,
"num_tokens": 1862087.0,
"step": 114
},
{
"entropy": 0.53768490254879,
"epoch": 0.4291044776119403,
"grad_norm": 0.13764707744121552,
"learning_rate": 0.0002,
"loss": 0.5394353866577148,
"mean_token_accuracy": 0.7829310894012451,
"num_tokens": 1878496.0,
"step": 115
},
{
"entropy": 0.548382118344307,
"epoch": 0.43283582089552236,
"grad_norm": 0.1350480020046234,
"learning_rate": 0.0002,
"loss": 0.5588696002960205,
"mean_token_accuracy": 0.773399829864502,
"num_tokens": 1894649.0,
"step": 116
},
{
"entropy": 0.5273909568786621,
"epoch": 0.43656716417910446,
"grad_norm": 0.1509886085987091,
"learning_rate": 0.0002,
"loss": 0.5329999923706055,
"mean_token_accuracy": 0.7835660129785538,
"num_tokens": 1910828.0,
"step": 117
},
{
"entropy": 0.5727127343416214,
"epoch": 0.44029850746268656,
"grad_norm": 0.12369527667760849,
"learning_rate": 0.0002,
"loss": 0.5647591948509216,
"mean_token_accuracy": 0.7721648663282394,
"num_tokens": 1927319.0,
"step": 118
},
{
"entropy": 0.5657652169466019,
"epoch": 0.44402985074626866,
"grad_norm": 0.14263150095939636,
"learning_rate": 0.0002,
"loss": 0.5616084337234497,
"mean_token_accuracy": 0.7732421457767487,
"num_tokens": 1943783.0,
"step": 119
},
{
"entropy": 0.5638687461614609,
"epoch": 0.44776119402985076,
"grad_norm": 0.11849121749401093,
"learning_rate": 0.0002,
"loss": 0.5577123165130615,
"mean_token_accuracy": 0.7739600390195847,
"num_tokens": 1960125.0,
"step": 120
},
{
"entropy": 0.5605282336473465,
"epoch": 0.45149253731343286,
"grad_norm": 0.1323515772819519,
"learning_rate": 0.0002,
"loss": 0.557800829410553,
"mean_token_accuracy": 0.7727965116500854,
"num_tokens": 1976458.0,
"step": 121
},
{
"entropy": 0.5336878746747971,
"epoch": 0.4552238805970149,
"grad_norm": 0.14154070615768433,
"learning_rate": 0.0002,
"loss": 0.5429147481918335,
"mean_token_accuracy": 0.7805563360452652,
"num_tokens": 1992835.0,
"step": 122
},
{
"entropy": 0.5291022211313248,
"epoch": 0.458955223880597,
"grad_norm": 0.15199723839759827,
"learning_rate": 0.0002,
"loss": 0.5432179570198059,
"mean_token_accuracy": 0.7801262736320496,
"num_tokens": 2008972.0,
"step": 123
},
{
"entropy": 0.551175132393837,
"epoch": 0.4626865671641791,
"grad_norm": 0.11983563005924225,
"learning_rate": 0.0002,
"loss": 0.5541180968284607,
"mean_token_accuracy": 0.7762188464403152,
"num_tokens": 2025359.0,
"step": 124
},
{
"entropy": 0.5533900856971741,
"epoch": 0.4664179104477612,
"grad_norm": 0.11737282574176788,
"learning_rate": 0.0002,
"loss": 0.5463876724243164,
"mean_token_accuracy": 0.7790547609329224,
"num_tokens": 2041643.0,
"step": 125
},
{
"entropy": 0.5509413182735443,
"epoch": 0.4701492537313433,
"grad_norm": 0.13276953995227814,
"learning_rate": 0.0002,
"loss": 0.5425540208816528,
"mean_token_accuracy": 0.7806166559457779,
"num_tokens": 2057820.0,
"step": 126
},
{
"entropy": 0.5531751215457916,
"epoch": 0.47388059701492535,
"grad_norm": 0.12553741037845612,
"learning_rate": 0.0002,
"loss": 0.5523180961608887,
"mean_token_accuracy": 0.7784822881221771,
"num_tokens": 2074179.0,
"step": 127
},
{
"entropy": 0.546363577246666,
"epoch": 0.47761194029850745,
"grad_norm": 0.13337954878807068,
"learning_rate": 0.0002,
"loss": 0.5551460981369019,
"mean_token_accuracy": 0.7742737084627151,
"num_tokens": 2090654.0,
"step": 128
},
{
"entropy": 0.5285965204238892,
"epoch": 0.48134328358208955,
"grad_norm": 0.13400429487228394,
"learning_rate": 0.0002,
"loss": 0.5407966375350952,
"mean_token_accuracy": 0.7815738469362259,
"num_tokens": 2107063.0,
"step": 129
},
{
"entropy": 0.5335082858800888,
"epoch": 0.48507462686567165,
"grad_norm": 0.13302984833717346,
"learning_rate": 0.0002,
"loss": 0.5388374328613281,
"mean_token_accuracy": 0.7839466333389282,
"num_tokens": 2123452.0,
"step": 130
},
{
"entropy": 0.557282879948616,
"epoch": 0.48880597014925375,
"grad_norm": 0.13119758665561676,
"learning_rate": 0.0002,
"loss": 0.5534148812294006,
"mean_token_accuracy": 0.7738241106271744,
"num_tokens": 2139585.0,
"step": 131
},
{
"entropy": 0.5428808927536011,
"epoch": 0.4925373134328358,
"grad_norm": 0.12375836819410324,
"learning_rate": 0.0002,
"loss": 0.5381428003311157,
"mean_token_accuracy": 0.7813713997602463,
"num_tokens": 2155902.0,
"step": 132
},
{
"entropy": 0.5618433207273483,
"epoch": 0.4962686567164179,
"grad_norm": 0.13146650791168213,
"learning_rate": 0.0002,
"loss": 0.552733838558197,
"mean_token_accuracy": 0.7768221199512482,
"num_tokens": 2172496.0,
"step": 133
},
{
"entropy": 0.5565268397331238,
"epoch": 0.5,
"grad_norm": 0.11766450107097626,
"learning_rate": 0.0002,
"loss": 0.5559637546539307,
"mean_token_accuracy": 0.7758495062589645,
"num_tokens": 2188987.0,
"step": 134
},
{
"entropy": 0.5205433219671249,
"epoch": 0.503731343283582,
"grad_norm": 0.12712325155735016,
"learning_rate": 0.0002,
"loss": 0.5280570387840271,
"mean_token_accuracy": 0.7863014787435532,
"num_tokens": 2205010.0,
"step": 135
},
{
"entropy": 0.5373736917972565,
"epoch": 0.5074626865671642,
"grad_norm": 0.13094842433929443,
"learning_rate": 0.0002,
"loss": 0.5430901050567627,
"mean_token_accuracy": 0.780227467417717,
"num_tokens": 2221474.0,
"step": 136
},
{
"entropy": 0.5688028186559677,
"epoch": 0.5111940298507462,
"grad_norm": 0.1379985511302948,
"learning_rate": 0.0002,
"loss": 0.5740535855293274,
"mean_token_accuracy": 0.7692983150482178,
"num_tokens": 2238030.0,
"step": 137
},
{
"entropy": 0.5621554553508759,
"epoch": 0.5149253731343284,
"grad_norm": 0.13305246829986572,
"learning_rate": 0.0002,
"loss": 0.5573163032531738,
"mean_token_accuracy": 0.7748852521181107,
"num_tokens": 2254436.0,
"step": 138
},
{
"entropy": 0.5507737994194031,
"epoch": 0.5186567164179104,
"grad_norm": 0.12606868147850037,
"learning_rate": 0.0002,
"loss": 0.5473536849021912,
"mean_token_accuracy": 0.7785522937774658,
"num_tokens": 2270806.0,
"step": 139
},
{
"entropy": 0.5534549057483673,
"epoch": 0.5223880597014925,
"grad_norm": 0.14390718936920166,
"learning_rate": 0.0002,
"loss": 0.5571063756942749,
"mean_token_accuracy": 0.7750511020421982,
"num_tokens": 2286975.0,
"step": 140
},
{
"entropy": 0.5419649630784988,
"epoch": 0.5261194029850746,
"grad_norm": 0.13526654243469238,
"learning_rate": 0.0002,
"loss": 0.5507834553718567,
"mean_token_accuracy": 0.7767505496740341,
"num_tokens": 2303373.0,
"step": 141
},
{
"entropy": 0.5532436519861221,
"epoch": 0.5298507462686567,
"grad_norm": 0.1307537853717804,
"learning_rate": 0.0002,
"loss": 0.5537344813346863,
"mean_token_accuracy": 0.7779698222875595,
"num_tokens": 2319833.0,
"step": 142
},
{
"entropy": 0.5443145930767059,
"epoch": 0.5335820895522388,
"grad_norm": 0.12360236793756485,
"learning_rate": 0.0002,
"loss": 0.5414459109306335,
"mean_token_accuracy": 0.7796581238508224,
"num_tokens": 2336100.0,
"step": 143
},
{
"entropy": 0.5436644405126572,
"epoch": 0.5373134328358209,
"grad_norm": 0.13813567161560059,
"learning_rate": 0.0002,
"loss": 0.5399284362792969,
"mean_token_accuracy": 0.781887099146843,
"num_tokens": 2352431.0,
"step": 144
},
{
"entropy": 0.554161787033081,
"epoch": 0.5410447761194029,
"grad_norm": 0.1234111338853836,
"learning_rate": 0.0002,
"loss": 0.5504522323608398,
"mean_token_accuracy": 0.7768333554267883,
"num_tokens": 2368781.0,
"step": 145
},
{
"entropy": 0.540039673447609,
"epoch": 0.5447761194029851,
"grad_norm": 0.12760984897613525,
"learning_rate": 0.0002,
"loss": 0.5470931529998779,
"mean_token_accuracy": 0.7785885185003281,
"num_tokens": 2385030.0,
"step": 146
},
{
"entropy": 0.538455605506897,
"epoch": 0.5485074626865671,
"grad_norm": 0.11708244681358337,
"learning_rate": 0.0002,
"loss": 0.540416419506073,
"mean_token_accuracy": 0.782222330570221,
"num_tokens": 2401529.0,
"step": 147
},
{
"entropy": 0.5445697456598282,
"epoch": 0.5522388059701493,
"grad_norm": 0.11756740510463715,
"learning_rate": 0.0002,
"loss": 0.5511283278465271,
"mean_token_accuracy": 0.7760586440563202,
"num_tokens": 2417920.0,
"step": 148
},
{
"entropy": 0.5568743199110031,
"epoch": 0.5559701492537313,
"grad_norm": 0.1262131929397583,
"learning_rate": 0.0002,
"loss": 0.5587324500083923,
"mean_token_accuracy": 0.7755658030509949,
"num_tokens": 2434402.0,
"step": 149
},
{
"entropy": 0.5476635098457336,
"epoch": 0.5597014925373134,
"grad_norm": 0.14212746918201447,
"learning_rate": 0.0002,
"loss": 0.5485654473304749,
"mean_token_accuracy": 0.7787987738847733,
"num_tokens": 2450648.0,
"step": 150
},
{
"entropy": 0.5328710079193115,
"epoch": 0.5634328358208955,
"grad_norm": 0.1456608921289444,
"learning_rate": 0.0002,
"loss": 0.5320286750793457,
"mean_token_accuracy": 0.7839557826519012,
"num_tokens": 2466701.0,
"step": 151
},
{
"entropy": 0.5372531861066818,
"epoch": 0.5671641791044776,
"grad_norm": 0.11793923377990723,
"learning_rate": 0.0002,
"loss": 0.5379877090454102,
"mean_token_accuracy": 0.7800156623125076,
"num_tokens": 2482627.0,
"step": 152
},
{
"entropy": 0.5532563626766205,
"epoch": 0.5708955223880597,
"grad_norm": 0.13809776306152344,
"learning_rate": 0.0002,
"loss": 0.551555871963501,
"mean_token_accuracy": 0.7761517316102982,
"num_tokens": 2499250.0,
"step": 153
},
{
"entropy": 0.5471682995557785,
"epoch": 0.5746268656716418,
"grad_norm": 0.1408306509256363,
"learning_rate": 0.0002,
"loss": 0.5491219758987427,
"mean_token_accuracy": 0.7767983973026276,
"num_tokens": 2515443.0,
"step": 154
},
{
"entropy": 0.571009948849678,
"epoch": 0.5783582089552238,
"grad_norm": 0.1486109346151352,
"learning_rate": 0.0002,
"loss": 0.5713759660720825,
"mean_token_accuracy": 0.7713276296854019,
"num_tokens": 2531761.0,
"step": 155
},
{
"entropy": 0.5617386847734451,
"epoch": 0.582089552238806,
"grad_norm": 0.15764987468719482,
"learning_rate": 0.0002,
"loss": 0.5562607645988464,
"mean_token_accuracy": 0.7755531519651413,
"num_tokens": 2548176.0,
"step": 156
},
{
"entropy": 0.5492932498455048,
"epoch": 0.585820895522388,
"grad_norm": 0.153673455119133,
"learning_rate": 0.0002,
"loss": 0.5581745505332947,
"mean_token_accuracy": 0.7730790227651596,
"num_tokens": 2564448.0,
"step": 157
},
{
"entropy": 0.555228590965271,
"epoch": 0.5895522388059702,
"grad_norm": 0.1345115751028061,
"learning_rate": 0.0002,
"loss": 0.5605562329292297,
"mean_token_accuracy": 0.7717746198177338,
"num_tokens": 2580905.0,
"step": 158
},
{
"entropy": 0.5399526059627533,
"epoch": 0.5932835820895522,
"grad_norm": 0.11657729744911194,
"learning_rate": 0.0002,
"loss": 0.5369132161140442,
"mean_token_accuracy": 0.7842999547719955,
"num_tokens": 2597180.0,
"step": 159
},
{
"entropy": 0.5353947132825851,
"epoch": 0.5970149253731343,
"grad_norm": 0.1333966851234436,
"learning_rate": 0.0002,
"loss": 0.5362208485603333,
"mean_token_accuracy": 0.7827091217041016,
"num_tokens": 2613444.0,
"step": 160
},
{
"entropy": 0.5535644590854645,
"epoch": 0.6007462686567164,
"grad_norm": 0.13608874380588531,
"learning_rate": 0.0002,
"loss": 0.5567671656608582,
"mean_token_accuracy": 0.7774695008993149,
"num_tokens": 2629983.0,
"step": 161
},
{
"entropy": 0.5560604184865952,
"epoch": 0.6044776119402985,
"grad_norm": 0.1163283959031105,
"learning_rate": 0.0002,
"loss": 0.5636521577835083,
"mean_token_accuracy": 0.7745625525712967,
"num_tokens": 2646578.0,
"step": 162
},
{
"entropy": 0.5764736235141754,
"epoch": 0.6082089552238806,
"grad_norm": 0.1255754828453064,
"learning_rate": 0.0002,
"loss": 0.578213632106781,
"mean_token_accuracy": 0.7662594020366669,
"num_tokens": 2663032.0,
"step": 163
},
{
"entropy": 0.5460716336965561,
"epoch": 0.6119402985074627,
"grad_norm": 0.13686135411262512,
"learning_rate": 0.0002,
"loss": 0.5406862497329712,
"mean_token_accuracy": 0.7790546417236328,
"num_tokens": 2679368.0,
"step": 164
},
{
"entropy": 0.5340383723378181,
"epoch": 0.6156716417910447,
"grad_norm": 0.12064651399850845,
"learning_rate": 0.0002,
"loss": 0.5316583514213562,
"mean_token_accuracy": 0.7829991579055786,
"num_tokens": 2695866.0,
"step": 165
},
{
"entropy": 0.5442641973495483,
"epoch": 0.6194029850746269,
"grad_norm": 0.12049891799688339,
"learning_rate": 0.0002,
"loss": 0.5513224005699158,
"mean_token_accuracy": 0.7753165811300278,
"num_tokens": 2712061.0,
"step": 166
},
{
"entropy": 0.5361381322145462,
"epoch": 0.6231343283582089,
"grad_norm": 0.13572274148464203,
"learning_rate": 0.0002,
"loss": 0.5410642623901367,
"mean_token_accuracy": 0.7834690064191818,
"num_tokens": 2728405.0,
"step": 167
},
{
"entropy": 0.542312353849411,
"epoch": 0.6268656716417911,
"grad_norm": 0.12791581451892853,
"learning_rate": 0.0002,
"loss": 0.5421413779258728,
"mean_token_accuracy": 0.7781463712453842,
"num_tokens": 2744612.0,
"step": 168
},
{
"entropy": 0.5568868666887283,
"epoch": 0.6305970149253731,
"grad_norm": 0.12156295031309128,
"learning_rate": 0.0002,
"loss": 0.5577100515365601,
"mean_token_accuracy": 0.7726946324110031,
"num_tokens": 2761047.0,
"step": 169
},
{
"entropy": 0.5537672489881516,
"epoch": 0.6343283582089553,
"grad_norm": 0.1293496936559677,
"learning_rate": 0.0002,
"loss": 0.5571946501731873,
"mean_token_accuracy": 0.7751306742429733,
"num_tokens": 2777250.0,
"step": 170
},
{
"entropy": 0.5509191900491714,
"epoch": 0.6380597014925373,
"grad_norm": 0.1272898018360138,
"learning_rate": 0.0002,
"loss": 0.5516744256019592,
"mean_token_accuracy": 0.7766414433717728,
"num_tokens": 2793605.0,
"step": 171
},
{
"entropy": 0.5510837286710739,
"epoch": 0.6417910447761194,
"grad_norm": 0.14305925369262695,
"learning_rate": 0.0002,
"loss": 0.5544188618659973,
"mean_token_accuracy": 0.7760672718286514,
"num_tokens": 2809948.0,
"step": 172
},
{
"entropy": 0.5232614651322365,
"epoch": 0.6455223880597015,
"grad_norm": 0.1384088695049286,
"learning_rate": 0.0002,
"loss": 0.5274964570999146,
"mean_token_accuracy": 0.7859550416469574,
"num_tokens": 2826128.0,
"step": 173
},
{
"entropy": 0.5601816028356552,
"epoch": 0.6492537313432836,
"grad_norm": 0.1388508826494217,
"learning_rate": 0.0002,
"loss": 0.5543120503425598,
"mean_token_accuracy": 0.7758214622735977,
"num_tokens": 2842612.0,
"step": 174
},
{
"entropy": 0.5437414795160294,
"epoch": 0.6529850746268657,
"grad_norm": 0.11655397713184357,
"learning_rate": 0.0002,
"loss": 0.5404227375984192,
"mean_token_accuracy": 0.7822663187980652,
"num_tokens": 2859123.0,
"step": 175
},
{
"entropy": 0.55133356153965,
"epoch": 0.6567164179104478,
"grad_norm": 0.1398521363735199,
"learning_rate": 0.0002,
"loss": 0.5518021583557129,
"mean_token_accuracy": 0.7771210372447968,
"num_tokens": 2875360.0,
"step": 176
},
{
"entropy": 0.5468268245458603,
"epoch": 0.6604477611940298,
"grad_norm": 0.12005320936441422,
"learning_rate": 0.0002,
"loss": 0.5481685996055603,
"mean_token_accuracy": 0.7786961048841476,
"num_tokens": 2891626.0,
"step": 177
},
{
"entropy": 0.5444129258394241,
"epoch": 0.664179104477612,
"grad_norm": 0.16883929073810577,
"learning_rate": 0.0002,
"loss": 0.5526378750801086,
"mean_token_accuracy": 0.7768739610910416,
"num_tokens": 2907939.0,
"step": 178
},
{
"entropy": 0.5393242985010147,
"epoch": 0.667910447761194,
"grad_norm": 0.1297578513622284,
"learning_rate": 0.0002,
"loss": 0.5451361536979675,
"mean_token_accuracy": 0.7800205200910568,
"num_tokens": 2924294.0,
"step": 179
},
{
"entropy": 0.5417011380195618,
"epoch": 0.6716417910447762,
"grad_norm": 0.12030332535505295,
"learning_rate": 0.0002,
"loss": 0.5440862774848938,
"mean_token_accuracy": 0.7813349515199661,
"num_tokens": 2940716.0,
"step": 180
},
{
"entropy": 0.5521986186504364,
"epoch": 0.6753731343283582,
"grad_norm": 0.11406023800373077,
"learning_rate": 0.0002,
"loss": 0.5487515926361084,
"mean_token_accuracy": 0.7764244675636292,
"num_tokens": 2956993.0,
"step": 181
},
{
"entropy": 0.5547273755073547,
"epoch": 0.6791044776119403,
"grad_norm": 0.13328734040260315,
"learning_rate": 0.0002,
"loss": 0.552635669708252,
"mean_token_accuracy": 0.7759450674057007,
"num_tokens": 2973622.0,
"step": 182
},
{
"entropy": 0.5548880398273468,
"epoch": 0.6828358208955224,
"grad_norm": 0.11328119784593582,
"learning_rate": 0.0002,
"loss": 0.5517279505729675,
"mean_token_accuracy": 0.7757984399795532,
"num_tokens": 2989995.0,
"step": 183
},
{
"entropy": 0.5576671957969666,
"epoch": 0.6865671641791045,
"grad_norm": 0.1849256306886673,
"learning_rate": 0.0002,
"loss": 0.5650368332862854,
"mean_token_accuracy": 0.7731626927852631,
"num_tokens": 3006538.0,
"step": 184
},
{
"entropy": 0.537109300494194,
"epoch": 0.6902985074626866,
"grad_norm": 0.1240711435675621,
"learning_rate": 0.0002,
"loss": 0.5376191139221191,
"mean_token_accuracy": 0.7854040563106537,
"num_tokens": 3022770.0,
"step": 185
},
{
"entropy": 0.5537560731172562,
"epoch": 0.6940298507462687,
"grad_norm": 0.1654159426689148,
"learning_rate": 0.0002,
"loss": 0.5570691227912903,
"mean_token_accuracy": 0.7766956984996796,
"num_tokens": 3039407.0,
"step": 186
},
{
"entropy": 0.5552389770746231,
"epoch": 0.6977611940298507,
"grad_norm": 0.10993515700101852,
"learning_rate": 0.0002,
"loss": 0.5586962103843689,
"mean_token_accuracy": 0.7749262005090714,
"num_tokens": 3055780.0,
"step": 187
},
{
"entropy": 0.5666979551315308,
"epoch": 0.7014925373134329,
"grad_norm": 0.11159558594226837,
"learning_rate": 0.0002,
"loss": 0.5667304992675781,
"mean_token_accuracy": 0.7695165723562241,
"num_tokens": 3072362.0,
"step": 188
},
{
"entropy": 0.5639722347259521,
"epoch": 0.7052238805970149,
"grad_norm": 0.14158234000205994,
"learning_rate": 0.0002,
"loss": 0.5614078044891357,
"mean_token_accuracy": 0.7733878195285797,
"num_tokens": 3088887.0,
"step": 189
},
{
"entropy": 0.5518735945224762,
"epoch": 0.7089552238805971,
"grad_norm": 0.12406881153583527,
"learning_rate": 0.0002,
"loss": 0.5611676573753357,
"mean_token_accuracy": 0.7746167629957199,
"num_tokens": 3105332.0,
"step": 190
},
{
"entropy": 0.5349650382995605,
"epoch": 0.7126865671641791,
"grad_norm": 0.13473471999168396,
"learning_rate": 0.0002,
"loss": 0.54412841796875,
"mean_token_accuracy": 0.7769501060247421,
"num_tokens": 3121582.0,
"step": 191
},
{
"entropy": 0.5316546410322189,
"epoch": 0.7164179104477612,
"grad_norm": 0.11828400939702988,
"learning_rate": 0.0002,
"loss": 0.530936062335968,
"mean_token_accuracy": 0.7848189175128937,
"num_tokens": 3137920.0,
"step": 192
},
{
"entropy": 0.556887611746788,
"epoch": 0.7201492537313433,
"grad_norm": 0.1256878823041916,
"learning_rate": 0.0002,
"loss": 0.555519700050354,
"mean_token_accuracy": 0.7738869190216064,
"num_tokens": 3154339.0,
"step": 193
},
{
"entropy": 0.5477663427591324,
"epoch": 0.7238805970149254,
"grad_norm": 0.11984176933765411,
"learning_rate": 0.0002,
"loss": 0.5489908456802368,
"mean_token_accuracy": 0.7780539244413376,
"num_tokens": 3170574.0,
"step": 194
},
{
"entropy": 0.5371970534324646,
"epoch": 0.7276119402985075,
"grad_norm": 0.11440598219633102,
"learning_rate": 0.0002,
"loss": 0.5346511602401733,
"mean_token_accuracy": 0.7856602966785431,
"num_tokens": 3187140.0,
"step": 195
},
{
"entropy": 0.5374069362878799,
"epoch": 0.7313432835820896,
"grad_norm": 0.1220874935388565,
"learning_rate": 0.0002,
"loss": 0.5448272228240967,
"mean_token_accuracy": 0.7792176902294159,
"num_tokens": 3203454.0,
"step": 196
},
{
"entropy": 0.5373833179473877,
"epoch": 0.7350746268656716,
"grad_norm": 0.14692658185958862,
"learning_rate": 0.0002,
"loss": 0.547886312007904,
"mean_token_accuracy": 0.7767521291971207,
"num_tokens": 3219558.0,
"step": 197
},
{
"entropy": 0.554410994052887,
"epoch": 0.7388059701492538,
"grad_norm": 0.12380608916282654,
"learning_rate": 0.0002,
"loss": 0.550884485244751,
"mean_token_accuracy": 0.7776724547147751,
"num_tokens": 3235877.0,
"step": 198
},
{
"entropy": 0.5471773892641068,
"epoch": 0.7425373134328358,
"grad_norm": 0.11140885949134827,
"learning_rate": 0.0002,
"loss": 0.5401238799095154,
"mean_token_accuracy": 0.7774412035942078,
"num_tokens": 3252209.0,
"step": 199
},
{
"entropy": 0.5380608141422272,
"epoch": 0.746268656716418,
"grad_norm": 0.1454455554485321,
"learning_rate": 0.0002,
"loss": 0.5387637615203857,
"mean_token_accuracy": 0.7800891399383545,
"num_tokens": 3268329.0,
"step": 200
},
{
"entropy": 0.5308581739664078,
"epoch": 0.75,
"grad_norm": 0.1361016035079956,
"learning_rate": 0.0002,
"loss": 0.5343608260154724,
"mean_token_accuracy": 0.7855110317468643,
"num_tokens": 3284338.0,
"step": 201
},
{
"entropy": 0.5632822811603546,
"epoch": 0.753731343283582,
"grad_norm": 0.13291221857070923,
"learning_rate": 0.0002,
"loss": 0.5640154480934143,
"mean_token_accuracy": 0.767445370554924,
"num_tokens": 3300776.0,
"step": 202
},
{
"entropy": 0.554180920124054,
"epoch": 0.7574626865671642,
"grad_norm": 0.12478666007518768,
"learning_rate": 0.0002,
"loss": 0.5525573492050171,
"mean_token_accuracy": 0.774932399392128,
"num_tokens": 3317196.0,
"step": 203
},
{
"entropy": 0.5349105298519135,
"epoch": 0.7611940298507462,
"grad_norm": 0.12442342936992645,
"learning_rate": 0.0002,
"loss": 0.5401512980461121,
"mean_token_accuracy": 0.7819676995277405,
"num_tokens": 3333516.0,
"step": 204
},
{
"entropy": 0.5417488664388657,
"epoch": 0.7649253731343284,
"grad_norm": 0.12787121534347534,
"learning_rate": 0.0002,
"loss": 0.5460774302482605,
"mean_token_accuracy": 0.7793125957250595,
"num_tokens": 3349860.0,
"step": 205
},
{
"entropy": 0.5238666534423828,
"epoch": 0.7686567164179104,
"grad_norm": 0.14022648334503174,
"learning_rate": 0.0002,
"loss": 0.5336724519729614,
"mean_token_accuracy": 0.7843347638845444,
"num_tokens": 3365954.0,
"step": 206
},
{
"entropy": 0.5506514012813568,
"epoch": 0.7723880597014925,
"grad_norm": 0.10952670127153397,
"learning_rate": 0.0002,
"loss": 0.5459721684455872,
"mean_token_accuracy": 0.7809877097606659,
"num_tokens": 3382344.0,
"step": 207
},
{
"entropy": 0.5601198077201843,
"epoch": 0.7761194029850746,
"grad_norm": 0.14921848475933075,
"learning_rate": 0.0002,
"loss": 0.5593782663345337,
"mean_token_accuracy": 0.7718043476343155,
"num_tokens": 3398687.0,
"step": 208
},
{
"entropy": 0.5334768891334534,
"epoch": 0.7798507462686567,
"grad_norm": 0.11596426367759705,
"learning_rate": 0.0002,
"loss": 0.5338318943977356,
"mean_token_accuracy": 0.783938467502594,
"num_tokens": 3414913.0,
"step": 209
},
{
"entropy": 0.5415135025978088,
"epoch": 0.7835820895522388,
"grad_norm": 0.13524818420410156,
"learning_rate": 0.0002,
"loss": 0.5422087907791138,
"mean_token_accuracy": 0.7810906171798706,
"num_tokens": 3431071.0,
"step": 210
},
{
"entropy": 0.5562594383955002,
"epoch": 0.7873134328358209,
"grad_norm": 0.14714977145195007,
"learning_rate": 0.0002,
"loss": 0.5575138926506042,
"mean_token_accuracy": 0.7743899971246719,
"num_tokens": 3447417.0,
"step": 211
},
{
"entropy": 0.536840409040451,
"epoch": 0.7910447761194029,
"grad_norm": 0.1191772073507309,
"learning_rate": 0.0002,
"loss": 0.539043664932251,
"mean_token_accuracy": 0.7791986167430878,
"num_tokens": 3463951.0,
"step": 212
},
{
"entropy": 0.5601708441972733,
"epoch": 0.7947761194029851,
"grad_norm": 0.14285218715667725,
"learning_rate": 0.0002,
"loss": 0.5604355931282043,
"mean_token_accuracy": 0.7729564011096954,
"num_tokens": 3480303.0,
"step": 213
},
{
"entropy": 0.5470457077026367,
"epoch": 0.7985074626865671,
"grad_norm": 0.13420677185058594,
"learning_rate": 0.0002,
"loss": 0.554261326789856,
"mean_token_accuracy": 0.7758394628763199,
"num_tokens": 3496665.0,
"step": 214
},
{
"entropy": 0.5595335066318512,
"epoch": 0.8022388059701493,
"grad_norm": 0.12468434125185013,
"learning_rate": 0.0002,
"loss": 0.5626363158226013,
"mean_token_accuracy": 0.7708792388439178,
"num_tokens": 3512987.0,
"step": 215
},
{
"entropy": 0.5410265326499939,
"epoch": 0.8059701492537313,
"grad_norm": 0.1368313878774643,
"learning_rate": 0.0002,
"loss": 0.5424209237098694,
"mean_token_accuracy": 0.780338704586029,
"num_tokens": 3529322.0,
"step": 216
},
{
"entropy": 0.5611067861318588,
"epoch": 0.8097014925373134,
"grad_norm": 0.12065284699201584,
"learning_rate": 0.0002,
"loss": 0.5554131269454956,
"mean_token_accuracy": 0.775262787938118,
"num_tokens": 3545541.0,
"step": 217
},
{
"entropy": 0.5451776385307312,
"epoch": 0.8134328358208955,
"grad_norm": 0.13018189370632172,
"learning_rate": 0.0002,
"loss": 0.5477407574653625,
"mean_token_accuracy": 0.7790820002555847,
"num_tokens": 3562081.0,
"step": 218
},
{
"entropy": 0.5475118607282639,
"epoch": 0.8171641791044776,
"grad_norm": 0.1309870183467865,
"learning_rate": 0.0002,
"loss": 0.548214852809906,
"mean_token_accuracy": 0.7790254205465317,
"num_tokens": 3578349.0,
"step": 219
},
{
"entropy": 0.5216370671987534,
"epoch": 0.8208955223880597,
"grad_norm": 0.1223544329404831,
"learning_rate": 0.0002,
"loss": 0.5256963968276978,
"mean_token_accuracy": 0.787861168384552,
"num_tokens": 3594724.0,
"step": 220
},
{
"entropy": 0.5441537946462631,
"epoch": 0.8246268656716418,
"grad_norm": 0.1324274092912674,
"learning_rate": 0.0002,
"loss": 0.5496052503585815,
"mean_token_accuracy": 0.7781362533569336,
"num_tokens": 3611250.0,
"step": 221
},
{
"entropy": 0.5336802899837494,
"epoch": 0.8283582089552238,
"grad_norm": 0.15294679999351501,
"learning_rate": 0.0002,
"loss": 0.5427975654602051,
"mean_token_accuracy": 0.7801742255687714,
"num_tokens": 3627526.0,
"step": 222
},
{
"entropy": 0.5635577589273453,
"epoch": 0.832089552238806,
"grad_norm": 0.1364123523235321,
"learning_rate": 0.0002,
"loss": 0.5619288682937622,
"mean_token_accuracy": 0.768532395362854,
"num_tokens": 3643553.0,
"step": 223
},
{
"entropy": 0.5576212853193283,
"epoch": 0.835820895522388,
"grad_norm": 0.1353282779455185,
"learning_rate": 0.0002,
"loss": 0.5438153147697449,
"mean_token_accuracy": 0.779265359044075,
"num_tokens": 3660133.0,
"step": 224
},
{
"entropy": 0.5412103980779648,
"epoch": 0.8395522388059702,
"grad_norm": 0.12540455162525177,
"learning_rate": 0.0002,
"loss": 0.5397533774375916,
"mean_token_accuracy": 0.7794700264930725,
"num_tokens": 3676295.0,
"step": 225
},
{
"entropy": 0.5455985218286514,
"epoch": 0.8432835820895522,
"grad_norm": 0.13320018351078033,
"learning_rate": 0.0002,
"loss": 0.5485510230064392,
"mean_token_accuracy": 0.778446152806282,
"num_tokens": 3692894.0,
"step": 226
},
{
"entropy": 0.5248135328292847,
"epoch": 0.8470149253731343,
"grad_norm": 0.13709791004657745,
"learning_rate": 0.0002,
"loss": 0.536843478679657,
"mean_token_accuracy": 0.7809243649244308,
"num_tokens": 3709122.0,
"step": 227
},
{
"entropy": 0.53542160987854,
"epoch": 0.8507462686567164,
"grad_norm": 0.12484195083379745,
"learning_rate": 0.0002,
"loss": 0.5407888293266296,
"mean_token_accuracy": 0.7803395837545395,
"num_tokens": 3725461.0,
"step": 228
},
{
"entropy": 0.5458493530750275,
"epoch": 0.8544776119402985,
"grad_norm": 0.13020864129066467,
"learning_rate": 0.0002,
"loss": 0.5498859882354736,
"mean_token_accuracy": 0.7766377329826355,
"num_tokens": 3741717.0,
"step": 229
},
{
"entropy": 0.5359915047883987,
"epoch": 0.8582089552238806,
"grad_norm": 0.11409227550029755,
"learning_rate": 0.0002,
"loss": 0.5289561748504639,
"mean_token_accuracy": 0.7882120311260223,
"num_tokens": 3757988.0,
"step": 230
},
{
"entropy": 0.5659278780221939,
"epoch": 0.8619402985074627,
"grad_norm": 0.10721168667078018,
"learning_rate": 0.0002,
"loss": 0.5621720552444458,
"mean_token_accuracy": 0.7705938816070557,
"num_tokens": 3774220.0,
"step": 231
},
{
"entropy": 0.5599822998046875,
"epoch": 0.8656716417910447,
"grad_norm": 0.12365678697824478,
"learning_rate": 0.0002,
"loss": 0.5598929524421692,
"mean_token_accuracy": 0.7715335041284561,
"num_tokens": 3790653.0,
"step": 232
},
{
"entropy": 0.54929418861866,
"epoch": 0.8694029850746269,
"grad_norm": 0.12949936091899872,
"learning_rate": 0.0002,
"loss": 0.5555176734924316,
"mean_token_accuracy": 0.7733278125524521,
"num_tokens": 3807110.0,
"step": 233
},
{
"entropy": 0.5474081933498383,
"epoch": 0.8731343283582089,
"grad_norm": 0.12146537750959396,
"learning_rate": 0.0002,
"loss": 0.5511813759803772,
"mean_token_accuracy": 0.7766411751508713,
"num_tokens": 3823486.0,
"step": 234
},
{
"entropy": 0.5372883975505829,
"epoch": 0.8768656716417911,
"grad_norm": 0.12444064766168594,
"learning_rate": 0.0002,
"loss": 0.5384877324104309,
"mean_token_accuracy": 0.7811126857995987,
"num_tokens": 3839856.0,
"step": 235
},
{
"entropy": 0.5574021190404892,
"epoch": 0.8805970149253731,
"grad_norm": 0.11953511834144592,
"learning_rate": 0.0002,
"loss": 0.5613345503807068,
"mean_token_accuracy": 0.7729752510786057,
"num_tokens": 3856362.0,
"step": 236
},
{
"entropy": 0.5452482104301453,
"epoch": 0.8843283582089553,
"grad_norm": 0.11208797991275787,
"learning_rate": 0.0002,
"loss": 0.5457064509391785,
"mean_token_accuracy": 0.7782498598098755,
"num_tokens": 3872666.0,
"step": 237
},
{
"entropy": 0.5534125864505768,
"epoch": 0.8880597014925373,
"grad_norm": 0.15453441441059113,
"learning_rate": 0.0002,
"loss": 0.5572060346603394,
"mean_token_accuracy": 0.7716512382030487,
"num_tokens": 3888939.0,
"step": 238
},
{
"entropy": 0.547100231051445,
"epoch": 0.8917910447761194,
"grad_norm": 0.12707094848155975,
"learning_rate": 0.0002,
"loss": 0.5511140823364258,
"mean_token_accuracy": 0.7789764106273651,
"num_tokens": 3905243.0,
"step": 239
},
{
"entropy": 0.544873908162117,
"epoch": 0.8955223880597015,
"grad_norm": 0.13703206181526184,
"learning_rate": 0.0002,
"loss": 0.5423987507820129,
"mean_token_accuracy": 0.7779188007116318,
"num_tokens": 3921866.0,
"step": 240
},
{
"entropy": 0.5453302264213562,
"epoch": 0.8992537313432836,
"grad_norm": 0.11689020693302155,
"learning_rate": 0.0002,
"loss": 0.5460352301597595,
"mean_token_accuracy": 0.7779721468687057,
"num_tokens": 3938407.0,
"step": 241
},
{
"entropy": 0.5635591447353363,
"epoch": 0.9029850746268657,
"grad_norm": 0.13040713965892792,
"learning_rate": 0.0002,
"loss": 0.5655105113983154,
"mean_token_accuracy": 0.768951028585434,
"num_tokens": 3954812.0,
"step": 242
},
{
"entropy": 0.5287201702594757,
"epoch": 0.9067164179104478,
"grad_norm": 0.11932681500911713,
"learning_rate": 0.0002,
"loss": 0.5290012359619141,
"mean_token_accuracy": 0.7868975102901459,
"num_tokens": 3970722.0,
"step": 243
},
{
"entropy": 0.5399811267852783,
"epoch": 0.9104477611940298,
"grad_norm": 0.15166425704956055,
"learning_rate": 0.0002,
"loss": 0.5475818514823914,
"mean_token_accuracy": 0.7782254964113235,
"num_tokens": 3986919.0,
"step": 244
},
{
"entropy": 0.5479171127080917,
"epoch": 0.914179104477612,
"grad_norm": 0.13205286860466003,
"learning_rate": 0.0002,
"loss": 0.5506084561347961,
"mean_token_accuracy": 0.7769028395414352,
"num_tokens": 4003718.0,
"step": 245
},
{
"entropy": 0.5506049394607544,
"epoch": 0.917910447761194,
"grad_norm": 0.1079086884856224,
"learning_rate": 0.0002,
"loss": 0.5398848056793213,
"mean_token_accuracy": 0.7830533385276794,
"num_tokens": 4020063.0,
"step": 246
},
{
"entropy": 0.5654618889093399,
"epoch": 0.9216417910447762,
"grad_norm": 0.1322406679391861,
"learning_rate": 0.0002,
"loss": 0.5590391755104065,
"mean_token_accuracy": 0.7732941806316376,
"num_tokens": 4036681.0,
"step": 247
},
{
"entropy": 0.546074166893959,
"epoch": 0.9253731343283582,
"grad_norm": 0.12490007281303406,
"learning_rate": 0.0002,
"loss": 0.5554251670837402,
"mean_token_accuracy": 0.7764608860015869,
"num_tokens": 4052971.0,
"step": 248
},
{
"entropy": 0.5580905228853226,
"epoch": 0.9291044776119403,
"grad_norm": 0.11980146169662476,
"learning_rate": 0.0002,
"loss": 0.5676828622817993,
"mean_token_accuracy": 0.7696985453367233,
"num_tokens": 4069338.0,
"step": 249
},
{
"entropy": 0.5355470329523087,
"epoch": 0.9328358208955224,
"grad_norm": 0.12107004970312119,
"learning_rate": 0.0002,
"loss": 0.5405516028404236,
"mean_token_accuracy": 0.7829477041959763,
"num_tokens": 4085750.0,
"step": 250
},
{
"entropy": 0.5567673444747925,
"epoch": 0.9365671641791045,
"grad_norm": 0.12893939018249512,
"learning_rate": 0.0002,
"loss": 0.5650359988212585,
"mean_token_accuracy": 0.7712520509958267,
"num_tokens": 4102118.0,
"step": 251
},
{
"entropy": 0.5410316288471222,
"epoch": 0.9402985074626866,
"grad_norm": 0.11652866750955582,
"learning_rate": 0.0002,
"loss": 0.5460695028305054,
"mean_token_accuracy": 0.7774221301078796,
"num_tokens": 4118568.0,
"step": 252
},
{
"entropy": 0.5609200298786163,
"epoch": 0.9440298507462687,
"grad_norm": 0.11244899779558182,
"learning_rate": 0.0002,
"loss": 0.5490402579307556,
"mean_token_accuracy": 0.7748613804578781,
"num_tokens": 4135123.0,
"step": 253
},
{
"entropy": 0.5497269034385681,
"epoch": 0.9477611940298507,
"grad_norm": 0.14016613364219666,
"learning_rate": 0.0002,
"loss": 0.5342196822166443,
"mean_token_accuracy": 0.7829579263925552,
"num_tokens": 4151216.0,
"step": 254
},
{
"entropy": 0.5376796424388885,
"epoch": 0.9514925373134329,
"grad_norm": 0.11261948943138123,
"learning_rate": 0.0002,
"loss": 0.5384314656257629,
"mean_token_accuracy": 0.779564619064331,
"num_tokens": 4167504.0,
"step": 255
},
{
"entropy": 0.5369044691324234,
"epoch": 0.9552238805970149,
"grad_norm": 0.1335015743970871,
"learning_rate": 0.0002,
"loss": 0.5465540885925293,
"mean_token_accuracy": 0.7757421284914017,
"num_tokens": 4183799.0,
"step": 256
},
{
"entropy": 0.5567403733730316,
"epoch": 0.9589552238805971,
"grad_norm": 0.14907455444335938,
"learning_rate": 0.0002,
"loss": 0.567619800567627,
"mean_token_accuracy": 0.770223930478096,
"num_tokens": 4200155.0,
"step": 257
},
{
"entropy": 0.5468429028987885,
"epoch": 0.9626865671641791,
"grad_norm": 0.11520266532897949,
"learning_rate": 0.0002,
"loss": 0.5453846454620361,
"mean_token_accuracy": 0.7773052304983139,
"num_tokens": 4216435.0,
"step": 258
},
{
"entropy": 0.5431469082832336,
"epoch": 0.9664179104477612,
"grad_norm": 0.13169828057289124,
"learning_rate": 0.0002,
"loss": 0.5401536822319031,
"mean_token_accuracy": 0.7807234972715378,
"num_tokens": 4232685.0,
"step": 259
},
{
"entropy": 0.5463652908802032,
"epoch": 0.9701492537313433,
"grad_norm": 0.1208634227514267,
"learning_rate": 0.0002,
"loss": 0.539630115032196,
"mean_token_accuracy": 0.7806746661663055,
"num_tokens": 4248983.0,
"step": 260
},
{
"entropy": 0.5373689532279968,
"epoch": 0.9738805970149254,
"grad_norm": 0.1322765052318573,
"learning_rate": 0.0002,
"loss": 0.5365580916404724,
"mean_token_accuracy": 0.7808263897895813,
"num_tokens": 4265223.0,
"step": 261
},
{
"entropy": 0.5479995906352997,
"epoch": 0.9776119402985075,
"grad_norm": 0.12395796924829483,
"learning_rate": 0.0002,
"loss": 0.5560559630393982,
"mean_token_accuracy": 0.7720989733934402,
"num_tokens": 4281420.0,
"step": 262
},
{
"entropy": 0.5320831388235092,
"epoch": 0.9813432835820896,
"grad_norm": 0.15233781933784485,
"learning_rate": 0.0002,
"loss": 0.5420798659324646,
"mean_token_accuracy": 0.7780148983001709,
"num_tokens": 4297933.0,
"step": 263
},
{
"entropy": 0.5410943180322647,
"epoch": 0.9850746268656716,
"grad_norm": 0.11531079560518265,
"learning_rate": 0.0002,
"loss": 0.5476459264755249,
"mean_token_accuracy": 0.7788786739110947,
"num_tokens": 4314320.0,
"step": 264
},
{
"entropy": 0.5516358613967896,
"epoch": 0.9888059701492538,
"grad_norm": 0.11947735399007797,
"learning_rate": 0.0002,
"loss": 0.5536230206489563,
"mean_token_accuracy": 0.7767823338508606,
"num_tokens": 4330601.0,
"step": 265
},
{
"entropy": 0.5500903576612473,
"epoch": 0.9925373134328358,
"grad_norm": 0.12315159291028976,
"learning_rate": 0.0002,
"loss": 0.5529444813728333,
"mean_token_accuracy": 0.7752810269594193,
"num_tokens": 4347043.0,
"step": 266
},
{
"entropy": 0.5517779290676117,
"epoch": 0.996268656716418,
"grad_norm": 0.11137247085571289,
"learning_rate": 0.0002,
"loss": 0.5534829497337341,
"mean_token_accuracy": 0.7717059701681137,
"num_tokens": 4363391.0,
"step": 267
},
{
"entropy": 0.5500383973121643,
"epoch": 1.0,
"grad_norm": 0.1438470184803009,
"learning_rate": 0.0002,
"loss": 0.5475767850875854,
"mean_token_accuracy": 0.7807454466819763,
"num_tokens": 4379703.0,
"step": 268
},
{
"entropy": 0.5567186176776886,
"epoch": 1.0037313432835822,
"grad_norm": 0.12165568768978119,
"learning_rate": 0.0002,
"loss": 0.5443229079246521,
"mean_token_accuracy": 0.7788188308477402,
"num_tokens": 4395979.0,
"step": 269
},
{
"entropy": 0.5200136750936508,
"epoch": 1.007462686567164,
"grad_norm": 0.11453047394752502,
"learning_rate": 0.0002,
"loss": 0.5096794962882996,
"mean_token_accuracy": 0.7945292145013809,
"num_tokens": 4412227.0,
"step": 270
},
{
"entropy": 0.5380017757415771,
"epoch": 1.0111940298507462,
"grad_norm": 0.15120473504066467,
"learning_rate": 0.0002,
"loss": 0.5425546169281006,
"mean_token_accuracy": 0.781953439116478,
"num_tokens": 4428611.0,
"step": 271
},
{
"entropy": 0.5208772569894791,
"epoch": 1.0149253731343284,
"grad_norm": 0.1341351717710495,
"learning_rate": 0.0002,
"loss": 0.5326657295227051,
"mean_token_accuracy": 0.7831600904464722,
"num_tokens": 4444927.0,
"step": 272
},
{
"entropy": 0.5214353799819946,
"epoch": 1.0186567164179103,
"grad_norm": 0.14984826743602753,
"learning_rate": 0.0002,
"loss": 0.5280492901802063,
"mean_token_accuracy": 0.786370187997818,
"num_tokens": 4460991.0,
"step": 273
},
{
"entropy": 0.5258834809064865,
"epoch": 1.0223880597014925,
"grad_norm": 0.13014522194862366,
"learning_rate": 0.0002,
"loss": 0.5271875858306885,
"mean_token_accuracy": 0.7869210243225098,
"num_tokens": 4477645.0,
"step": 274
},
{
"entropy": 0.5273120403289795,
"epoch": 1.0261194029850746,
"grad_norm": 0.1311647742986679,
"learning_rate": 0.0002,
"loss": 0.5195775032043457,
"mean_token_accuracy": 0.7897085547447205,
"num_tokens": 4493809.0,
"step": 275
},
{
"entropy": 0.5415386855602264,
"epoch": 1.0298507462686568,
"grad_norm": 0.11555178463459015,
"learning_rate": 0.0002,
"loss": 0.5413332581520081,
"mean_token_accuracy": 0.7796304523944855,
"num_tokens": 4510212.0,
"step": 276
},
{
"entropy": 0.5370220988988876,
"epoch": 1.0335820895522387,
"grad_norm": 0.13971680402755737,
"learning_rate": 0.0002,
"loss": 0.5396295785903931,
"mean_token_accuracy": 0.7788214385509491,
"num_tokens": 4526435.0,
"step": 277
},
{
"entropy": 0.5435305833816528,
"epoch": 1.037313432835821,
"grad_norm": 0.10762611031532288,
"learning_rate": 0.0002,
"loss": 0.5435919761657715,
"mean_token_accuracy": 0.7784401625394821,
"num_tokens": 4542952.0,
"step": 278
},
{
"entropy": 0.5561162084341049,
"epoch": 1.041044776119403,
"grad_norm": 0.1305421143770218,
"learning_rate": 0.0002,
"loss": 0.5544913411140442,
"mean_token_accuracy": 0.7771686464548111,
"num_tokens": 4559371.0,
"step": 279
},
{
"entropy": 0.5161843150854111,
"epoch": 1.044776119402985,
"grad_norm": 0.13184338808059692,
"learning_rate": 0.0002,
"loss": 0.511843204498291,
"mean_token_accuracy": 0.7913843542337418,
"num_tokens": 4575731.0,
"step": 280
},
{
"entropy": 0.52925243973732,
"epoch": 1.0485074626865671,
"grad_norm": 0.1287873089313507,
"learning_rate": 0.0002,
"loss": 0.5263785719871521,
"mean_token_accuracy": 0.7861436605453491,
"num_tokens": 4592056.0,
"step": 281
},
{
"entropy": 0.5253249853849411,
"epoch": 1.0522388059701493,
"grad_norm": 0.12661200761795044,
"learning_rate": 0.0002,
"loss": 0.5272859334945679,
"mean_token_accuracy": 0.7849764674901962,
"num_tokens": 4608326.0,
"step": 282
},
{
"entropy": 0.5225464850664139,
"epoch": 1.0559701492537314,
"grad_norm": 0.11925826221704483,
"learning_rate": 0.0002,
"loss": 0.5287873148918152,
"mean_token_accuracy": 0.7825718820095062,
"num_tokens": 4624408.0,
"step": 283
},
{
"entropy": 0.5239171385765076,
"epoch": 1.0597014925373134,
"grad_norm": 0.12639594078063965,
"learning_rate": 0.0002,
"loss": 0.5275134444236755,
"mean_token_accuracy": 0.784866139292717,
"num_tokens": 4640897.0,
"step": 284
},
{
"entropy": 0.5350142568349838,
"epoch": 1.0634328358208955,
"grad_norm": 0.13742367923259735,
"learning_rate": 0.0002,
"loss": 0.5391872525215149,
"mean_token_accuracy": 0.7813242971897125,
"num_tokens": 4657487.0,
"step": 285
},
{
"entropy": 0.5414403080940247,
"epoch": 1.0671641791044777,
"grad_norm": 0.12273678928613663,
"learning_rate": 0.0002,
"loss": 0.538042426109314,
"mean_token_accuracy": 0.7844662219285965,
"num_tokens": 4674009.0,
"step": 286
},
{
"entropy": 0.5556955337524414,
"epoch": 1.0708955223880596,
"grad_norm": 0.11591946333646774,
"learning_rate": 0.0002,
"loss": 0.5542109608650208,
"mean_token_accuracy": 0.7758783847093582,
"num_tokens": 4690230.0,
"step": 287
},
{
"entropy": 0.5334881544113159,
"epoch": 1.0746268656716418,
"grad_norm": 0.11168122291564941,
"learning_rate": 0.0002,
"loss": 0.5347651243209839,
"mean_token_accuracy": 0.7833859175443649,
"num_tokens": 4706362.0,
"step": 288
},
{
"entropy": 0.5315591096878052,
"epoch": 1.078358208955224,
"grad_norm": 0.13917559385299683,
"learning_rate": 0.0002,
"loss": 0.5380789041519165,
"mean_token_accuracy": 0.7812001705169678,
"num_tokens": 4722595.0,
"step": 289
},
{
"entropy": 0.5346228331327438,
"epoch": 1.0820895522388059,
"grad_norm": 0.13478422164916992,
"learning_rate": 0.0002,
"loss": 0.5455847978591919,
"mean_token_accuracy": 0.7781703919172287,
"num_tokens": 4738887.0,
"step": 290
},
{
"entropy": 0.5461715310811996,
"epoch": 1.085820895522388,
"grad_norm": 0.13396981358528137,
"learning_rate": 0.0002,
"loss": 0.5379023551940918,
"mean_token_accuracy": 0.7827265560626984,
"num_tokens": 4755212.0,
"step": 291
},
{
"entropy": 0.5389465689659119,
"epoch": 1.0895522388059702,
"grad_norm": 0.12781155109405518,
"learning_rate": 0.0002,
"loss": 0.5376452803611755,
"mean_token_accuracy": 0.7828295826911926,
"num_tokens": 4771644.0,
"step": 292
},
{
"entropy": 0.5441965609788895,
"epoch": 1.0932835820895523,
"grad_norm": 0.13662317395210266,
"learning_rate": 0.0002,
"loss": 0.53973788022995,
"mean_token_accuracy": 0.781336709856987,
"num_tokens": 4787994.0,
"step": 293
},
{
"entropy": 0.557211622595787,
"epoch": 1.0970149253731343,
"grad_norm": 0.13968485593795776,
"learning_rate": 0.0002,
"loss": 0.5545478463172913,
"mean_token_accuracy": 0.7766687870025635,
"num_tokens": 4804240.0,
"step": 294
},
{
"entropy": 0.5415647476911545,
"epoch": 1.1007462686567164,
"grad_norm": 0.14245721697807312,
"learning_rate": 0.0002,
"loss": 0.5388385653495789,
"mean_token_accuracy": 0.7829283177852631,
"num_tokens": 4820711.0,
"step": 295
},
{
"entropy": 0.5286812037229538,
"epoch": 1.1044776119402986,
"grad_norm": 0.14483948051929474,
"learning_rate": 0.0002,
"loss": 0.5349111557006836,
"mean_token_accuracy": 0.7845683097839355,
"num_tokens": 4836959.0,
"step": 296
},
{
"entropy": 0.5258732736110687,
"epoch": 1.1082089552238805,
"grad_norm": 0.13696761429309845,
"learning_rate": 0.0002,
"loss": 0.529443085193634,
"mean_token_accuracy": 0.7867940962314606,
"num_tokens": 4853067.0,
"step": 297
},
{
"entropy": 0.5512303709983826,
"epoch": 1.1119402985074627,
"grad_norm": 0.15340439975261688,
"learning_rate": 0.0002,
"loss": 0.552986741065979,
"mean_token_accuracy": 0.7754423469305038,
"num_tokens": 4869588.0,
"step": 298
},
{
"entropy": 0.5339537411928177,
"epoch": 1.1156716417910448,
"grad_norm": 0.15107926726341248,
"learning_rate": 0.0002,
"loss": 0.5356568694114685,
"mean_token_accuracy": 0.7815524339675903,
"num_tokens": 4885904.0,
"step": 299
},
{
"entropy": 0.5544896274805069,
"epoch": 1.1194029850746268,
"grad_norm": 0.13157761096954346,
"learning_rate": 0.0002,
"loss": 0.5553483366966248,
"mean_token_accuracy": 0.7737178802490234,
"num_tokens": 4902327.0,
"step": 300
},
{
"entropy": 0.5695160180330276,
"epoch": 1.123134328358209,
"grad_norm": 0.1447787880897522,
"learning_rate": 0.0002,
"loss": 0.5667352676391602,
"mean_token_accuracy": 0.7724233418703079,
"num_tokens": 4918857.0,
"step": 301
},
{
"entropy": 0.5424528568983078,
"epoch": 1.126865671641791,
"grad_norm": 0.130395770072937,
"learning_rate": 0.0002,
"loss": 0.54450523853302,
"mean_token_accuracy": 0.7784540206193924,
"num_tokens": 4935469.0,
"step": 302
},
{
"entropy": 0.537494882941246,
"epoch": 1.1305970149253732,
"grad_norm": 0.1572721302509308,
"learning_rate": 0.0002,
"loss": 0.539937436580658,
"mean_token_accuracy": 0.7787607908248901,
"num_tokens": 4951497.0,
"step": 303
},
{
"entropy": 0.5239665806293488,
"epoch": 1.1343283582089552,
"grad_norm": 0.14227941632270813,
"learning_rate": 0.0002,
"loss": 0.5174288153648376,
"mean_token_accuracy": 0.7907485216856003,
"num_tokens": 4967826.0,
"step": 304
},
{
"entropy": 0.5226030200719833,
"epoch": 1.1380597014925373,
"grad_norm": 0.13234300911426544,
"learning_rate": 0.0002,
"loss": 0.5237756967544556,
"mean_token_accuracy": 0.7902256399393082,
"num_tokens": 4984247.0,
"step": 305
},
{
"entropy": 0.5070921406149864,
"epoch": 1.1417910447761195,
"grad_norm": 0.15718795359134674,
"learning_rate": 0.0002,
"loss": 0.520646333694458,
"mean_token_accuracy": 0.7865647524595261,
"num_tokens": 5000320.0,
"step": 306
},
{
"entropy": 0.5070105642080307,
"epoch": 1.1455223880597014,
"grad_norm": 0.20183522999286652,
"learning_rate": 0.0002,
"loss": 0.528045654296875,
"mean_token_accuracy": 0.7873903512954712,
"num_tokens": 5016226.0,
"step": 307
},
{
"entropy": 0.5490072518587112,
"epoch": 1.1492537313432836,
"grad_norm": 0.12259556353092194,
"learning_rate": 0.0002,
"loss": 0.5465996861457825,
"mean_token_accuracy": 0.7795770764350891,
"num_tokens": 5032435.0,
"step": 308
},
{
"entropy": 0.5369555801153183,
"epoch": 1.1529850746268657,
"grad_norm": 0.17033320665359497,
"learning_rate": 0.0002,
"loss": 0.5238630175590515,
"mean_token_accuracy": 0.7864966690540314,
"num_tokens": 5048673.0,
"step": 309
},
{
"entropy": 0.5474718064069748,
"epoch": 1.1567164179104479,
"grad_norm": 0.15336251258850098,
"learning_rate": 0.0002,
"loss": 0.5351282358169556,
"mean_token_accuracy": 0.7832874804735184,
"num_tokens": 5064889.0,
"step": 310
},
{
"entropy": 0.5407518595457077,
"epoch": 1.1604477611940298,
"grad_norm": 0.1288745403289795,
"learning_rate": 0.0002,
"loss": 0.532909631729126,
"mean_token_accuracy": 0.7854967713356018,
"num_tokens": 5081181.0,
"step": 311
},
{
"entropy": 0.5553453862667084,
"epoch": 1.164179104477612,
"grad_norm": 0.17325082421302795,
"learning_rate": 0.0002,
"loss": 0.5650225877761841,
"mean_token_accuracy": 0.7709382623434067,
"num_tokens": 5097695.0,
"step": 312
},
{
"entropy": 0.5312155932188034,
"epoch": 1.1679104477611941,
"grad_norm": 0.14813978970050812,
"learning_rate": 0.0002,
"loss": 0.5398642420768738,
"mean_token_accuracy": 0.7819912135601044,
"num_tokens": 5114124.0,
"step": 313
},
{
"entropy": 0.5393004268407822,
"epoch": 1.171641791044776,
"grad_norm": 0.13244624435901642,
"learning_rate": 0.0002,
"loss": 0.5397657155990601,
"mean_token_accuracy": 0.7833016067743301,
"num_tokens": 5130526.0,
"step": 314
},
{
"entropy": 0.5356107205152512,
"epoch": 1.1753731343283582,
"grad_norm": 0.1546393185853958,
"learning_rate": 0.0002,
"loss": 0.5278767347335815,
"mean_token_accuracy": 0.7873012572526932,
"num_tokens": 5146786.0,
"step": 315
},
{
"entropy": 0.5360458493232727,
"epoch": 1.1791044776119404,
"grad_norm": 0.14604224264621735,
"learning_rate": 0.0002,
"loss": 0.5378543138504028,
"mean_token_accuracy": 0.7808638215065002,
"num_tokens": 5163157.0,
"step": 316
},
{
"entropy": 0.5358310341835022,
"epoch": 1.1828358208955223,
"grad_norm": 0.11514927446842194,
"learning_rate": 0.0002,
"loss": 0.5323253273963928,
"mean_token_accuracy": 0.7850612699985504,
"num_tokens": 5179759.0,
"step": 317
},
{
"entropy": 0.5336421579122543,
"epoch": 1.1865671641791045,
"grad_norm": 0.14939743280410767,
"learning_rate": 0.0002,
"loss": 0.5399504899978638,
"mean_token_accuracy": 0.7822477370500565,
"num_tokens": 5195772.0,
"step": 318
},
{
"entropy": 0.5196461454033852,
"epoch": 1.1902985074626866,
"grad_norm": 0.16364845633506775,
"learning_rate": 0.0002,
"loss": 0.5318784117698669,
"mean_token_accuracy": 0.7826407551765442,
"num_tokens": 5212049.0,
"step": 319
},
{
"entropy": 0.5297210067510605,
"epoch": 1.1940298507462686,
"grad_norm": 0.1340930312871933,
"learning_rate": 0.0002,
"loss": 0.5342279672622681,
"mean_token_accuracy": 0.7825554758310318,
"num_tokens": 5228387.0,
"step": 320
},
{
"entropy": 0.5374090075492859,
"epoch": 1.1977611940298507,
"grad_norm": 0.13523836433887482,
"learning_rate": 0.0002,
"loss": 0.5342003107070923,
"mean_token_accuracy": 0.7829677164554596,
"num_tokens": 5244798.0,
"step": 321
},
{
"entropy": 0.5403262600302696,
"epoch": 1.2014925373134329,
"grad_norm": 0.11974834650754929,
"learning_rate": 0.0002,
"loss": 0.5366995334625244,
"mean_token_accuracy": 0.7828448265790939,
"num_tokens": 5261240.0,
"step": 322
},
{
"entropy": 0.5380197167396545,
"epoch": 1.205223880597015,
"grad_norm": 0.154353529214859,
"learning_rate": 0.0002,
"loss": 0.533047080039978,
"mean_token_accuracy": 0.7859889715909958,
"num_tokens": 5277554.0,
"step": 323
},
{
"entropy": 0.5303442776203156,
"epoch": 1.208955223880597,
"grad_norm": 0.14264924824237823,
"learning_rate": 0.0002,
"loss": 0.5314475893974304,
"mean_token_accuracy": 0.7831806391477585,
"num_tokens": 5293949.0,
"step": 324
},
{
"entropy": 0.5252211391925812,
"epoch": 1.212686567164179,
"grad_norm": 0.1556359827518463,
"learning_rate": 0.0002,
"loss": 0.5285252928733826,
"mean_token_accuracy": 0.783245861530304,
"num_tokens": 5310026.0,
"step": 325
},
{
"entropy": 0.5328008607029915,
"epoch": 1.2164179104477613,
"grad_norm": 0.13450154662132263,
"learning_rate": 0.0002,
"loss": 0.5320917367935181,
"mean_token_accuracy": 0.7842745780944824,
"num_tokens": 5326386.0,
"step": 326
},
{
"entropy": 0.5319949090480804,
"epoch": 1.2201492537313432,
"grad_norm": 0.12143786996603012,
"learning_rate": 0.0002,
"loss": 0.5349273681640625,
"mean_token_accuracy": 0.7820626497268677,
"num_tokens": 5342658.0,
"step": 327
},
{
"entropy": 0.5234760195016861,
"epoch": 1.2238805970149254,
"grad_norm": 0.16645972430706024,
"learning_rate": 0.0002,
"loss": 0.5320586562156677,
"mean_token_accuracy": 0.7844817489385605,
"num_tokens": 5358974.0,
"step": 328
},
{
"entropy": 0.5378956496715546,
"epoch": 1.2276119402985075,
"grad_norm": 0.13522404432296753,
"learning_rate": 0.0002,
"loss": 0.5357790589332581,
"mean_token_accuracy": 0.7823758125305176,
"num_tokens": 5375371.0,
"step": 329
},
{
"entropy": 0.5387023985385895,
"epoch": 1.2313432835820897,
"grad_norm": 0.1315094530582428,
"learning_rate": 0.0002,
"loss": 0.5362842082977295,
"mean_token_accuracy": 0.7809555679559708,
"num_tokens": 5391896.0,
"step": 330
},
{
"entropy": 0.5072716027498245,
"epoch": 1.2350746268656716,
"grad_norm": 0.13498196005821228,
"learning_rate": 0.0002,
"loss": 0.507161021232605,
"mean_token_accuracy": 0.7966707944869995,
"num_tokens": 5408354.0,
"step": 331
},
{
"entropy": 0.5260337740182877,
"epoch": 1.2388059701492538,
"grad_norm": 0.13349276781082153,
"learning_rate": 0.0002,
"loss": 0.5276508331298828,
"mean_token_accuracy": 0.7871510088443756,
"num_tokens": 5424531.0,
"step": 332
},
{
"entropy": 0.5349582731723785,
"epoch": 1.242537313432836,
"grad_norm": 0.13890203833580017,
"learning_rate": 0.0002,
"loss": 0.5371206402778625,
"mean_token_accuracy": 0.7821635603904724,
"num_tokens": 5440815.0,
"step": 333
},
{
"entropy": 0.5346423760056496,
"epoch": 1.2462686567164178,
"grad_norm": 0.1553906500339508,
"learning_rate": 0.0002,
"loss": 0.5395735502243042,
"mean_token_accuracy": 0.7817864269018173,
"num_tokens": 5457072.0,
"step": 334
},
{
"entropy": 0.5478692203760147,
"epoch": 1.25,
"grad_norm": 0.15934403240680695,
"learning_rate": 0.0002,
"loss": 0.5516626834869385,
"mean_token_accuracy": 0.7753347009420395,
"num_tokens": 5473422.0,
"step": 335
},
{
"entropy": 0.5378739535808563,
"epoch": 1.2537313432835822,
"grad_norm": 0.12844312191009521,
"learning_rate": 0.0002,
"loss": 0.5326632261276245,
"mean_token_accuracy": 0.7827756106853485,
"num_tokens": 5489671.0,
"step": 336
},
{
"entropy": 0.5409121513366699,
"epoch": 1.2574626865671643,
"grad_norm": 0.1285056471824646,
"learning_rate": 0.0002,
"loss": 0.5452673435211182,
"mean_token_accuracy": 0.7786683291196823,
"num_tokens": 5506084.0,
"step": 337
},
{
"entropy": 0.5422088652849197,
"epoch": 1.2611940298507462,
"grad_norm": 0.14476130902767181,
"learning_rate": 0.0002,
"loss": 0.5416613817214966,
"mean_token_accuracy": 0.7791768312454224,
"num_tokens": 5522548.0,
"step": 338
},
{
"entropy": 0.5449076443910599,
"epoch": 1.2649253731343284,
"grad_norm": 0.13138490915298462,
"learning_rate": 0.0002,
"loss": 0.5395404696464539,
"mean_token_accuracy": 0.7813031673431396,
"num_tokens": 5539208.0,
"step": 339
},
{
"entropy": 0.5443570464849472,
"epoch": 1.2686567164179103,
"grad_norm": 0.15328356623649597,
"learning_rate": 0.0002,
"loss": 0.5410760641098022,
"mean_token_accuracy": 0.7822384089231491,
"num_tokens": 5555492.0,
"step": 340
},
{
"entropy": 0.5302190482616425,
"epoch": 1.2723880597014925,
"grad_norm": 0.15014180541038513,
"learning_rate": 0.0002,
"loss": 0.5311694145202637,
"mean_token_accuracy": 0.7823975682258606,
"num_tokens": 5571999.0,
"step": 341
},
{
"entropy": 0.5198534801602364,
"epoch": 1.2761194029850746,
"grad_norm": 0.13281527161598206,
"learning_rate": 0.0002,
"loss": 0.5303924083709717,
"mean_token_accuracy": 0.7844155579805374,
"num_tokens": 5588098.0,
"step": 342
},
{
"entropy": 0.5089417994022369,
"epoch": 1.2798507462686568,
"grad_norm": 0.1406290978193283,
"learning_rate": 0.0002,
"loss": 0.5175491571426392,
"mean_token_accuracy": 0.7906824499368668,
"num_tokens": 5604254.0,
"step": 343
},
{
"entropy": 0.5032122731208801,
"epoch": 1.2835820895522387,
"grad_norm": 0.15877749025821686,
"learning_rate": 0.0002,
"loss": 0.5124095678329468,
"mean_token_accuracy": 0.790567934513092,
"num_tokens": 5620363.0,
"step": 344
},
{
"entropy": 0.5435033291578293,
"epoch": 1.287313432835821,
"grad_norm": 0.1633625328540802,
"learning_rate": 0.0002,
"loss": 0.553101658821106,
"mean_token_accuracy": 0.7757033556699753,
"num_tokens": 5636720.0,
"step": 345
},
{
"entropy": 0.5401125550270081,
"epoch": 1.291044776119403,
"grad_norm": 0.14126214385032654,
"learning_rate": 0.0002,
"loss": 0.5362418293952942,
"mean_token_accuracy": 0.7848408222198486,
"num_tokens": 5653198.0,
"step": 346
},
{
"entropy": 0.5514497756958008,
"epoch": 1.294776119402985,
"grad_norm": 0.12672948837280273,
"learning_rate": 0.0002,
"loss": 0.5441724061965942,
"mean_token_accuracy": 0.7795091718435287,
"num_tokens": 5669516.0,
"step": 347
},
{
"entropy": 0.5293784886598587,
"epoch": 1.2985074626865671,
"grad_norm": 0.11630003899335861,
"learning_rate": 0.0002,
"loss": 0.5298827886581421,
"mean_token_accuracy": 0.783647358417511,
"num_tokens": 5685856.0,
"step": 348
},
{
"entropy": 0.5244417935609818,
"epoch": 1.3022388059701493,
"grad_norm": 0.14798091351985931,
"learning_rate": 0.0002,
"loss": 0.5307499170303345,
"mean_token_accuracy": 0.7859917134046555,
"num_tokens": 5702057.0,
"step": 349
},
{
"entropy": 0.5323777049779892,
"epoch": 1.3059701492537314,
"grad_norm": 0.12870146334171295,
"learning_rate": 0.0002,
"loss": 0.5365279912948608,
"mean_token_accuracy": 0.7816431373357773,
"num_tokens": 5718688.0,
"step": 350
},
{
"entropy": 0.5243604183197021,
"epoch": 1.3097014925373134,
"grad_norm": 0.12391035258769989,
"learning_rate": 0.0002,
"loss": 0.5227367281913757,
"mean_token_accuracy": 0.7866858392953873,
"num_tokens": 5734891.0,
"step": 351
},
{
"entropy": 0.5347918272018433,
"epoch": 1.3134328358208955,
"grad_norm": 0.145299032330513,
"learning_rate": 0.0002,
"loss": 0.5310446619987488,
"mean_token_accuracy": 0.7831001132726669,
"num_tokens": 5751328.0,
"step": 352
},
{
"entropy": 0.5411982387304306,
"epoch": 1.3171641791044777,
"grad_norm": 0.1532508134841919,
"learning_rate": 0.0002,
"loss": 0.5382261276245117,
"mean_token_accuracy": 0.7814776748418808,
"num_tokens": 5767612.0,
"step": 353
},
{
"entropy": 0.5384319573640823,
"epoch": 1.3208955223880596,
"grad_norm": 0.12034327536821365,
"learning_rate": 0.0002,
"loss": 0.5356577038764954,
"mean_token_accuracy": 0.7809152156114578,
"num_tokens": 5783823.0,
"step": 354
},
{
"entropy": 0.5378035828471184,
"epoch": 1.3246268656716418,
"grad_norm": 0.17426501214504242,
"learning_rate": 0.0002,
"loss": 0.54035884141922,
"mean_token_accuracy": 0.781380295753479,
"num_tokens": 5800149.0,
"step": 355
},
{
"entropy": 0.5415401831269264,
"epoch": 1.328358208955224,
"grad_norm": 0.1543213427066803,
"learning_rate": 0.0002,
"loss": 0.5499249696731567,
"mean_token_accuracy": 0.7782198786735535,
"num_tokens": 5816367.0,
"step": 356
},
{
"entropy": 0.5541952252388,
"epoch": 1.332089552238806,
"grad_norm": 0.1483956277370453,
"learning_rate": 0.0002,
"loss": 0.5502984523773193,
"mean_token_accuracy": 0.7760822772979736,
"num_tokens": 5832681.0,
"step": 357
},
{
"entropy": 0.5343631953001022,
"epoch": 1.335820895522388,
"grad_norm": 0.1370651125907898,
"learning_rate": 0.0002,
"loss": 0.531204879283905,
"mean_token_accuracy": 0.7847591787576675,
"num_tokens": 5848778.0,
"step": 358
},
{
"entropy": 0.5292060524225235,
"epoch": 1.3395522388059702,
"grad_norm": 0.13134512305259705,
"learning_rate": 0.0002,
"loss": 0.5340976119041443,
"mean_token_accuracy": 0.7800851762294769,
"num_tokens": 5864821.0,
"step": 359
},
{
"entropy": 0.5334947407245636,
"epoch": 1.3432835820895521,
"grad_norm": 0.1279117912054062,
"learning_rate": 0.0002,
"loss": 0.5352479815483093,
"mean_token_accuracy": 0.7832343429327011,
"num_tokens": 5881116.0,
"step": 360
},
{
"entropy": 0.5323592573404312,
"epoch": 1.3470149253731343,
"grad_norm": 0.28604868054389954,
"learning_rate": 0.0002,
"loss": 0.5301060080528259,
"mean_token_accuracy": 0.7850496172904968,
"num_tokens": 5897810.0,
"step": 361
},
{
"entropy": 0.5503924041986465,
"epoch": 1.3507462686567164,
"grad_norm": 0.34482085704803467,
"learning_rate": 0.0002,
"loss": 0.5528603196144104,
"mean_token_accuracy": 0.7764434367418289,
"num_tokens": 5914260.0,
"step": 362
},
{
"entropy": 0.5227297842502594,
"epoch": 1.3544776119402986,
"grad_norm": 0.12345509976148605,
"learning_rate": 0.0002,
"loss": 0.5238011479377747,
"mean_token_accuracy": 0.7891107350587845,
"num_tokens": 5930444.0,
"step": 363
},
{
"entropy": 0.5462608188390732,
"epoch": 1.3582089552238805,
"grad_norm": 0.1688961386680603,
"learning_rate": 0.0002,
"loss": 0.5603306293487549,
"mean_token_accuracy": 0.771704226732254,
"num_tokens": 5946741.0,
"step": 364
},
{
"entropy": 0.5538459420204163,
"epoch": 1.3619402985074627,
"grad_norm": 0.14098992943763733,
"learning_rate": 0.0002,
"loss": 0.5526646375656128,
"mean_token_accuracy": 0.7749083191156387,
"num_tokens": 5963128.0,
"step": 365
},
{
"entropy": 0.5297324359416962,
"epoch": 1.3656716417910448,
"grad_norm": 0.12920008599758148,
"learning_rate": 0.0002,
"loss": 0.5280593633651733,
"mean_token_accuracy": 0.784359410405159,
"num_tokens": 5979218.0,
"step": 366
},
{
"entropy": 0.5375068634748459,
"epoch": 1.3694029850746268,
"grad_norm": 0.1362897753715515,
"learning_rate": 0.0002,
"loss": 0.5373224020004272,
"mean_token_accuracy": 0.7841860055923462,
"num_tokens": 5995687.0,
"step": 367
},
{
"entropy": 0.5355936139822006,
"epoch": 1.373134328358209,
"grad_norm": 0.14052827656269073,
"learning_rate": 0.0002,
"loss": 0.5387214422225952,
"mean_token_accuracy": 0.7806743085384369,
"num_tokens": 6012035.0,
"step": 368
},
{
"entropy": 0.5435226261615753,
"epoch": 1.376865671641791,
"grad_norm": 0.1556740403175354,
"learning_rate": 0.0002,
"loss": 0.5441159009933472,
"mean_token_accuracy": 0.7787201553583145,
"num_tokens": 6028365.0,
"step": 369
},
{
"entropy": 0.5268312245607376,
"epoch": 1.3805970149253732,
"grad_norm": 0.15513257682323456,
"learning_rate": 0.0002,
"loss": 0.5291861891746521,
"mean_token_accuracy": 0.7877073138952255,
"num_tokens": 6044796.0,
"step": 370
},
{
"entropy": 0.5517646074295044,
"epoch": 1.3843283582089552,
"grad_norm": 0.1265048235654831,
"learning_rate": 0.0002,
"loss": 0.5546433925628662,
"mean_token_accuracy": 0.7754338979721069,
"num_tokens": 6061487.0,
"step": 371
},
{
"entropy": 0.5410579442977905,
"epoch": 1.3880597014925373,
"grad_norm": 0.13882151246070862,
"learning_rate": 0.0002,
"loss": 0.5375149846076965,
"mean_token_accuracy": 0.7817846387624741,
"num_tokens": 6077933.0,
"step": 372
},
{
"entropy": 0.5343161523342133,
"epoch": 1.3917910447761195,
"grad_norm": 0.1435064971446991,
"learning_rate": 0.0002,
"loss": 0.5308974981307983,
"mean_token_accuracy": 0.7849253863096237,
"num_tokens": 6094407.0,
"step": 373
},
{
"entropy": 0.5472413003444672,
"epoch": 1.3955223880597014,
"grad_norm": 0.1254650354385376,
"learning_rate": 0.0002,
"loss": 0.5410266518592834,
"mean_token_accuracy": 0.7794545590877533,
"num_tokens": 6110923.0,
"step": 374
},
{
"entropy": 0.5365632474422455,
"epoch": 1.3992537313432836,
"grad_norm": 0.13213133811950684,
"learning_rate": 0.0002,
"loss": 0.5404695868492126,
"mean_token_accuracy": 0.7813301384449005,
"num_tokens": 6127219.0,
"step": 375
},
{
"entropy": 0.5322464108467102,
"epoch": 1.4029850746268657,
"grad_norm": 0.1703079640865326,
"learning_rate": 0.0002,
"loss": 0.5420417189598083,
"mean_token_accuracy": 0.7813734114170074,
"num_tokens": 6143418.0,
"step": 376
},
{
"entropy": 0.5500752478837967,
"epoch": 1.4067164179104479,
"grad_norm": 0.1431417018175125,
"learning_rate": 0.0002,
"loss": 0.5511533617973328,
"mean_token_accuracy": 0.7758170068264008,
"num_tokens": 6159747.0,
"step": 377
},
{
"entropy": 0.5427335649728775,
"epoch": 1.4104477611940298,
"grad_norm": 0.1817740648984909,
"learning_rate": 0.0002,
"loss": 0.5414767861366272,
"mean_token_accuracy": 0.7784233242273331,
"num_tokens": 6176317.0,
"step": 378
},
{
"entropy": 0.5470531731843948,
"epoch": 1.414179104477612,
"grad_norm": 0.1422269493341446,
"learning_rate": 0.0002,
"loss": 0.5472888946533203,
"mean_token_accuracy": 0.7780141085386276,
"num_tokens": 6192737.0,
"step": 379
},
{
"entropy": 0.5464377701282501,
"epoch": 1.417910447761194,
"grad_norm": 0.17506512999534607,
"learning_rate": 0.0002,
"loss": 0.5490654706954956,
"mean_token_accuracy": 0.7765569537878036,
"num_tokens": 6208852.0,
"step": 380
},
{
"entropy": 0.5500655770301819,
"epoch": 1.421641791044776,
"grad_norm": 0.13887247443199158,
"learning_rate": 0.0002,
"loss": 0.5514895915985107,
"mean_token_accuracy": 0.7774574458599091,
"num_tokens": 6225069.0,
"step": 381
},
{
"entropy": 0.5438679605722427,
"epoch": 1.4253731343283582,
"grad_norm": 0.19045118987560272,
"learning_rate": 0.0002,
"loss": 0.5430073738098145,
"mean_token_accuracy": 0.7802658081054688,
"num_tokens": 6241528.0,
"step": 382
},
{
"entropy": 0.5306290239095688,
"epoch": 1.4291044776119404,
"grad_norm": 0.160585418343544,
"learning_rate": 0.0002,
"loss": 0.5361081957817078,
"mean_token_accuracy": 0.7803311944007874,
"num_tokens": 6257867.0,
"step": 383
},
{
"entropy": 0.5401095002889633,
"epoch": 1.4328358208955223,
"grad_norm": 0.1656486541032791,
"learning_rate": 0.0002,
"loss": 0.5400689244270325,
"mean_token_accuracy": 0.780994102358818,
"num_tokens": 6274155.0,
"step": 384
},
{
"entropy": 0.5327940136194229,
"epoch": 1.4365671641791045,
"grad_norm": 0.1317523568868637,
"learning_rate": 0.0002,
"loss": 0.5320010185241699,
"mean_token_accuracy": 0.7850325703620911,
"num_tokens": 6290558.0,
"step": 385
},
{
"entropy": 0.5441479384899139,
"epoch": 1.4402985074626866,
"grad_norm": 0.17623504996299744,
"learning_rate": 0.0002,
"loss": 0.5384020209312439,
"mean_token_accuracy": 0.7846230715513229,
"num_tokens": 6306878.0,
"step": 386
},
{
"entropy": 0.5452490895986557,
"epoch": 1.4440298507462686,
"grad_norm": 0.16240645945072174,
"learning_rate": 0.0002,
"loss": 0.5443468689918518,
"mean_token_accuracy": 0.7802695333957672,
"num_tokens": 6323446.0,
"step": 387
},
{
"entropy": 0.5221313908696175,
"epoch": 1.4477611940298507,
"grad_norm": 0.1463281661272049,
"learning_rate": 0.0002,
"loss": 0.5281410813331604,
"mean_token_accuracy": 0.7816678881645203,
"num_tokens": 6339949.0,
"step": 388
},
{
"entropy": 0.548899233341217,
"epoch": 1.4514925373134329,
"grad_norm": 0.22850677371025085,
"learning_rate": 0.0002,
"loss": 0.5660842657089233,
"mean_token_accuracy": 0.7699355781078339,
"num_tokens": 6356385.0,
"step": 389
},
{
"entropy": 0.5538987964391708,
"epoch": 1.455223880597015,
"grad_norm": 0.14064767956733704,
"learning_rate": 0.0002,
"loss": 0.5418739318847656,
"mean_token_accuracy": 0.7807578295469284,
"num_tokens": 6372804.0,
"step": 390
},
{
"entropy": 0.5599593967199326,
"epoch": 1.458955223880597,
"grad_norm": 0.18051759898662567,
"learning_rate": 0.0002,
"loss": 0.5524702072143555,
"mean_token_accuracy": 0.776346430182457,
"num_tokens": 6389040.0,
"step": 391
},
{
"entropy": 0.5202420800924301,
"epoch": 1.462686567164179,
"grad_norm": 0.14325307309627533,
"learning_rate": 0.0002,
"loss": 0.519583523273468,
"mean_token_accuracy": 0.7894969880580902,
"num_tokens": 6405365.0,
"step": 392
},
{
"entropy": 0.5261730998754501,
"epoch": 1.4664179104477613,
"grad_norm": 0.1525595486164093,
"learning_rate": 0.0002,
"loss": 0.5307163596153259,
"mean_token_accuracy": 0.7871128022670746,
"num_tokens": 6421868.0,
"step": 393
},
{
"entropy": 0.5307900905609131,
"epoch": 1.4701492537313432,
"grad_norm": 0.19890250265598297,
"learning_rate": 0.0002,
"loss": 0.5441185832023621,
"mean_token_accuracy": 0.7786047160625458,
"num_tokens": 6438616.0,
"step": 394
},
{
"entropy": 0.5521271824836731,
"epoch": 1.4738805970149254,
"grad_norm": 0.14049610495567322,
"learning_rate": 0.0002,
"loss": 0.5551049113273621,
"mean_token_accuracy": 0.7755014002323151,
"num_tokens": 6455024.0,
"step": 395
},
{
"entropy": 0.539069190621376,
"epoch": 1.4776119402985075,
"grad_norm": 0.1545083075761795,
"learning_rate": 0.0002,
"loss": 0.5353712439537048,
"mean_token_accuracy": 0.78336501121521,
"num_tokens": 6471293.0,
"step": 396
},
{
"entropy": 0.5550021678209305,
"epoch": 1.4813432835820897,
"grad_norm": 0.18578873574733734,
"learning_rate": 0.0002,
"loss": 0.5472472310066223,
"mean_token_accuracy": 0.7796825766563416,
"num_tokens": 6487641.0,
"step": 397
},
{
"entropy": 0.5490831285715103,
"epoch": 1.4850746268656716,
"grad_norm": 0.1240464299917221,
"learning_rate": 0.0002,
"loss": 0.5474961400032043,
"mean_token_accuracy": 0.7774344980716705,
"num_tokens": 6503822.0,
"step": 398
},
{
"entropy": 0.5393417626619339,
"epoch": 1.4888059701492538,
"grad_norm": 0.1891254484653473,
"learning_rate": 0.0002,
"loss": 0.5524366497993469,
"mean_token_accuracy": 0.7745344191789627,
"num_tokens": 6520011.0,
"step": 399
},
{
"entropy": 0.513459712266922,
"epoch": 1.4925373134328357,
"grad_norm": 0.2974206805229187,
"learning_rate": 0.0002,
"loss": 0.5200244188308716,
"mean_token_accuracy": 0.7888158708810806,
"num_tokens": 6536205.0,
"step": 400
},
{
"entropy": 0.5186173021793365,
"epoch": 1.4962686567164178,
"grad_norm": 0.15046866238117218,
"learning_rate": 0.0002,
"loss": 0.5207955241203308,
"mean_token_accuracy": 0.7867278605699539,
"num_tokens": 6552440.0,
"step": 401
},
{
"entropy": 0.5499364733695984,
"epoch": 1.5,
"grad_norm": 0.4020411968231201,
"learning_rate": 0.0002,
"loss": 0.5530084371566772,
"mean_token_accuracy": 0.7796496748924255,
"num_tokens": 6568961.0,
"step": 402
},
{
"entropy": 0.5427668243646622,
"epoch": 1.5037313432835822,
"grad_norm": 0.11850416660308838,
"learning_rate": 0.0002,
"loss": 0.533820629119873,
"mean_token_accuracy": 0.7840306162834167,
"num_tokens": 6585550.0,
"step": 403
},
{
"entropy": 0.5325792133808136,
"epoch": 1.5074626865671643,
"grad_norm": 0.18302492797374725,
"learning_rate": 0.0002,
"loss": 0.534012496471405,
"mean_token_accuracy": 0.7814914137125015,
"num_tokens": 6601942.0,
"step": 404
},
{
"entropy": 0.5354548320174217,
"epoch": 1.5111940298507462,
"grad_norm": 0.15404394268989563,
"learning_rate": 0.0002,
"loss": 0.538587749004364,
"mean_token_accuracy": 0.7822761088609695,
"num_tokens": 6618440.0,
"step": 405
},
{
"entropy": 0.5441371351480484,
"epoch": 1.5149253731343284,
"grad_norm": 0.13057801127433777,
"learning_rate": 0.0002,
"loss": 0.542742908000946,
"mean_token_accuracy": 0.7798959463834763,
"num_tokens": 6634866.0,
"step": 406
},
{
"entropy": 0.542233407497406,
"epoch": 1.5186567164179103,
"grad_norm": 0.14343421161174774,
"learning_rate": 0.0002,
"loss": 0.5447250008583069,
"mean_token_accuracy": 0.7802796810865402,
"num_tokens": 6651150.0,
"step": 407
},
{
"entropy": 0.5407950282096863,
"epoch": 1.5223880597014925,
"grad_norm": 0.14996956288814545,
"learning_rate": 0.0002,
"loss": 0.5389798879623413,
"mean_token_accuracy": 0.7809374779462814,
"num_tokens": 6667674.0,
"step": 408
},
{
"entropy": 0.5433390289545059,
"epoch": 1.5261194029850746,
"grad_norm": 0.1311637908220291,
"learning_rate": 0.0002,
"loss": 0.5383128523826599,
"mean_token_accuracy": 0.7790700197219849,
"num_tokens": 6684068.0,
"step": 409
},
{
"entropy": 0.527245432138443,
"epoch": 1.5298507462686568,
"grad_norm": 0.16411243379116058,
"learning_rate": 0.0002,
"loss": 0.5319215059280396,
"mean_token_accuracy": 0.7840736508369446,
"num_tokens": 6700752.0,
"step": 410
},
{
"entropy": 0.5146678760647774,
"epoch": 1.533582089552239,
"grad_norm": 0.1607578545808792,
"learning_rate": 0.0002,
"loss": 0.5198485851287842,
"mean_token_accuracy": 0.7882288843393326,
"num_tokens": 6716857.0,
"step": 411
},
{
"entropy": 0.5308386236429214,
"epoch": 1.537313432835821,
"grad_norm": 0.166807621717453,
"learning_rate": 0.0002,
"loss": 0.5419335961341858,
"mean_token_accuracy": 0.7812209129333496,
"num_tokens": 6732981.0,
"step": 412
},
{
"entropy": 0.5488767176866531,
"epoch": 1.5410447761194028,
"grad_norm": 0.14006908237934113,
"learning_rate": 0.0002,
"loss": 0.5508508086204529,
"mean_token_accuracy": 0.7769163995981216,
"num_tokens": 6749307.0,
"step": 413
},
{
"entropy": 0.5410346239805222,
"epoch": 1.544776119402985,
"grad_norm": 0.13224521279335022,
"learning_rate": 0.0002,
"loss": 0.5321468710899353,
"mean_token_accuracy": 0.7842406779527664,
"num_tokens": 6765688.0,
"step": 414
},
{
"entropy": 0.5605396628379822,
"epoch": 1.5485074626865671,
"grad_norm": 0.1389547735452652,
"learning_rate": 0.0002,
"loss": 0.5529029369354248,
"mean_token_accuracy": 0.7745459079742432,
"num_tokens": 6782015.0,
"step": 415
},
{
"entropy": 0.5347004532814026,
"epoch": 1.5522388059701493,
"grad_norm": 0.1258436143398285,
"learning_rate": 0.0002,
"loss": 0.5315224528312683,
"mean_token_accuracy": 0.7851130068302155,
"num_tokens": 6798206.0,
"step": 416
},
{
"entropy": 0.5425236374139786,
"epoch": 1.5559701492537314,
"grad_norm": 0.16927701234817505,
"learning_rate": 0.0002,
"loss": 0.5464774370193481,
"mean_token_accuracy": 0.7801399230957031,
"num_tokens": 6814725.0,
"step": 417
},
{
"entropy": 0.5187622159719467,
"epoch": 1.5597014925373134,
"grad_norm": 0.13987842202186584,
"learning_rate": 0.0002,
"loss": 0.5246447920799255,
"mean_token_accuracy": 0.7894206643104553,
"num_tokens": 6831232.0,
"step": 418
},
{
"entropy": 0.5316571593284607,
"epoch": 1.5634328358208955,
"grad_norm": 0.15650241076946259,
"learning_rate": 0.0002,
"loss": 0.538478434085846,
"mean_token_accuracy": 0.7800242900848389,
"num_tokens": 6847650.0,
"step": 419
},
{
"entropy": 0.5246055871248245,
"epoch": 1.5671641791044775,
"grad_norm": 0.13061542809009552,
"learning_rate": 0.0002,
"loss": 0.5321829319000244,
"mean_token_accuracy": 0.7838113605976105,
"num_tokens": 6864019.0,
"step": 420
},
{
"entropy": 0.5212045907974243,
"epoch": 1.5708955223880596,
"grad_norm": 0.13846127688884735,
"learning_rate": 0.0002,
"loss": 0.5200290679931641,
"mean_token_accuracy": 0.7883654683828354,
"num_tokens": 6880204.0,
"step": 421
},
{
"entropy": 0.542250782251358,
"epoch": 1.5746268656716418,
"grad_norm": 0.12467647343873978,
"learning_rate": 0.0002,
"loss": 0.5380762815475464,
"mean_token_accuracy": 0.7811442613601685,
"num_tokens": 6896430.0,
"step": 422
},
{
"entropy": 0.5405887067317963,
"epoch": 1.578358208955224,
"grad_norm": 0.1305769383907318,
"learning_rate": 0.0002,
"loss": 0.5357393026351929,
"mean_token_accuracy": 0.7828609347343445,
"num_tokens": 6912971.0,
"step": 423
},
{
"entropy": 0.5287357568740845,
"epoch": 1.582089552238806,
"grad_norm": 0.17313086986541748,
"learning_rate": 0.0002,
"loss": 0.5329744219779968,
"mean_token_accuracy": 0.782240018248558,
"num_tokens": 6929204.0,
"step": 424
},
{
"entropy": 0.5423530340194702,
"epoch": 1.585820895522388,
"grad_norm": 0.1359935700893402,
"learning_rate": 0.0002,
"loss": 0.5377368330955505,
"mean_token_accuracy": 0.7828396558761597,
"num_tokens": 6945791.0,
"step": 425
},
{
"entropy": 0.5215180069208145,
"epoch": 1.5895522388059702,
"grad_norm": 0.1547544300556183,
"learning_rate": 0.0002,
"loss": 0.5314459800720215,
"mean_token_accuracy": 0.7837548702955246,
"num_tokens": 6961875.0,
"step": 426
},
{
"entropy": 0.5231145992875099,
"epoch": 1.5932835820895521,
"grad_norm": 0.13578681647777557,
"learning_rate": 0.0002,
"loss": 0.5277360677719116,
"mean_token_accuracy": 0.7842715680599213,
"num_tokens": 6978198.0,
"step": 427
},
{
"entropy": 0.5486603379249573,
"epoch": 1.5970149253731343,
"grad_norm": 0.15189069509506226,
"learning_rate": 0.0002,
"loss": 0.549156129360199,
"mean_token_accuracy": 0.7768438756465912,
"num_tokens": 6994444.0,
"step": 428
},
{
"entropy": 0.54026959836483,
"epoch": 1.6007462686567164,
"grad_norm": 0.13162657618522644,
"learning_rate": 0.0002,
"loss": 0.5345808863639832,
"mean_token_accuracy": 0.7827611416578293,
"num_tokens": 7010461.0,
"step": 429
},
{
"entropy": 0.53890560567379,
"epoch": 1.6044776119402986,
"grad_norm": 0.133237823843956,
"learning_rate": 0.0002,
"loss": 0.5350275635719299,
"mean_token_accuracy": 0.7830039262771606,
"num_tokens": 7026813.0,
"step": 430
},
{
"entropy": 0.5518313944339752,
"epoch": 1.6082089552238807,
"grad_norm": 0.14963583648204803,
"learning_rate": 0.0002,
"loss": 0.5478031039237976,
"mean_token_accuracy": 0.7780435979366302,
"num_tokens": 7043301.0,
"step": 431
},
{
"entropy": 0.5414951294660568,
"epoch": 1.6119402985074627,
"grad_norm": 0.12772321701049805,
"learning_rate": 0.0002,
"loss": 0.5401883125305176,
"mean_token_accuracy": 0.782444417476654,
"num_tokens": 7059646.0,
"step": 432
},
{
"entropy": 0.5394223630428314,
"epoch": 1.6156716417910446,
"grad_norm": 0.13813580572605133,
"learning_rate": 0.0002,
"loss": 0.5405031442642212,
"mean_token_accuracy": 0.7798984050750732,
"num_tokens": 7076271.0,
"step": 433
},
{
"entropy": 0.5429421365261078,
"epoch": 1.6194029850746268,
"grad_norm": 0.15601246058940887,
"learning_rate": 0.0002,
"loss": 0.5516016483306885,
"mean_token_accuracy": 0.775258257985115,
"num_tokens": 7092578.0,
"step": 434
},
{
"entropy": 0.5521349459886551,
"epoch": 1.623134328358209,
"grad_norm": 0.14428818225860596,
"learning_rate": 0.0002,
"loss": 0.5492872595787048,
"mean_token_accuracy": 0.7768293768167496,
"num_tokens": 7109046.0,
"step": 435
},
{
"entropy": 0.5354936867952347,
"epoch": 1.626865671641791,
"grad_norm": 0.15073303878307343,
"learning_rate": 0.0002,
"loss": 0.5428034663200378,
"mean_token_accuracy": 0.780666396021843,
"num_tokens": 7125466.0,
"step": 436
},
{
"entropy": 0.5443413555622101,
"epoch": 1.6305970149253732,
"grad_norm": 0.14848864078521729,
"learning_rate": 0.0002,
"loss": 0.5486512780189514,
"mean_token_accuracy": 0.7806312739849091,
"num_tokens": 7141898.0,
"step": 437
},
{
"entropy": 0.5337215662002563,
"epoch": 1.6343283582089554,
"grad_norm": 0.15302547812461853,
"learning_rate": 0.0002,
"loss": 0.5392454862594604,
"mean_token_accuracy": 0.7822044789791107,
"num_tokens": 7158167.0,
"step": 438
},
{
"entropy": 0.5586158037185669,
"epoch": 1.6380597014925373,
"grad_norm": 0.17401555180549622,
"learning_rate": 0.0002,
"loss": 0.557881772518158,
"mean_token_accuracy": 0.7756661027669907,
"num_tokens": 7174477.0,
"step": 439
},
{
"entropy": 0.5406471788883209,
"epoch": 1.6417910447761193,
"grad_norm": 0.14608509838581085,
"learning_rate": 0.0002,
"loss": 0.5353439450263977,
"mean_token_accuracy": 0.7812080383300781,
"num_tokens": 7190694.0,
"step": 440
},
{
"entropy": 0.5237606167793274,
"epoch": 1.6455223880597014,
"grad_norm": 0.1542704850435257,
"learning_rate": 0.0002,
"loss": 0.5290042161941528,
"mean_token_accuracy": 0.7855716645717621,
"num_tokens": 7207153.0,
"step": 441
},
{
"entropy": 0.5269318968057632,
"epoch": 1.6492537313432836,
"grad_norm": 0.1659008413553238,
"learning_rate": 0.0002,
"loss": 0.530527651309967,
"mean_token_accuracy": 0.7846795618534088,
"num_tokens": 7223109.0,
"step": 442
},
{
"entropy": 0.5195682793855667,
"epoch": 1.6529850746268657,
"grad_norm": 0.14120091497898102,
"learning_rate": 0.0002,
"loss": 0.5263478755950928,
"mean_token_accuracy": 0.7843965291976929,
"num_tokens": 7239499.0,
"step": 443
},
{
"entropy": 0.5257822424173355,
"epoch": 1.6567164179104479,
"grad_norm": 0.1643773764371872,
"learning_rate": 0.0002,
"loss": 0.5316389203071594,
"mean_token_accuracy": 0.7851150333881378,
"num_tokens": 7255730.0,
"step": 444
},
{
"entropy": 0.5377429872751236,
"epoch": 1.6604477611940298,
"grad_norm": 0.14926724135875702,
"learning_rate": 0.0002,
"loss": 0.5427424907684326,
"mean_token_accuracy": 0.7824969440698624,
"num_tokens": 7272167.0,
"step": 445
},
{
"entropy": 0.538849800825119,
"epoch": 1.664179104477612,
"grad_norm": 0.13225945830345154,
"learning_rate": 0.0002,
"loss": 0.5327820181846619,
"mean_token_accuracy": 0.783388078212738,
"num_tokens": 7288421.0,
"step": 446
},
{
"entropy": 0.5399289578199387,
"epoch": 1.667910447761194,
"grad_norm": 0.1308569759130478,
"learning_rate": 0.0002,
"loss": 0.5292877554893494,
"mean_token_accuracy": 0.7878285944461823,
"num_tokens": 7304880.0,
"step": 447
},
{
"entropy": 0.5436895489692688,
"epoch": 1.671641791044776,
"grad_norm": 0.16895835101604462,
"learning_rate": 0.0002,
"loss": 0.5451297163963318,
"mean_token_accuracy": 0.7789509892463684,
"num_tokens": 7321256.0,
"step": 448
},
{
"entropy": 0.5504481792449951,
"epoch": 1.6753731343283582,
"grad_norm": 0.13614578545093536,
"learning_rate": 0.0002,
"loss": 0.5539385080337524,
"mean_token_accuracy": 0.7752430438995361,
"num_tokens": 7337589.0,
"step": 449
},
{
"entropy": 0.5513797849416733,
"epoch": 1.6791044776119404,
"grad_norm": 0.15195772051811218,
"learning_rate": 0.0002,
"loss": 0.5530341267585754,
"mean_token_accuracy": 0.7749580442905426,
"num_tokens": 7353883.0,
"step": 450
},
{
"entropy": 0.5413680523633957,
"epoch": 1.6828358208955225,
"grad_norm": 0.15170808136463165,
"learning_rate": 0.0002,
"loss": 0.543311357498169,
"mean_token_accuracy": 0.7790023237466812,
"num_tokens": 7370160.0,
"step": 451
},
{
"entropy": 0.5648334920406342,
"epoch": 1.6865671641791045,
"grad_norm": 0.1327073723077774,
"learning_rate": 0.0002,
"loss": 0.5623019933700562,
"mean_token_accuracy": 0.7708193957805634,
"num_tokens": 7386478.0,
"step": 452
},
{
"entropy": 0.517740860581398,
"epoch": 1.6902985074626866,
"grad_norm": 0.13745424151420593,
"learning_rate": 0.0002,
"loss": 0.5170730352401733,
"mean_token_accuracy": 0.7882706969976425,
"num_tokens": 7402645.0,
"step": 453
},
{
"entropy": 0.5524223297834396,
"epoch": 1.6940298507462686,
"grad_norm": 0.1598864197731018,
"learning_rate": 0.0002,
"loss": 0.5490080714225769,
"mean_token_accuracy": 0.7766116112470627,
"num_tokens": 7419124.0,
"step": 454
},
{
"entropy": 0.5260176658630371,
"epoch": 1.6977611940298507,
"grad_norm": 0.13257424533367157,
"learning_rate": 0.0002,
"loss": 0.5297276973724365,
"mean_token_accuracy": 0.7853291928768158,
"num_tokens": 7435508.0,
"step": 455
},
{
"entropy": 0.5325040817260742,
"epoch": 1.7014925373134329,
"grad_norm": 0.18319375813007355,
"learning_rate": 0.0002,
"loss": 0.543100118637085,
"mean_token_accuracy": 0.7803790718317032,
"num_tokens": 7451755.0,
"step": 456
},
{
"entropy": 0.5267694145441055,
"epoch": 1.705223880597015,
"grad_norm": 0.1554267704486847,
"learning_rate": 0.0002,
"loss": 0.5240468978881836,
"mean_token_accuracy": 0.7871411740779877,
"num_tokens": 7467919.0,
"step": 457
},
{
"entropy": 0.5426032692193985,
"epoch": 1.7089552238805972,
"grad_norm": 0.13706867396831512,
"learning_rate": 0.0002,
"loss": 0.5412613749504089,
"mean_token_accuracy": 0.778879314661026,
"num_tokens": 7484289.0,
"step": 458
},
{
"entropy": 0.5340660065412521,
"epoch": 1.712686567164179,
"grad_norm": 0.16726213693618774,
"learning_rate": 0.0002,
"loss": 0.5392245650291443,
"mean_token_accuracy": 0.7805332094430923,
"num_tokens": 7500611.0,
"step": 459
},
{
"entropy": 0.5553819835186005,
"epoch": 1.716417910447761,
"grad_norm": 0.16255703568458557,
"learning_rate": 0.0002,
"loss": 0.5517896413803101,
"mean_token_accuracy": 0.7731162905693054,
"num_tokens": 7517206.0,
"step": 460
},
{
"entropy": 0.5343479365110397,
"epoch": 1.7201492537313432,
"grad_norm": 0.13407304883003235,
"learning_rate": 0.0002,
"loss": 0.5380552411079407,
"mean_token_accuracy": 0.778910294175148,
"num_tokens": 7533459.0,
"step": 461
},
{
"entropy": 0.5323963612318039,
"epoch": 1.7238805970149254,
"grad_norm": 0.1650952398777008,
"learning_rate": 0.0002,
"loss": 0.5314269661903381,
"mean_token_accuracy": 0.7864300310611725,
"num_tokens": 7549589.0,
"step": 462
},
{
"entropy": 0.5433520078659058,
"epoch": 1.7276119402985075,
"grad_norm": 0.1429263949394226,
"learning_rate": 0.0002,
"loss": 0.540563702583313,
"mean_token_accuracy": 0.7819092869758606,
"num_tokens": 7566158.0,
"step": 463
},
{
"entropy": 0.5436968952417374,
"epoch": 1.7313432835820897,
"grad_norm": 0.14086155593395233,
"learning_rate": 0.0002,
"loss": 0.5398205518722534,
"mean_token_accuracy": 0.7809909284114838,
"num_tokens": 7582422.0,
"step": 464
},
{
"entropy": 0.5534437447786331,
"epoch": 1.7350746268656716,
"grad_norm": 0.14618556201457977,
"learning_rate": 0.0002,
"loss": 0.5561552047729492,
"mean_token_accuracy": 0.7724596560001373,
"num_tokens": 7598771.0,
"step": 465
},
{
"entropy": 0.5396170765161514,
"epoch": 1.7388059701492538,
"grad_norm": 0.1190977543592453,
"learning_rate": 0.0002,
"loss": 0.5389412641525269,
"mean_token_accuracy": 0.7812270224094391,
"num_tokens": 7615418.0,
"step": 466
},
{
"entropy": 0.5390318781137466,
"epoch": 1.7425373134328357,
"grad_norm": 0.15372450649738312,
"learning_rate": 0.0002,
"loss": 0.5436992645263672,
"mean_token_accuracy": 0.7814512252807617,
"num_tokens": 7631840.0,
"step": 467
},
{
"entropy": 0.5206413939595222,
"epoch": 1.7462686567164178,
"grad_norm": 0.13495191931724548,
"learning_rate": 0.0002,
"loss": 0.5253979563713074,
"mean_token_accuracy": 0.7877579927444458,
"num_tokens": 7648131.0,
"step": 468
},
{
"entropy": 0.5223769247531891,
"epoch": 1.75,
"grad_norm": 0.15382781624794006,
"learning_rate": 0.0002,
"loss": 0.5363397002220154,
"mean_token_accuracy": 0.7828211337327957,
"num_tokens": 7664453.0,
"step": 469
},
{
"entropy": 0.5333149433135986,
"epoch": 1.7537313432835822,
"grad_norm": 0.13387013971805573,
"learning_rate": 0.0002,
"loss": 0.5351001620292664,
"mean_token_accuracy": 0.7830037176609039,
"num_tokens": 7680781.0,
"step": 470
},
{
"entropy": 0.5429620742797852,
"epoch": 1.7574626865671643,
"grad_norm": 0.13604114949703217,
"learning_rate": 0.0002,
"loss": 0.5358593463897705,
"mean_token_accuracy": 0.7837422788143158,
"num_tokens": 7697310.0,
"step": 471
},
{
"entropy": 0.5731407701969147,
"epoch": 1.7611940298507462,
"grad_norm": 0.1410369724035263,
"learning_rate": 0.0002,
"loss": 0.5635945796966553,
"mean_token_accuracy": 0.7718209028244019,
"num_tokens": 7713558.0,
"step": 472
},
{
"entropy": 0.5679037570953369,
"epoch": 1.7649253731343284,
"grad_norm": 0.14904598891735077,
"learning_rate": 0.0002,
"loss": 0.5656334161758423,
"mean_token_accuracy": 0.7714496552944183,
"num_tokens": 7730117.0,
"step": 473
},
{
"entropy": 0.5429675132036209,
"epoch": 1.7686567164179103,
"grad_norm": 0.1564645618200302,
"learning_rate": 0.0002,
"loss": 0.5466417670249939,
"mean_token_accuracy": 0.7782974392175674,
"num_tokens": 7746633.0,
"step": 474
},
{
"entropy": 0.5362623929977417,
"epoch": 1.7723880597014925,
"grad_norm": 0.14919337630271912,
"learning_rate": 0.0002,
"loss": 0.5442617535591125,
"mean_token_accuracy": 0.778479665517807,
"num_tokens": 7762813.0,
"step": 475
},
{
"entropy": 0.5283475816249847,
"epoch": 1.7761194029850746,
"grad_norm": 0.14363890886306763,
"learning_rate": 0.0002,
"loss": 0.5296353101730347,
"mean_token_accuracy": 0.7861494719982147,
"num_tokens": 7778873.0,
"step": 476
},
{
"entropy": 0.5252759754657745,
"epoch": 1.7798507462686568,
"grad_norm": 0.17697355151176453,
"learning_rate": 0.0002,
"loss": 0.5262605547904968,
"mean_token_accuracy": 0.7861870229244232,
"num_tokens": 7795362.0,
"step": 477
},
{
"entropy": 0.5341710150241852,
"epoch": 1.783582089552239,
"grad_norm": 0.13914838433265686,
"learning_rate": 0.0002,
"loss": 0.5387526750564575,
"mean_token_accuracy": 0.7779033482074738,
"num_tokens": 7811639.0,
"step": 478
},
{
"entropy": 0.5409186482429504,
"epoch": 1.787313432835821,
"grad_norm": 0.14785298705101013,
"learning_rate": 0.0002,
"loss": 0.5428853034973145,
"mean_token_accuracy": 0.7777274399995804,
"num_tokens": 7828116.0,
"step": 479
},
{
"entropy": 0.5548221617937088,
"epoch": 1.7910447761194028,
"grad_norm": 0.1457030326128006,
"learning_rate": 0.0002,
"loss": 0.5512540340423584,
"mean_token_accuracy": 0.7757317572832108,
"num_tokens": 7844457.0,
"step": 480
},
{
"entropy": 0.5340719819068909,
"epoch": 1.794776119402985,
"grad_norm": 0.13429081439971924,
"learning_rate": 0.0002,
"loss": 0.5289599299430847,
"mean_token_accuracy": 0.7837049216032028,
"num_tokens": 7860611.0,
"step": 481
},
{
"entropy": 0.5379914194345474,
"epoch": 1.7985074626865671,
"grad_norm": 0.13006342947483063,
"learning_rate": 0.0002,
"loss": 0.5363917350769043,
"mean_token_accuracy": 0.7821543663740158,
"num_tokens": 7876837.0,
"step": 482
},
{
"entropy": 0.5481665432453156,
"epoch": 1.8022388059701493,
"grad_norm": 0.14950798451900482,
"learning_rate": 0.0002,
"loss": 0.5466524362564087,
"mean_token_accuracy": 0.7806346863508224,
"num_tokens": 7893152.0,
"step": 483
},
{
"entropy": 0.5473506450653076,
"epoch": 1.8059701492537314,
"grad_norm": 0.14105349779129028,
"learning_rate": 0.0002,
"loss": 0.5428904891014099,
"mean_token_accuracy": 0.778725266456604,
"num_tokens": 7909608.0,
"step": 484
},
{
"entropy": 0.5446173995733261,
"epoch": 1.8097014925373134,
"grad_norm": 0.15689605474472046,
"learning_rate": 0.0002,
"loss": 0.5529049634933472,
"mean_token_accuracy": 0.7787118703126907,
"num_tokens": 7926042.0,
"step": 485
},
{
"entropy": 0.5260195583105087,
"epoch": 1.8134328358208955,
"grad_norm": 0.15744158625602722,
"learning_rate": 0.0002,
"loss": 0.5373381972312927,
"mean_token_accuracy": 0.7849460244178772,
"num_tokens": 7942407.0,
"step": 486
},
{
"entropy": 0.5418536812067032,
"epoch": 1.8171641791044775,
"grad_norm": 0.14664271473884583,
"learning_rate": 0.0002,
"loss": 0.5412867069244385,
"mean_token_accuracy": 0.7811890542507172,
"num_tokens": 7958995.0,
"step": 487
},
{
"entropy": 0.5519318580627441,
"epoch": 1.8208955223880596,
"grad_norm": 0.15384623408317566,
"learning_rate": 0.0002,
"loss": 0.5512985587120056,
"mean_token_accuracy": 0.7755472809076309,
"num_tokens": 7975615.0,
"step": 488
},
{
"entropy": 0.5366766899824142,
"epoch": 1.8246268656716418,
"grad_norm": 0.17651750147342682,
"learning_rate": 0.0002,
"loss": 0.5435804128646851,
"mean_token_accuracy": 0.7781522572040558,
"num_tokens": 7991932.0,
"step": 489
},
{
"entropy": 0.5274553596973419,
"epoch": 1.828358208955224,
"grad_norm": 0.13903461396694183,
"learning_rate": 0.0002,
"loss": 0.5304480195045471,
"mean_token_accuracy": 0.7822371274232864,
"num_tokens": 8008268.0,
"step": 490
},
{
"entropy": 0.5359211266040802,
"epoch": 1.832089552238806,
"grad_norm": 0.1657918393611908,
"learning_rate": 0.0002,
"loss": 0.5305460095405579,
"mean_token_accuracy": 0.7854030579328537,
"num_tokens": 8024551.0,
"step": 491
},
{
"entropy": 0.5484016537666321,
"epoch": 1.835820895522388,
"grad_norm": 0.16684608161449432,
"learning_rate": 0.0002,
"loss": 0.5452835559844971,
"mean_token_accuracy": 0.7772976756095886,
"num_tokens": 8040823.0,
"step": 492
},
{
"entropy": 0.5474873930215836,
"epoch": 1.8395522388059702,
"grad_norm": 0.151128351688385,
"learning_rate": 0.0002,
"loss": 0.5493411421775818,
"mean_token_accuracy": 0.7793968617916107,
"num_tokens": 8057509.0,
"step": 493
},
{
"entropy": 0.526735208928585,
"epoch": 1.8432835820895521,
"grad_norm": 0.1347130686044693,
"learning_rate": 0.0002,
"loss": 0.5294213891029358,
"mean_token_accuracy": 0.783684104681015,
"num_tokens": 8073599.0,
"step": 494
},
{
"entropy": 0.5525032877922058,
"epoch": 1.8470149253731343,
"grad_norm": 0.14043265581130981,
"learning_rate": 0.0002,
"loss": 0.5447618961334229,
"mean_token_accuracy": 0.7783424258232117,
"num_tokens": 8089819.0,
"step": 495
},
{
"entropy": 0.5403036177158356,
"epoch": 1.8507462686567164,
"grad_norm": 0.13459749519824982,
"learning_rate": 0.0002,
"loss": 0.543724775314331,
"mean_token_accuracy": 0.7801337391138077,
"num_tokens": 8106320.0,
"step": 496
},
{
"entropy": 0.5121283084154129,
"epoch": 1.8544776119402986,
"grad_norm": 0.13925622403621674,
"learning_rate": 0.0002,
"loss": 0.5182461142539978,
"mean_token_accuracy": 0.7902320176362991,
"num_tokens": 8122590.0,
"step": 497
},
{
"entropy": 0.5341223925352097,
"epoch": 1.8582089552238807,
"grad_norm": 0.1333732157945633,
"learning_rate": 0.0002,
"loss": 0.5352264642715454,
"mean_token_accuracy": 0.7827399671077728,
"num_tokens": 8138922.0,
"step": 498
},
{
"entropy": 0.5457236468791962,
"epoch": 1.8619402985074627,
"grad_norm": 0.13741785287857056,
"learning_rate": 0.0002,
"loss": 0.5454993844032288,
"mean_token_accuracy": 0.7798125892877579,
"num_tokens": 8155306.0,
"step": 499
},
{
"entropy": 0.5553978830575943,
"epoch": 1.8656716417910446,
"grad_norm": 0.12911130487918854,
"learning_rate": 0.0002,
"loss": 0.5489829778671265,
"mean_token_accuracy": 0.7798224687576294,
"num_tokens": 8171560.0,
"step": 500
},
{
"entropy": 0.5366699695587158,
"epoch": 1.8694029850746268,
"grad_norm": 0.14433807134628296,
"learning_rate": 0.0002,
"loss": 0.5305231213569641,
"mean_token_accuracy": 0.7864150553941727,
"num_tokens": 8188037.0,
"step": 501
},
{
"entropy": 0.5387077182531357,
"epoch": 1.873134328358209,
"grad_norm": 0.14472654461860657,
"learning_rate": 0.0002,
"loss": 0.5373876094818115,
"mean_token_accuracy": 0.7849652767181396,
"num_tokens": 8204628.0,
"step": 502
},
{
"entropy": 0.5305859744548798,
"epoch": 1.876865671641791,
"grad_norm": 0.16016830503940582,
"learning_rate": 0.0002,
"loss": 0.5409325361251831,
"mean_token_accuracy": 0.7806791961193085,
"num_tokens": 8220902.0,
"step": 503
},
{
"entropy": 0.5299341380596161,
"epoch": 1.8805970149253732,
"grad_norm": 0.15263962745666504,
"learning_rate": 0.0002,
"loss": 0.5375992655754089,
"mean_token_accuracy": 0.781559944152832,
"num_tokens": 8237185.0,
"step": 504
},
{
"entropy": 0.5437009185552597,
"epoch": 1.8843283582089554,
"grad_norm": 0.15553534030914307,
"learning_rate": 0.0002,
"loss": 0.5443401336669922,
"mean_token_accuracy": 0.7812230437994003,
"num_tokens": 8253677.0,
"step": 505
},
{
"entropy": 0.5481602549552917,
"epoch": 1.8880597014925373,
"grad_norm": 0.14724990725517273,
"learning_rate": 0.0002,
"loss": 0.540518581867218,
"mean_token_accuracy": 0.7784458547830582,
"num_tokens": 8270080.0,
"step": 506
},
{
"entropy": 0.5473358333110809,
"epoch": 1.8917910447761193,
"grad_norm": 0.13046710193157196,
"learning_rate": 0.0002,
"loss": 0.5379562973976135,
"mean_token_accuracy": 0.7840885818004608,
"num_tokens": 8286417.0,
"step": 507
},
{
"entropy": 0.5339422821998596,
"epoch": 1.8955223880597014,
"grad_norm": 0.11970847100019455,
"learning_rate": 0.0002,
"loss": 0.531002402305603,
"mean_token_accuracy": 0.7831601500511169,
"num_tokens": 8302558.0,
"step": 508
},
{
"entropy": 0.5296764224767685,
"epoch": 1.8992537313432836,
"grad_norm": 0.1354552060365677,
"learning_rate": 0.0002,
"loss": 0.5331873893737793,
"mean_token_accuracy": 0.7870133370161057,
"num_tokens": 8318741.0,
"step": 509
},
{
"entropy": 0.52724589407444,
"epoch": 1.9029850746268657,
"grad_norm": 0.1636589914560318,
"learning_rate": 0.0002,
"loss": 0.5382875800132751,
"mean_token_accuracy": 0.7812641561031342,
"num_tokens": 8335074.0,
"step": 510
},
{
"entropy": 0.5487582981586456,
"epoch": 1.9067164179104479,
"grad_norm": 0.15405811369419098,
"learning_rate": 0.0002,
"loss": 0.5569562315940857,
"mean_token_accuracy": 0.775174006819725,
"num_tokens": 8351357.0,
"step": 511
},
{
"entropy": 0.5199541226029396,
"epoch": 1.9104477611940298,
"grad_norm": 0.13167649507522583,
"learning_rate": 0.0002,
"loss": 0.5217406749725342,
"mean_token_accuracy": 0.788948193192482,
"num_tokens": 8367452.0,
"step": 512
},
{
"entropy": 0.5357903987169266,
"epoch": 1.914179104477612,
"grad_norm": 0.12568941712379456,
"learning_rate": 0.0002,
"loss": 0.5307230949401855,
"mean_token_accuracy": 0.7828755676746368,
"num_tokens": 8383786.0,
"step": 513
},
{
"entropy": 0.5289642512798309,
"epoch": 1.917910447761194,
"grad_norm": 0.130939319729805,
"learning_rate": 0.0002,
"loss": 0.5241107940673828,
"mean_token_accuracy": 0.786993533372879,
"num_tokens": 8400005.0,
"step": 514
},
{
"entropy": 0.5548314303159714,
"epoch": 1.921641791044776,
"grad_norm": 0.1255977749824524,
"learning_rate": 0.0002,
"loss": 0.5506734848022461,
"mean_token_accuracy": 0.7779561877250671,
"num_tokens": 8416502.0,
"step": 515
},
{
"entropy": 0.5388498157262802,
"epoch": 1.9253731343283582,
"grad_norm": 0.13658908009529114,
"learning_rate": 0.0002,
"loss": 0.5440253615379333,
"mean_token_accuracy": 0.7802704125642776,
"num_tokens": 8432771.0,
"step": 516
},
{
"entropy": 0.5444848537445068,
"epoch": 1.9291044776119404,
"grad_norm": 0.1361331045627594,
"learning_rate": 0.0002,
"loss": 0.5464693903923035,
"mean_token_accuracy": 0.7777076661586761,
"num_tokens": 8449261.0,
"step": 517
},
{
"entropy": 0.545665979385376,
"epoch": 1.9328358208955225,
"grad_norm": 0.1317397505044937,
"learning_rate": 0.0002,
"loss": 0.5444501638412476,
"mean_token_accuracy": 0.7814345061779022,
"num_tokens": 8465832.0,
"step": 518
},
{
"entropy": 0.5405286103487015,
"epoch": 1.9365671641791045,
"grad_norm": 0.13252875208854675,
"learning_rate": 0.0002,
"loss": 0.5404050946235657,
"mean_token_accuracy": 0.780963346362114,
"num_tokens": 8482176.0,
"step": 519
},
{
"entropy": 0.5433270484209061,
"epoch": 1.9402985074626866,
"grad_norm": 0.13105268776416779,
"learning_rate": 0.0002,
"loss": 0.5479311943054199,
"mean_token_accuracy": 0.7770702540874481,
"num_tokens": 8498438.0,
"step": 520
},
{
"entropy": 0.5341716408729553,
"epoch": 1.9440298507462686,
"grad_norm": 0.14269208908081055,
"learning_rate": 0.0002,
"loss": 0.535066545009613,
"mean_token_accuracy": 0.7825455218553543,
"num_tokens": 8514674.0,
"step": 521
},
{
"entropy": 0.5395411849021912,
"epoch": 1.9477611940298507,
"grad_norm": 0.13277186453342438,
"learning_rate": 0.0002,
"loss": 0.5376089215278625,
"mean_token_accuracy": 0.7824221551418304,
"num_tokens": 8530963.0,
"step": 522
},
{
"entropy": 0.5529618561267853,
"epoch": 1.9514925373134329,
"grad_norm": 0.1381501704454422,
"learning_rate": 0.0002,
"loss": 0.5493215918540955,
"mean_token_accuracy": 0.779175415635109,
"num_tokens": 8547488.0,
"step": 523
},
{
"entropy": 0.5260922610759735,
"epoch": 1.955223880597015,
"grad_norm": 0.1598714143037796,
"learning_rate": 0.0002,
"loss": 0.5309720039367676,
"mean_token_accuracy": 0.7842647433280945,
"num_tokens": 8564003.0,
"step": 524
},
{
"entropy": 0.5258769541978836,
"epoch": 1.9589552238805972,
"grad_norm": 0.1397145837545395,
"learning_rate": 0.0002,
"loss": 0.533185601234436,
"mean_token_accuracy": 0.7819601446390152,
"num_tokens": 8580280.0,
"step": 525
},
{
"entropy": 0.5250103250145912,
"epoch": 1.962686567164179,
"grad_norm": 0.19406840205192566,
"learning_rate": 0.0002,
"loss": 0.5373009443283081,
"mean_token_accuracy": 0.7827760279178619,
"num_tokens": 8596181.0,
"step": 526
},
{
"entropy": 0.556450217962265,
"epoch": 1.966417910447761,
"grad_norm": 0.13848020136356354,
"learning_rate": 0.0002,
"loss": 0.5526891946792603,
"mean_token_accuracy": 0.7767400592565536,
"num_tokens": 8612545.0,
"step": 527
},
{
"entropy": 0.5524493604898453,
"epoch": 1.9701492537313432,
"grad_norm": 0.13262905180454254,
"learning_rate": 0.0002,
"loss": 0.5456893444061279,
"mean_token_accuracy": 0.7794637978076935,
"num_tokens": 8628708.0,
"step": 528
},
{
"entropy": 0.5483785569667816,
"epoch": 1.9738805970149254,
"grad_norm": 0.13305608928203583,
"learning_rate": 0.0002,
"loss": 0.5419108271598816,
"mean_token_accuracy": 0.7776815295219421,
"num_tokens": 8645353.0,
"step": 529
},
{
"entropy": 0.5357464104890823,
"epoch": 1.9776119402985075,
"grad_norm": 0.18632404506206512,
"learning_rate": 0.0002,
"loss": 0.538067102432251,
"mean_token_accuracy": 0.7834661602973938,
"num_tokens": 8661338.0,
"step": 530
},
{
"entropy": 0.5424002707004547,
"epoch": 1.9813432835820897,
"grad_norm": 0.14013341069221497,
"learning_rate": 0.0002,
"loss": 0.5498412251472473,
"mean_token_accuracy": 0.7779710739850998,
"num_tokens": 8677558.0,
"step": 531
},
{
"entropy": 0.5473677217960358,
"epoch": 1.9850746268656716,
"grad_norm": 0.16677168011665344,
"learning_rate": 0.0002,
"loss": 0.5508783459663391,
"mean_token_accuracy": 0.7754979729652405,
"num_tokens": 8693871.0,
"step": 532
},
{
"entropy": 0.5417899936437607,
"epoch": 1.9888059701492538,
"grad_norm": 0.13049523532390594,
"learning_rate": 0.0002,
"loss": 0.5387138724327087,
"mean_token_accuracy": 0.7801752388477325,
"num_tokens": 8710295.0,
"step": 533
},
{
"entropy": 0.539973795413971,
"epoch": 1.9925373134328357,
"grad_norm": 0.13125836849212646,
"learning_rate": 0.0002,
"loss": 0.5384909510612488,
"mean_token_accuracy": 0.7825180888175964,
"num_tokens": 8726574.0,
"step": 534
},
{
"entropy": 0.5503594130277634,
"epoch": 1.9962686567164178,
"grad_norm": 0.13576547801494598,
"learning_rate": 0.0002,
"loss": 0.5558905005455017,
"mean_token_accuracy": 0.7731243073940277,
"num_tokens": 8742903.0,
"step": 535
},
{
"entropy": 0.5420230776071548,
"epoch": 2.0,
"grad_norm": 0.13022863864898682,
"learning_rate": 0.0002,
"loss": 0.5468026399612427,
"mean_token_accuracy": 0.7781520336866379,
"num_tokens": 8759542.0,
"step": 536
},
{
"entropy": 0.5381979197263718,
"epoch": 2.003731343283582,
"grad_norm": 0.14043375849723816,
"learning_rate": 0.0002,
"loss": 0.527134358882904,
"mean_token_accuracy": 0.7864610850811005,
"num_tokens": 8775884.0,
"step": 537
},
{
"entropy": 0.5298552364110947,
"epoch": 2.0074626865671643,
"grad_norm": 0.15086792409420013,
"learning_rate": 0.0002,
"loss": 0.525084912776947,
"mean_token_accuracy": 0.7869725525379181,
"num_tokens": 8792092.0,
"step": 538
},
{
"entropy": 0.5192188173532486,
"epoch": 2.0111940298507465,
"grad_norm": 0.19961106777191162,
"learning_rate": 0.0002,
"loss": 0.5296894907951355,
"mean_token_accuracy": 0.7826270759105682,
"num_tokens": 8808558.0,
"step": 539
},
{
"entropy": 0.5123308524489403,
"epoch": 2.014925373134328,
"grad_norm": 0.19111908972263336,
"learning_rate": 0.0002,
"loss": 0.5212836265563965,
"mean_token_accuracy": 0.789938747882843,
"num_tokens": 8824957.0,
"step": 540
},
{
"entropy": 0.5178431421518326,
"epoch": 2.0186567164179103,
"grad_norm": 0.19028709828853607,
"learning_rate": 0.0002,
"loss": 0.5238035917282104,
"mean_token_accuracy": 0.7860684394836426,
"num_tokens": 8841440.0,
"step": 541
},
{
"entropy": 0.531784176826477,
"epoch": 2.0223880597014925,
"grad_norm": 0.15052154660224915,
"learning_rate": 0.0002,
"loss": 0.5242434144020081,
"mean_token_accuracy": 0.7872632443904877,
"num_tokens": 8857544.0,
"step": 542
},
{
"entropy": 0.523473396897316,
"epoch": 2.0261194029850746,
"grad_norm": 0.16107355058193207,
"learning_rate": 0.0002,
"loss": 0.5132102966308594,
"mean_token_accuracy": 0.7902694642543793,
"num_tokens": 8873855.0,
"step": 543
},
{
"entropy": 0.5190383419394493,
"epoch": 2.029850746268657,
"grad_norm": 0.1708311289548874,
"learning_rate": 0.0002,
"loss": 0.5148621797561646,
"mean_token_accuracy": 0.7895102798938751,
"num_tokens": 8890117.0,
"step": 544
},
{
"entropy": 0.529280424118042,
"epoch": 2.033582089552239,
"grad_norm": 0.16680803894996643,
"learning_rate": 0.0002,
"loss": 0.5307912826538086,
"mean_token_accuracy": 0.7853487432003021,
"num_tokens": 8906392.0,
"step": 545
},
{
"entropy": 0.49614501744508743,
"epoch": 2.0373134328358207,
"grad_norm": 0.1503826081752777,
"learning_rate": 0.0002,
"loss": 0.5012757182121277,
"mean_token_accuracy": 0.7970542311668396,
"num_tokens": 8922509.0,
"step": 546
},
{
"entropy": 0.509469673037529,
"epoch": 2.041044776119403,
"grad_norm": 0.15220946073532104,
"learning_rate": 0.0002,
"loss": 0.5193155407905579,
"mean_token_accuracy": 0.7900224179029465,
"num_tokens": 8938730.0,
"step": 547
},
{
"entropy": 0.5206529274582863,
"epoch": 2.044776119402985,
"grad_norm": 0.15667758882045746,
"learning_rate": 0.0002,
"loss": 0.5237014293670654,
"mean_token_accuracy": 0.7895828038454056,
"num_tokens": 8955181.0,
"step": 548
},
{
"entropy": 0.5195223838090897,
"epoch": 2.048507462686567,
"grad_norm": 0.1412286013364792,
"learning_rate": 0.0002,
"loss": 0.5065000653266907,
"mean_token_accuracy": 0.7948807328939438,
"num_tokens": 8971652.0,
"step": 549
},
{
"entropy": 0.5343464240431786,
"epoch": 2.0522388059701493,
"grad_norm": 0.17040982842445374,
"learning_rate": 0.0002,
"loss": 0.5262223482131958,
"mean_token_accuracy": 0.7864163517951965,
"num_tokens": 8987886.0,
"step": 550
},
{
"entropy": 0.5151650607585907,
"epoch": 2.0559701492537314,
"grad_norm": 0.18324047327041626,
"learning_rate": 0.0002,
"loss": 0.5181486010551453,
"mean_token_accuracy": 0.7915034592151642,
"num_tokens": 9004065.0,
"step": 551
},
{
"entropy": 0.5399871617555618,
"epoch": 2.0597014925373136,
"grad_norm": 0.18549422919750214,
"learning_rate": 0.0002,
"loss": 0.5452507138252258,
"mean_token_accuracy": 0.7797505408525467,
"num_tokens": 9020548.0,
"step": 552
},
{
"entropy": 0.5106882750988007,
"epoch": 2.0634328358208953,
"grad_norm": 0.18570005893707275,
"learning_rate": 0.0002,
"loss": 0.5167975425720215,
"mean_token_accuracy": 0.7912678271532059,
"num_tokens": 9036842.0,
"step": 553
},
{
"entropy": 0.5242500603199005,
"epoch": 2.0671641791044775,
"grad_norm": 0.16008509695529938,
"learning_rate": 0.0002,
"loss": 0.5222814083099365,
"mean_token_accuracy": 0.7895151823759079,
"num_tokens": 9053207.0,
"step": 554
},
{
"entropy": 0.5202578157186508,
"epoch": 2.0708955223880596,
"grad_norm": 0.158061683177948,
"learning_rate": 0.0002,
"loss": 0.510570228099823,
"mean_token_accuracy": 0.7938546240329742,
"num_tokens": 9069710.0,
"step": 555
},
{
"entropy": 0.5159406885504723,
"epoch": 2.074626865671642,
"grad_norm": 0.1673257201910019,
"learning_rate": 0.0002,
"loss": 0.5130877494812012,
"mean_token_accuracy": 0.7952297329902649,
"num_tokens": 9085896.0,
"step": 556
},
{
"entropy": 0.5333143472671509,
"epoch": 2.078358208955224,
"grad_norm": 0.1610044240951538,
"learning_rate": 0.0002,
"loss": 0.534683108329773,
"mean_token_accuracy": 0.7838889360427856,
"num_tokens": 9102330.0,
"step": 557
},
{
"entropy": 0.5199142321944237,
"epoch": 2.082089552238806,
"grad_norm": 0.18822608888149261,
"learning_rate": 0.0002,
"loss": 0.5304499864578247,
"mean_token_accuracy": 0.7855323851108551,
"num_tokens": 9118702.0,
"step": 558
},
{
"entropy": 0.5128015950322151,
"epoch": 2.0858208955223883,
"grad_norm": 0.16853775084018707,
"learning_rate": 0.0002,
"loss": 0.5243670344352722,
"mean_token_accuracy": 0.7870570570230484,
"num_tokens": 9135161.0,
"step": 559
},
{
"entropy": 0.5174604654312134,
"epoch": 2.08955223880597,
"grad_norm": 0.1812400370836258,
"learning_rate": 0.0002,
"loss": 0.5177437663078308,
"mean_token_accuracy": 0.7915796935558319,
"num_tokens": 9151704.0,
"step": 560
},
{
"entropy": 0.5173925012350082,
"epoch": 2.093283582089552,
"grad_norm": 0.1714162975549698,
"learning_rate": 0.0002,
"loss": 0.5103091597557068,
"mean_token_accuracy": 0.7926450222730637,
"num_tokens": 9167936.0,
"step": 561
},
{
"entropy": 0.5338417440652847,
"epoch": 2.0970149253731343,
"grad_norm": 0.18883411586284637,
"learning_rate": 0.0002,
"loss": 0.5264431834220886,
"mean_token_accuracy": 0.7850892692804337,
"num_tokens": 9184252.0,
"step": 562
},
{
"entropy": 0.5227560251951218,
"epoch": 2.1007462686567164,
"grad_norm": 0.16431209444999695,
"learning_rate": 0.0002,
"loss": 0.5194032192230225,
"mean_token_accuracy": 0.7891248762607574,
"num_tokens": 9200663.0,
"step": 563
},
{
"entropy": 0.5161062777042389,
"epoch": 2.1044776119402986,
"grad_norm": 0.19406329095363617,
"learning_rate": 0.0002,
"loss": 0.5161796808242798,
"mean_token_accuracy": 0.7907394468784332,
"num_tokens": 9216947.0,
"step": 564
},
{
"entropy": 0.5179730951786041,
"epoch": 2.1082089552238807,
"grad_norm": 0.1819450706243515,
"learning_rate": 0.0002,
"loss": 0.5243360996246338,
"mean_token_accuracy": 0.7889621257781982,
"num_tokens": 9233374.0,
"step": 565
},
{
"entropy": 0.5069013833999634,
"epoch": 2.111940298507463,
"grad_norm": 0.18256594240665436,
"learning_rate": 0.0002,
"loss": 0.5135838389396667,
"mean_token_accuracy": 0.7917103320360184,
"num_tokens": 9249879.0,
"step": 566
},
{
"entropy": 0.5135505869984627,
"epoch": 2.1156716417910446,
"grad_norm": 0.20573152601718903,
"learning_rate": 0.0002,
"loss": 0.5165933966636658,
"mean_token_accuracy": 0.7909833937883377,
"num_tokens": 9266246.0,
"step": 567
},
{
"entropy": 0.5395868420600891,
"epoch": 2.1194029850746268,
"grad_norm": 0.18927782773971558,
"learning_rate": 0.0002,
"loss": 0.5330281853675842,
"mean_token_accuracy": 0.7855703681707382,
"num_tokens": 9282481.0,
"step": 568
},
{
"entropy": 0.4938410297036171,
"epoch": 2.123134328358209,
"grad_norm": 0.19526073336601257,
"learning_rate": 0.0002,
"loss": 0.49382245540618896,
"mean_token_accuracy": 0.7996838092803955,
"num_tokens": 9298815.0,
"step": 569
},
{
"entropy": 0.5009667873382568,
"epoch": 2.126865671641791,
"grad_norm": 0.16595199704170227,
"learning_rate": 0.0002,
"loss": 0.5045086741447449,
"mean_token_accuracy": 0.7978608906269073,
"num_tokens": 9315340.0,
"step": 570
},
{
"entropy": 0.5141628980636597,
"epoch": 2.1305970149253732,
"grad_norm": 0.21891801059246063,
"learning_rate": 0.0002,
"loss": 0.5266185998916626,
"mean_token_accuracy": 0.787352979183197,
"num_tokens": 9331498.0,
"step": 571
},
{
"entropy": 0.5307284891605377,
"epoch": 2.1343283582089554,
"grad_norm": 0.1866699457168579,
"learning_rate": 0.0002,
"loss": 0.5273443460464478,
"mean_token_accuracy": 0.7860653698444366,
"num_tokens": 9347831.0,
"step": 572
},
{
"entropy": 0.5239406228065491,
"epoch": 2.138059701492537,
"grad_norm": 0.16141167283058167,
"learning_rate": 0.0002,
"loss": 0.5189298391342163,
"mean_token_accuracy": 0.7913686484098434,
"num_tokens": 9364053.0,
"step": 573
},
{
"entropy": 0.5423860549926758,
"epoch": 2.1417910447761193,
"grad_norm": 0.21419642865657806,
"learning_rate": 0.0002,
"loss": 0.5438653826713562,
"mean_token_accuracy": 0.7800484448671341,
"num_tokens": 9380482.0,
"step": 574
},
{
"entropy": 0.5319498926401138,
"epoch": 2.1455223880597014,
"grad_norm": 0.15394842624664307,
"learning_rate": 0.0002,
"loss": 0.5297288298606873,
"mean_token_accuracy": 0.7861971110105515,
"num_tokens": 9396762.0,
"step": 575
},
{
"entropy": 0.5272255092859268,
"epoch": 2.1492537313432836,
"grad_norm": 0.17917747795581818,
"learning_rate": 0.0002,
"loss": 0.5221657156944275,
"mean_token_accuracy": 0.78948013484478,
"num_tokens": 9412981.0,
"step": 576
},
{
"entropy": 0.5195171386003494,
"epoch": 2.1529850746268657,
"grad_norm": 0.16095657646656036,
"learning_rate": 0.0002,
"loss": 0.5160609483718872,
"mean_token_accuracy": 0.7911281585693359,
"num_tokens": 9429393.0,
"step": 577
},
{
"entropy": 0.5020652115345001,
"epoch": 2.156716417910448,
"grad_norm": 0.1592203974723816,
"learning_rate": 0.0002,
"loss": 0.5017430782318115,
"mean_token_accuracy": 0.7959037572145462,
"num_tokens": 9445763.0,
"step": 578
},
{
"entropy": 0.5353998094797134,
"epoch": 2.16044776119403,
"grad_norm": 0.18405838310718536,
"learning_rate": 0.0002,
"loss": 0.5360097885131836,
"mean_token_accuracy": 0.7805107831954956,
"num_tokens": 9462245.0,
"step": 579
},
{
"entropy": 0.5231145322322845,
"epoch": 2.1641791044776117,
"grad_norm": 0.16262777149677277,
"learning_rate": 0.0002,
"loss": 0.5238299369812012,
"mean_token_accuracy": 0.7883976399898529,
"num_tokens": 9478792.0,
"step": 580
},
{
"entropy": 0.5025703385472298,
"epoch": 2.167910447761194,
"grad_norm": 0.16886277496814728,
"learning_rate": 0.0002,
"loss": 0.5095133185386658,
"mean_token_accuracy": 0.7930570840835571,
"num_tokens": 9495042.0,
"step": 581
},
{
"entropy": 0.5041064321994781,
"epoch": 2.171641791044776,
"grad_norm": 0.1545090675354004,
"learning_rate": 0.0002,
"loss": 0.5001657605171204,
"mean_token_accuracy": 0.7950020581483841,
"num_tokens": 9511399.0,
"step": 582
},
{
"entropy": 0.533274233341217,
"epoch": 2.175373134328358,
"grad_norm": 0.15395475924015045,
"learning_rate": 0.0002,
"loss": 0.5321199893951416,
"mean_token_accuracy": 0.7817400395870209,
"num_tokens": 9527796.0,
"step": 583
},
{
"entropy": 0.5225674957036972,
"epoch": 2.1791044776119404,
"grad_norm": 0.1874343305826187,
"learning_rate": 0.0002,
"loss": 0.5301029682159424,
"mean_token_accuracy": 0.7839690893888474,
"num_tokens": 9544098.0,
"step": 584
},
{
"entropy": 0.5206504017114639,
"epoch": 2.1828358208955225,
"grad_norm": 0.18132635951042175,
"learning_rate": 0.0002,
"loss": 0.5191587209701538,
"mean_token_accuracy": 0.7905547767877579,
"num_tokens": 9560486.0,
"step": 585
},
{
"entropy": 0.5231298729777336,
"epoch": 2.1865671641791047,
"grad_norm": 0.19394823908805847,
"learning_rate": 0.0002,
"loss": 0.5234656929969788,
"mean_token_accuracy": 0.7889635264873505,
"num_tokens": 9576893.0,
"step": 586
},
{
"entropy": 0.4975113570690155,
"epoch": 2.1902985074626864,
"grad_norm": 0.1897096484899521,
"learning_rate": 0.0002,
"loss": 0.5067098736763,
"mean_token_accuracy": 0.7950832843780518,
"num_tokens": 9593176.0,
"step": 587
},
{
"entropy": 0.5182362198829651,
"epoch": 2.1940298507462686,
"grad_norm": 0.21101859211921692,
"learning_rate": 0.0002,
"loss": 0.5240258574485779,
"mean_token_accuracy": 0.7852578610181808,
"num_tokens": 9609529.0,
"step": 588
},
{
"entropy": 0.5308810174465179,
"epoch": 2.1977611940298507,
"grad_norm": 0.15612205862998962,
"learning_rate": 0.0002,
"loss": 0.5230595469474792,
"mean_token_accuracy": 0.7886761873960495,
"num_tokens": 9626018.0,
"step": 589
},
{
"entropy": 0.5405040681362152,
"epoch": 2.201492537313433,
"grad_norm": 0.16354262828826904,
"learning_rate": 0.0002,
"loss": 0.5339536666870117,
"mean_token_accuracy": 0.7827159017324448,
"num_tokens": 9642340.0,
"step": 590
},
{
"entropy": 0.5320803225040436,
"epoch": 2.205223880597015,
"grad_norm": 0.1848597228527069,
"learning_rate": 0.0002,
"loss": 0.5349913835525513,
"mean_token_accuracy": 0.7858193665742874,
"num_tokens": 9658780.0,
"step": 591
},
{
"entropy": 0.5458312928676605,
"epoch": 2.208955223880597,
"grad_norm": 0.16995884478092194,
"learning_rate": 0.0002,
"loss": 0.5466773509979248,
"mean_token_accuracy": 0.7766650468111038,
"num_tokens": 9675184.0,
"step": 592
},
{
"entropy": 0.520288422703743,
"epoch": 2.2126865671641793,
"grad_norm": 0.17533989250659943,
"learning_rate": 0.0002,
"loss": 0.5276610851287842,
"mean_token_accuracy": 0.7833162993192673,
"num_tokens": 9691587.0,
"step": 593
},
{
"entropy": 0.5230257883667946,
"epoch": 2.216417910447761,
"grad_norm": 0.1576543152332306,
"learning_rate": 0.0002,
"loss": 0.5214830040931702,
"mean_token_accuracy": 0.7887468189001083,
"num_tokens": 9707639.0,
"step": 594
},
{
"entropy": 0.5276977717876434,
"epoch": 2.220149253731343,
"grad_norm": 0.16972552239894867,
"learning_rate": 0.0002,
"loss": 0.5270232558250427,
"mean_token_accuracy": 0.7899148017168045,
"num_tokens": 9723826.0,
"step": 595
},
{
"entropy": 0.5177433490753174,
"epoch": 2.2238805970149254,
"grad_norm": 0.17887970805168152,
"learning_rate": 0.0002,
"loss": 0.5160896182060242,
"mean_token_accuracy": 0.7925579845905304,
"num_tokens": 9740088.0,
"step": 596
},
{
"entropy": 0.525688573718071,
"epoch": 2.2276119402985075,
"grad_norm": 0.1659506857395172,
"learning_rate": 0.0002,
"loss": 0.5277712345123291,
"mean_token_accuracy": 0.7854456752538681,
"num_tokens": 9756214.0,
"step": 597
},
{
"entropy": 0.5137215405702591,
"epoch": 2.2313432835820897,
"grad_norm": 0.18150706589221954,
"learning_rate": 0.0002,
"loss": 0.5194687247276306,
"mean_token_accuracy": 0.7904618233442307,
"num_tokens": 9772511.0,
"step": 598
},
{
"entropy": 0.529701828956604,
"epoch": 2.235074626865672,
"grad_norm": 0.17603962123394012,
"learning_rate": 0.0002,
"loss": 0.5309550166130066,
"mean_token_accuracy": 0.7836979478597641,
"num_tokens": 9788956.0,
"step": 599
},
{
"entropy": 0.5346364378929138,
"epoch": 2.2388059701492535,
"grad_norm": 0.17556419968605042,
"learning_rate": 0.0002,
"loss": 0.5340572595596313,
"mean_token_accuracy": 0.7827766090631485,
"num_tokens": 9805350.0,
"step": 600
},
{
"entropy": 0.5358438938856125,
"epoch": 2.2425373134328357,
"grad_norm": 0.19660161435604095,
"learning_rate": 0.0002,
"loss": 0.5320678353309631,
"mean_token_accuracy": 0.7855796813964844,
"num_tokens": 9821744.0,
"step": 601
},
{
"entropy": 0.5096235424280167,
"epoch": 2.246268656716418,
"grad_norm": 0.15900631248950958,
"learning_rate": 0.0002,
"loss": 0.5056334137916565,
"mean_token_accuracy": 0.7966822683811188,
"num_tokens": 9837824.0,
"step": 602
},
{
"entropy": 0.5357042700052261,
"epoch": 2.25,
"grad_norm": 0.1657211184501648,
"learning_rate": 0.0002,
"loss": 0.5354617238044739,
"mean_token_accuracy": 0.7830197513103485,
"num_tokens": 9854305.0,
"step": 603
},
{
"entropy": 0.5109390839934349,
"epoch": 2.253731343283582,
"grad_norm": 0.1763714998960495,
"learning_rate": 0.0002,
"loss": 0.5157687664031982,
"mean_token_accuracy": 0.7923711538314819,
"num_tokens": 9870793.0,
"step": 604
},
{
"entropy": 0.5191235095262527,
"epoch": 2.2574626865671643,
"grad_norm": 0.20325957238674164,
"learning_rate": 0.0002,
"loss": 0.5273858308792114,
"mean_token_accuracy": 0.7857847660779953,
"num_tokens": 9887144.0,
"step": 605
},
{
"entropy": 0.5128894448280334,
"epoch": 2.2611940298507465,
"grad_norm": 0.18303951621055603,
"learning_rate": 0.0002,
"loss": 0.5150971412658691,
"mean_token_accuracy": 0.7911935448646545,
"num_tokens": 9903362.0,
"step": 606
},
{
"entropy": 0.518405131995678,
"epoch": 2.264925373134328,
"grad_norm": 0.16138286888599396,
"learning_rate": 0.0002,
"loss": 0.5196152925491333,
"mean_token_accuracy": 0.7916755676269531,
"num_tokens": 9919665.0,
"step": 607
},
{
"entropy": 0.5238161385059357,
"epoch": 2.2686567164179103,
"grad_norm": 0.15336841344833374,
"learning_rate": 0.0002,
"loss": 0.5234584808349609,
"mean_token_accuracy": 0.7885531485080719,
"num_tokens": 9936204.0,
"step": 608
},
{
"entropy": 0.5139288082718849,
"epoch": 2.2723880597014925,
"grad_norm": 0.15460564196109772,
"learning_rate": 0.0002,
"loss": 0.516942024230957,
"mean_token_accuracy": 0.7878196388483047,
"num_tokens": 9952444.0,
"step": 609
},
{
"entropy": 0.5144378393888474,
"epoch": 2.2761194029850746,
"grad_norm": 0.16456560790538788,
"learning_rate": 0.0002,
"loss": 0.5143165588378906,
"mean_token_accuracy": 0.7900296002626419,
"num_tokens": 9968772.0,
"step": 610
},
{
"entropy": 0.5115328878164291,
"epoch": 2.279850746268657,
"grad_norm": 0.17883925139904022,
"learning_rate": 0.0002,
"loss": 0.5190625190734863,
"mean_token_accuracy": 0.7872501909732819,
"num_tokens": 9985174.0,
"step": 611
},
{
"entropy": 0.535979226231575,
"epoch": 2.283582089552239,
"grad_norm": 0.1744793951511383,
"learning_rate": 0.0002,
"loss": 0.5318659543991089,
"mean_token_accuracy": 0.7878114283084869,
"num_tokens": 10001610.0,
"step": 612
},
{
"entropy": 0.5348420441150665,
"epoch": 2.2873134328358207,
"grad_norm": 0.17023774981498718,
"learning_rate": 0.0002,
"loss": 0.5370223522186279,
"mean_token_accuracy": 0.783968135714531,
"num_tokens": 10017829.0,
"step": 613
},
{
"entropy": 0.5138903260231018,
"epoch": 2.291044776119403,
"grad_norm": 0.17115749418735504,
"learning_rate": 0.0002,
"loss": 0.5157005190849304,
"mean_token_accuracy": 0.7915801256895065,
"num_tokens": 10034135.0,
"step": 614
},
{
"entropy": 0.514953039586544,
"epoch": 2.294776119402985,
"grad_norm": 0.1999882459640503,
"learning_rate": 0.0002,
"loss": 0.5170516967773438,
"mean_token_accuracy": 0.7916076630353928,
"num_tokens": 10050500.0,
"step": 615
},
{
"entropy": 0.5247506201267242,
"epoch": 2.298507462686567,
"grad_norm": 0.16434574127197266,
"learning_rate": 0.0002,
"loss": 0.5179375410079956,
"mean_token_accuracy": 0.7906480133533478,
"num_tokens": 10066822.0,
"step": 616
},
{
"entropy": 0.5195427983999252,
"epoch": 2.3022388059701493,
"grad_norm": 0.16079425811767578,
"learning_rate": 0.0002,
"loss": 0.5192772746086121,
"mean_token_accuracy": 0.788419172167778,
"num_tokens": 10083211.0,
"step": 617
},
{
"entropy": 0.5161983221769333,
"epoch": 2.3059701492537314,
"grad_norm": 0.15893937647342682,
"learning_rate": 0.0002,
"loss": 0.5151652097702026,
"mean_token_accuracy": 0.7913366705179214,
"num_tokens": 10099502.0,
"step": 618
},
{
"entropy": 0.5129862576723099,
"epoch": 2.3097014925373136,
"grad_norm": 0.1990455985069275,
"learning_rate": 0.0002,
"loss": 0.5226958394050598,
"mean_token_accuracy": 0.7890161275863647,
"num_tokens": 10115875.0,
"step": 619
},
{
"entropy": 0.5259782820940018,
"epoch": 2.3134328358208958,
"grad_norm": 0.17600762844085693,
"learning_rate": 0.0002,
"loss": 0.5303045511245728,
"mean_token_accuracy": 0.784588485956192,
"num_tokens": 10132329.0,
"step": 620
},
{
"entropy": 0.5374605804681778,
"epoch": 2.3171641791044775,
"grad_norm": 0.15160205960273743,
"learning_rate": 0.0002,
"loss": 0.5319960117340088,
"mean_token_accuracy": 0.7856357097625732,
"num_tokens": 10148660.0,
"step": 621
},
{
"entropy": 0.5202681869268417,
"epoch": 2.3208955223880596,
"grad_norm": 0.17217791080474854,
"learning_rate": 0.0002,
"loss": 0.513685405254364,
"mean_token_accuracy": 0.7912963330745697,
"num_tokens": 10164847.0,
"step": 622
},
{
"entropy": 0.5351561158895493,
"epoch": 2.324626865671642,
"grad_norm": 0.16189849376678467,
"learning_rate": 0.0002,
"loss": 0.5341706275939941,
"mean_token_accuracy": 0.7827345281839371,
"num_tokens": 10181330.0,
"step": 623
},
{
"entropy": 0.5096163898706436,
"epoch": 2.328358208955224,
"grad_norm": 0.17251546680927277,
"learning_rate": 0.0002,
"loss": 0.5183389186859131,
"mean_token_accuracy": 0.7891778647899628,
"num_tokens": 10197593.0,
"step": 624
},
{
"entropy": 0.5043528005480766,
"epoch": 2.332089552238806,
"grad_norm": 0.19364336133003235,
"learning_rate": 0.0002,
"loss": 0.5169776082038879,
"mean_token_accuracy": 0.792061522603035,
"num_tokens": 10213821.0,
"step": 625
},
{
"entropy": 0.5118814930319786,
"epoch": 2.3358208955223883,
"grad_norm": 0.21755088865756989,
"learning_rate": 0.0002,
"loss": 0.5260127782821655,
"mean_token_accuracy": 0.7870439440011978,
"num_tokens": 10229959.0,
"step": 626
},
{
"entropy": 0.5387731194496155,
"epoch": 2.33955223880597,
"grad_norm": 0.15599676966667175,
"learning_rate": 0.0002,
"loss": 0.5359347462654114,
"mean_token_accuracy": 0.7821696400642395,
"num_tokens": 10246325.0,
"step": 627
},
{
"entropy": 0.5259936600923538,
"epoch": 2.343283582089552,
"grad_norm": 0.17784081399440765,
"learning_rate": 0.0002,
"loss": 0.5117411613464355,
"mean_token_accuracy": 0.7913538813591003,
"num_tokens": 10262854.0,
"step": 628
},
{
"entropy": 0.5261276811361313,
"epoch": 2.3470149253731343,
"grad_norm": 0.15290921926498413,
"learning_rate": 0.0002,
"loss": 0.5141685009002686,
"mean_token_accuracy": 0.7897167503833771,
"num_tokens": 10279167.0,
"step": 629
},
{
"entropy": 0.516872301697731,
"epoch": 2.3507462686567164,
"grad_norm": 0.16548150777816772,
"learning_rate": 0.0002,
"loss": 0.518975555896759,
"mean_token_accuracy": 0.7876042425632477,
"num_tokens": 10295367.0,
"step": 630
},
{
"entropy": 0.5166520774364471,
"epoch": 2.3544776119402986,
"grad_norm": 0.2100355476140976,
"learning_rate": 0.0002,
"loss": 0.5216490030288696,
"mean_token_accuracy": 0.7918855249881744,
"num_tokens": 10311818.0,
"step": 631
},
{
"entropy": 0.5158288031816483,
"epoch": 2.3582089552238807,
"grad_norm": 0.19722220301628113,
"learning_rate": 0.0002,
"loss": 0.5301001667976379,
"mean_token_accuracy": 0.785649761557579,
"num_tokens": 10328226.0,
"step": 632
},
{
"entropy": 0.5121333077549934,
"epoch": 2.361940298507463,
"grad_norm": 0.18101061880588531,
"learning_rate": 0.0002,
"loss": 0.514575719833374,
"mean_token_accuracy": 0.7912623584270477,
"num_tokens": 10344492.0,
"step": 633
},
{
"entropy": 0.5286690294742584,
"epoch": 2.3656716417910446,
"grad_norm": 0.18992973864078522,
"learning_rate": 0.0002,
"loss": 0.5238395929336548,
"mean_token_accuracy": 0.7872939556837082,
"num_tokens": 10360763.0,
"step": 634
},
{
"entropy": 0.504866473376751,
"epoch": 2.3694029850746268,
"grad_norm": 0.17053747177124023,
"learning_rate": 0.0002,
"loss": 0.5018288493156433,
"mean_token_accuracy": 0.7963565587997437,
"num_tokens": 10376794.0,
"step": 635
},
{
"entropy": 0.5348407328128815,
"epoch": 2.373134328358209,
"grad_norm": 0.1969325840473175,
"learning_rate": 0.0002,
"loss": 0.5392089486122131,
"mean_token_accuracy": 0.781823992729187,
"num_tokens": 10393125.0,
"step": 636
},
{
"entropy": 0.5291974544525146,
"epoch": 2.376865671641791,
"grad_norm": 0.19346994161605835,
"learning_rate": 0.0002,
"loss": 0.5330736637115479,
"mean_token_accuracy": 0.781773254275322,
"num_tokens": 10409537.0,
"step": 637
},
{
"entropy": 0.5348323583602905,
"epoch": 2.3805970149253732,
"grad_norm": 0.18969298899173737,
"learning_rate": 0.0002,
"loss": 0.5274794101715088,
"mean_token_accuracy": 0.787670373916626,
"num_tokens": 10425973.0,
"step": 638
},
{
"entropy": 0.5205499678850174,
"epoch": 2.3843283582089554,
"grad_norm": 0.17864486575126648,
"learning_rate": 0.0002,
"loss": 0.5213812589645386,
"mean_token_accuracy": 0.7890082150697708,
"num_tokens": 10442180.0,
"step": 639
},
{
"entropy": 0.528412714600563,
"epoch": 2.388059701492537,
"grad_norm": 0.1959443986415863,
"learning_rate": 0.0002,
"loss": 0.534969687461853,
"mean_token_accuracy": 0.7831798046827316,
"num_tokens": 10458477.0,
"step": 640
},
{
"entropy": 0.5136244520545006,
"epoch": 2.3917910447761193,
"grad_norm": 0.20498400926589966,
"learning_rate": 0.0002,
"loss": 0.511573314666748,
"mean_token_accuracy": 0.7939646393060684,
"num_tokens": 10475023.0,
"step": 641
},
{
"entropy": 0.5202098488807678,
"epoch": 2.3955223880597014,
"grad_norm": 0.20506030321121216,
"learning_rate": 0.0002,
"loss": 0.5162352919578552,
"mean_token_accuracy": 0.7906180173158646,
"num_tokens": 10491313.0,
"step": 642
},
{
"entropy": 0.5307043790817261,
"epoch": 2.3992537313432836,
"grad_norm": 0.17971979081630707,
"learning_rate": 0.0002,
"loss": 0.5288392305374146,
"mean_token_accuracy": 0.7879067957401276,
"num_tokens": 10507682.0,
"step": 643
},
{
"entropy": 0.5393616259098053,
"epoch": 2.4029850746268657,
"grad_norm": 0.23341259360313416,
"learning_rate": 0.0002,
"loss": 0.5383281707763672,
"mean_token_accuracy": 0.781504288315773,
"num_tokens": 10524138.0,
"step": 644
},
{
"entropy": 0.5379284471273422,
"epoch": 2.406716417910448,
"grad_norm": 0.16890797019004822,
"learning_rate": 0.0002,
"loss": 0.5414294004440308,
"mean_token_accuracy": 0.7795721143484116,
"num_tokens": 10540308.0,
"step": 645
},
{
"entropy": 0.5295774638652802,
"epoch": 2.41044776119403,
"grad_norm": 0.2540934085845947,
"learning_rate": 0.0002,
"loss": 0.5318943858146667,
"mean_token_accuracy": 0.7859358042478561,
"num_tokens": 10556760.0,
"step": 646
},
{
"entropy": 0.5170229598879814,
"epoch": 2.4141791044776117,
"grad_norm": 0.16737528145313263,
"learning_rate": 0.0002,
"loss": 0.517413318157196,
"mean_token_accuracy": 0.7901816219091415,
"num_tokens": 10573293.0,
"step": 647
},
{
"entropy": 0.526155412197113,
"epoch": 2.417910447761194,
"grad_norm": 0.2225574254989624,
"learning_rate": 0.0002,
"loss": 0.529864490032196,
"mean_token_accuracy": 0.7856150567531586,
"num_tokens": 10589674.0,
"step": 648
},
{
"entropy": 0.5266731381416321,
"epoch": 2.421641791044776,
"grad_norm": 0.16272951662540436,
"learning_rate": 0.0002,
"loss": 0.5234624743461609,
"mean_token_accuracy": 0.7885357886552811,
"num_tokens": 10606101.0,
"step": 649
},
{
"entropy": 0.5251661986112595,
"epoch": 2.425373134328358,
"grad_norm": 0.17834821343421936,
"learning_rate": 0.0002,
"loss": 0.5261815190315247,
"mean_token_accuracy": 0.7859483957290649,
"num_tokens": 10622240.0,
"step": 650
},
{
"entropy": 0.5259936600923538,
"epoch": 2.4291044776119404,
"grad_norm": 0.16211281716823578,
"learning_rate": 0.0002,
"loss": 0.5267058610916138,
"mean_token_accuracy": 0.7840430587530136,
"num_tokens": 10638728.0,
"step": 651
},
{
"entropy": 0.5017556846141815,
"epoch": 2.4328358208955225,
"grad_norm": 0.3111971616744995,
"learning_rate": 0.0002,
"loss": 0.5085122585296631,
"mean_token_accuracy": 0.7949473708868027,
"num_tokens": 10654954.0,
"step": 652
},
{
"entropy": 0.53680419921875,
"epoch": 2.4365671641791042,
"grad_norm": 0.17920435965061188,
"learning_rate": 0.0002,
"loss": 0.5438150763511658,
"mean_token_accuracy": 0.7806514501571655,
"num_tokens": 10671142.0,
"step": 653
},
{
"entropy": 0.5328411310911179,
"epoch": 2.4402985074626864,
"grad_norm": 0.36842888593673706,
"learning_rate": 0.0002,
"loss": 0.5365176200866699,
"mean_token_accuracy": 0.7864848077297211,
"num_tokens": 10687527.0,
"step": 654
},
{
"entropy": 0.5214048027992249,
"epoch": 2.4440298507462686,
"grad_norm": 0.15488730370998383,
"learning_rate": 0.0002,
"loss": 0.5212221145629883,
"mean_token_accuracy": 0.7904541194438934,
"num_tokens": 10703637.0,
"step": 655
},
{
"entropy": 0.5198699831962585,
"epoch": 2.4477611940298507,
"grad_norm": 0.17918945848941803,
"learning_rate": 0.0002,
"loss": 0.5142287015914917,
"mean_token_accuracy": 0.7930866479873657,
"num_tokens": 10719755.0,
"step": 656
},
{
"entropy": 0.5371468216180801,
"epoch": 2.451492537313433,
"grad_norm": 0.17966963350772858,
"learning_rate": 0.0002,
"loss": 0.5387783050537109,
"mean_token_accuracy": 0.7836030423641205,
"num_tokens": 10736159.0,
"step": 657
},
{
"entropy": 0.523772120475769,
"epoch": 2.455223880597015,
"grad_norm": 0.17708872258663177,
"learning_rate": 0.0002,
"loss": 0.5304325819015503,
"mean_token_accuracy": 0.7857228368520737,
"num_tokens": 10752300.0,
"step": 658
},
{
"entropy": 0.5180701240897179,
"epoch": 2.458955223880597,
"grad_norm": 0.18428592383861542,
"learning_rate": 0.0002,
"loss": 0.5193667411804199,
"mean_token_accuracy": 0.7911625355482101,
"num_tokens": 10768483.0,
"step": 659
},
{
"entropy": 0.528245247900486,
"epoch": 2.4626865671641793,
"grad_norm": 0.1747596561908722,
"learning_rate": 0.0002,
"loss": 0.5231127142906189,
"mean_token_accuracy": 0.7906267046928406,
"num_tokens": 10784872.0,
"step": 660
},
{
"entropy": 0.5145193934440613,
"epoch": 2.466417910447761,
"grad_norm": 0.16311223804950714,
"learning_rate": 0.0002,
"loss": 0.5083698630332947,
"mean_token_accuracy": 0.7954908460378647,
"num_tokens": 10801264.0,
"step": 661
},
{
"entropy": 0.5249892026185989,
"epoch": 2.470149253731343,
"grad_norm": 0.15471886098384857,
"learning_rate": 0.0002,
"loss": 0.5246090292930603,
"mean_token_accuracy": 0.7875058203935623,
"num_tokens": 10817509.0,
"step": 662
},
{
"entropy": 0.5209084749221802,
"epoch": 2.4738805970149254,
"grad_norm": 0.17972545325756073,
"learning_rate": 0.0002,
"loss": 0.5200228095054626,
"mean_token_accuracy": 0.7910773009061813,
"num_tokens": 10833875.0,
"step": 663
},
{
"entropy": 0.5148312151432037,
"epoch": 2.4776119402985075,
"grad_norm": 0.20573753118515015,
"learning_rate": 0.0002,
"loss": 0.5257189273834229,
"mean_token_accuracy": 0.7857212275266647,
"num_tokens": 10849915.0,
"step": 664
},
{
"entropy": 0.5218161419034004,
"epoch": 2.4813432835820897,
"grad_norm": 0.18017825484275818,
"learning_rate": 0.0002,
"loss": 0.5281471014022827,
"mean_token_accuracy": 0.7845035791397095,
"num_tokens": 10866228.0,
"step": 665
},
{
"entropy": 0.5220426917076111,
"epoch": 2.485074626865672,
"grad_norm": 0.16190138459205627,
"learning_rate": 0.0002,
"loss": 0.521308183670044,
"mean_token_accuracy": 0.7905032187700272,
"num_tokens": 10882941.0,
"step": 666
},
{
"entropy": 0.5130190551280975,
"epoch": 2.4888059701492535,
"grad_norm": 0.17984949052333832,
"learning_rate": 0.0002,
"loss": 0.5067973732948303,
"mean_token_accuracy": 0.7954512685537338,
"num_tokens": 10899165.0,
"step": 667
},
{
"entropy": 0.5297238677740097,
"epoch": 2.4925373134328357,
"grad_norm": 0.15996725857257843,
"learning_rate": 0.0002,
"loss": 0.5296366810798645,
"mean_token_accuracy": 0.785218670964241,
"num_tokens": 10915443.0,
"step": 668
},
{
"entropy": 0.4974808022379875,
"epoch": 2.496268656716418,
"grad_norm": 0.1793019324541092,
"learning_rate": 0.0002,
"loss": 0.4990445077419281,
"mean_token_accuracy": 0.7966191321611404,
"num_tokens": 10931711.0,
"step": 669
},
{
"entropy": 0.5239012390375137,
"epoch": 2.5,
"grad_norm": 0.19087010622024536,
"learning_rate": 0.0002,
"loss": 0.5348339676856995,
"mean_token_accuracy": 0.7859302014112473,
"num_tokens": 10948023.0,
"step": 670
},
{
"entropy": 0.502729706466198,
"epoch": 2.503731343283582,
"grad_norm": 0.17360597848892212,
"learning_rate": 0.0002,
"loss": 0.5077179074287415,
"mean_token_accuracy": 0.7953527718782425,
"num_tokens": 10964233.0,
"step": 671
},
{
"entropy": 0.5206915363669395,
"epoch": 2.5074626865671643,
"grad_norm": 0.19746483862400055,
"learning_rate": 0.0002,
"loss": 0.5238724946975708,
"mean_token_accuracy": 0.7870853841304779,
"num_tokens": 10980379.0,
"step": 672
},
{
"entropy": 0.5450692474842072,
"epoch": 2.5111940298507465,
"grad_norm": 0.20202518999576569,
"learning_rate": 0.0002,
"loss": 0.5349087119102478,
"mean_token_accuracy": 0.7814089059829712,
"num_tokens": 10996761.0,
"step": 673
},
{
"entropy": 0.5313533395528793,
"epoch": 2.5149253731343286,
"grad_norm": 0.16622328758239746,
"learning_rate": 0.0002,
"loss": 0.5273463726043701,
"mean_token_accuracy": 0.7876841723918915,
"num_tokens": 11013002.0,
"step": 674
},
{
"entropy": 0.5233149528503418,
"epoch": 2.5186567164179103,
"grad_norm": 0.1762213557958603,
"learning_rate": 0.0002,
"loss": 0.5284275412559509,
"mean_token_accuracy": 0.7885796874761581,
"num_tokens": 11029461.0,
"step": 675
},
{
"entropy": 0.5161427110433578,
"epoch": 2.5223880597014925,
"grad_norm": 0.1734134405851364,
"learning_rate": 0.0002,
"loss": 0.5218281149864197,
"mean_token_accuracy": 0.7900317013263702,
"num_tokens": 11045513.0,
"step": 676
},
{
"entropy": 0.527386263012886,
"epoch": 2.5261194029850746,
"grad_norm": 0.18649046123027802,
"learning_rate": 0.0002,
"loss": 0.5264036655426025,
"mean_token_accuracy": 0.7881919145584106,
"num_tokens": 11061764.0,
"step": 677
},
{
"entropy": 0.5335260331630707,
"epoch": 2.529850746268657,
"grad_norm": 0.16608470678329468,
"learning_rate": 0.0002,
"loss": 0.5327720046043396,
"mean_token_accuracy": 0.7845087051391602,
"num_tokens": 11077973.0,
"step": 678
},
{
"entropy": 0.5215242803096771,
"epoch": 2.533582089552239,
"grad_norm": 0.16991843283176422,
"learning_rate": 0.0002,
"loss": 0.5201636552810669,
"mean_token_accuracy": 0.7907481640577316,
"num_tokens": 11094025.0,
"step": 679
},
{
"entropy": 0.5226395204663277,
"epoch": 2.5373134328358207,
"grad_norm": 0.16204343736171722,
"learning_rate": 0.0002,
"loss": 0.5192615389823914,
"mean_token_accuracy": 0.7913714349269867,
"num_tokens": 11110340.0,
"step": 680
},
{
"entropy": 0.5280646532773972,
"epoch": 2.541044776119403,
"grad_norm": 0.17025548219680786,
"learning_rate": 0.0002,
"loss": 0.5243014097213745,
"mean_token_accuracy": 0.7887150794267654,
"num_tokens": 11126766.0,
"step": 681
},
{
"entropy": 0.5295235440135002,
"epoch": 2.544776119402985,
"grad_norm": 0.17332811653614044,
"learning_rate": 0.0002,
"loss": 0.5264289975166321,
"mean_token_accuracy": 0.7893000990152359,
"num_tokens": 11143383.0,
"step": 682
},
{
"entropy": 0.5350908041000366,
"epoch": 2.548507462686567,
"grad_norm": 0.16494929790496826,
"learning_rate": 0.0002,
"loss": 0.5385511517524719,
"mean_token_accuracy": 0.7832952737808228,
"num_tokens": 11159798.0,
"step": 683
},
{
"entropy": 0.5189319550991058,
"epoch": 2.5522388059701493,
"grad_norm": 0.1749635636806488,
"learning_rate": 0.0002,
"loss": 0.5244334936141968,
"mean_token_accuracy": 0.7889615148305893,
"num_tokens": 11176116.0,
"step": 684
},
{
"entropy": 0.5297338515520096,
"epoch": 2.5559701492537314,
"grad_norm": 0.16473545134067535,
"learning_rate": 0.0002,
"loss": 0.5357664227485657,
"mean_token_accuracy": 0.7839798331260681,
"num_tokens": 11192242.0,
"step": 685
},
{
"entropy": 0.5161855816841125,
"epoch": 2.5597014925373136,
"grad_norm": 0.19246211647987366,
"learning_rate": 0.0002,
"loss": 0.5211361050605774,
"mean_token_accuracy": 0.790752574801445,
"num_tokens": 11208617.0,
"step": 686
},
{
"entropy": 0.539324015378952,
"epoch": 2.5634328358208958,
"grad_norm": 0.16890385746955872,
"learning_rate": 0.0002,
"loss": 0.5382983684539795,
"mean_token_accuracy": 0.7826134711503983,
"num_tokens": 11225201.0,
"step": 687
},
{
"entropy": 0.5158891677856445,
"epoch": 2.5671641791044775,
"grad_norm": 0.16682742536067963,
"learning_rate": 0.0002,
"loss": 0.5142616629600525,
"mean_token_accuracy": 0.7918410003185272,
"num_tokens": 11241695.0,
"step": 688
},
{
"entropy": 0.5267701372504234,
"epoch": 2.5708955223880596,
"grad_norm": 0.1687549650669098,
"learning_rate": 0.0002,
"loss": 0.5238382816314697,
"mean_token_accuracy": 0.7866890728473663,
"num_tokens": 11258089.0,
"step": 689
},
{
"entropy": 0.5255937725305557,
"epoch": 2.574626865671642,
"grad_norm": 0.1738496869802475,
"learning_rate": 0.0002,
"loss": 0.5248072147369385,
"mean_token_accuracy": 0.7852340638637543,
"num_tokens": 11274450.0,
"step": 690
},
{
"entropy": 0.5198262184858322,
"epoch": 2.578358208955224,
"grad_norm": 0.1690807044506073,
"learning_rate": 0.0002,
"loss": 0.5270042419433594,
"mean_token_accuracy": 0.7855731099843979,
"num_tokens": 11290865.0,
"step": 691
},
{
"entropy": 0.5405410379171371,
"epoch": 2.582089552238806,
"grad_norm": 0.18134285509586334,
"learning_rate": 0.0002,
"loss": 0.5444961786270142,
"mean_token_accuracy": 0.7780175656080246,
"num_tokens": 11307409.0,
"step": 692
},
{
"entropy": 0.5347141325473785,
"epoch": 2.585820895522388,
"grad_norm": 0.1676827371120453,
"learning_rate": 0.0002,
"loss": 0.5311787724494934,
"mean_token_accuracy": 0.784485325217247,
"num_tokens": 11323946.0,
"step": 693
},
{
"entropy": 0.503664955496788,
"epoch": 2.58955223880597,
"grad_norm": 0.17767618596553802,
"learning_rate": 0.0002,
"loss": 0.5004390478134155,
"mean_token_accuracy": 0.7965147197246552,
"num_tokens": 11340062.0,
"step": 694
},
{
"entropy": 0.5400541573762894,
"epoch": 2.593283582089552,
"grad_norm": 0.17085346579551697,
"learning_rate": 0.0002,
"loss": 0.5395094156265259,
"mean_token_accuracy": 0.781545028090477,
"num_tokens": 11356660.0,
"step": 695
},
{
"entropy": 0.5177017226815224,
"epoch": 2.5970149253731343,
"grad_norm": 0.169759601354599,
"learning_rate": 0.0002,
"loss": 0.515388011932373,
"mean_token_accuracy": 0.7907217293977737,
"num_tokens": 11372963.0,
"step": 696
},
{
"entropy": 0.5352813154459,
"epoch": 2.6007462686567164,
"grad_norm": 0.17281876504421234,
"learning_rate": 0.0002,
"loss": 0.5351260900497437,
"mean_token_accuracy": 0.7841326892375946,
"num_tokens": 11389640.0,
"step": 697
},
{
"entropy": 0.5045363381505013,
"epoch": 2.6044776119402986,
"grad_norm": 0.18615856766700745,
"learning_rate": 0.0002,
"loss": 0.5119503736495972,
"mean_token_accuracy": 0.7933619618415833,
"num_tokens": 11405795.0,
"step": 698
},
{
"entropy": 0.521905705332756,
"epoch": 2.6082089552238807,
"grad_norm": 0.18743987381458282,
"learning_rate": 0.0002,
"loss": 0.5299134850502014,
"mean_token_accuracy": 0.7850409299135208,
"num_tokens": 11422045.0,
"step": 699
},
{
"entropy": 0.5174702405929565,
"epoch": 2.611940298507463,
"grad_norm": 0.17414018511772156,
"learning_rate": 0.0002,
"loss": 0.5177151560783386,
"mean_token_accuracy": 0.7891951948404312,
"num_tokens": 11438392.0,
"step": 700
},
{
"entropy": 0.5343185365200043,
"epoch": 2.6156716417910446,
"grad_norm": 0.17761462926864624,
"learning_rate": 0.0002,
"loss": 0.5284934043884277,
"mean_token_accuracy": 0.7868274599313736,
"num_tokens": 11455009.0,
"step": 701
},
{
"entropy": 0.53134885430336,
"epoch": 2.6194029850746268,
"grad_norm": 0.16672612726688385,
"learning_rate": 0.0002,
"loss": 0.5203122496604919,
"mean_token_accuracy": 0.7913379818201065,
"num_tokens": 11471341.0,
"step": 702
},
{
"entropy": 0.523793414235115,
"epoch": 2.623134328358209,
"grad_norm": 0.15720658004283905,
"learning_rate": 0.0002,
"loss": 0.5188941359519958,
"mean_token_accuracy": 0.7898289412260056,
"num_tokens": 11487565.0,
"step": 703
},
{
"entropy": 0.5335910320281982,
"epoch": 2.626865671641791,
"grad_norm": 0.18207021057605743,
"learning_rate": 0.0002,
"loss": 0.5383012294769287,
"mean_token_accuracy": 0.7841922044754028,
"num_tokens": 11503932.0,
"step": 704
},
{
"entropy": 0.5070014595985413,
"epoch": 2.6305970149253732,
"grad_norm": 0.18818838894367218,
"learning_rate": 0.0002,
"loss": 0.521304726600647,
"mean_token_accuracy": 0.7882455736398697,
"num_tokens": 11519876.0,
"step": 705
},
{
"entropy": 0.5179764032363892,
"epoch": 2.6343283582089554,
"grad_norm": 0.16391263902187347,
"learning_rate": 0.0002,
"loss": 0.5277372598648071,
"mean_token_accuracy": 0.7888714224100113,
"num_tokens": 11536317.0,
"step": 706
},
{
"entropy": 0.5383756011724472,
"epoch": 2.638059701492537,
"grad_norm": 0.20110981166362762,
"learning_rate": 0.0002,
"loss": 0.5405253171920776,
"mean_token_accuracy": 0.7808063477277756,
"num_tokens": 11552655.0,
"step": 707
},
{
"entropy": 0.5268357321619987,
"epoch": 2.6417910447761193,
"grad_norm": 0.17326846718788147,
"learning_rate": 0.0002,
"loss": 0.5239301919937134,
"mean_token_accuracy": 0.7901074439287186,
"num_tokens": 11568724.0,
"step": 708
},
{
"entropy": 0.5407274663448334,
"epoch": 2.6455223880597014,
"grad_norm": 0.16851350665092468,
"learning_rate": 0.0002,
"loss": 0.5350074172019958,
"mean_token_accuracy": 0.7861216068267822,
"num_tokens": 11585225.0,
"step": 709
},
{
"entropy": 0.5268073230981827,
"epoch": 2.6492537313432836,
"grad_norm": 0.19633817672729492,
"learning_rate": 0.0002,
"loss": 0.5214436054229736,
"mean_token_accuracy": 0.7898468226194382,
"num_tokens": 11601498.0,
"step": 710
},
{
"entropy": 0.535712480545044,
"epoch": 2.6529850746268657,
"grad_norm": 0.15659253299236298,
"learning_rate": 0.0002,
"loss": 0.5353400707244873,
"mean_token_accuracy": 0.7835351228713989,
"num_tokens": 11617811.0,
"step": 711
},
{
"entropy": 0.539536863565445,
"epoch": 2.656716417910448,
"grad_norm": 0.19012975692749023,
"learning_rate": 0.0002,
"loss": 0.5403158068656921,
"mean_token_accuracy": 0.780579537153244,
"num_tokens": 11634295.0,
"step": 712
},
{
"entropy": 0.5134764388203621,
"epoch": 2.66044776119403,
"grad_norm": 0.16630828380584717,
"learning_rate": 0.0002,
"loss": 0.5213350653648376,
"mean_token_accuracy": 0.7890530824661255,
"num_tokens": 11650834.0,
"step": 713
},
{
"entropy": 0.4917012006044388,
"epoch": 2.664179104477612,
"grad_norm": 0.1683693677186966,
"learning_rate": 0.0002,
"loss": 0.49927788972854614,
"mean_token_accuracy": 0.797902062535286,
"num_tokens": 11667060.0,
"step": 714
},
{
"entropy": 0.5247212499380112,
"epoch": 2.667910447761194,
"grad_norm": 0.17371122539043427,
"learning_rate": 0.0002,
"loss": 0.5344932079315186,
"mean_token_accuracy": 0.783098891377449,
"num_tokens": 11683574.0,
"step": 715
},
{
"entropy": 0.5191128477454185,
"epoch": 2.671641791044776,
"grad_norm": 0.16527095437049866,
"learning_rate": 0.0002,
"loss": 0.5183148384094238,
"mean_token_accuracy": 0.790424644947052,
"num_tokens": 11699720.0,
"step": 716
},
{
"entropy": 0.5185272991657257,
"epoch": 2.675373134328358,
"grad_norm": 0.16154323518276215,
"learning_rate": 0.0002,
"loss": 0.5092360973358154,
"mean_token_accuracy": 0.7955475896596909,
"num_tokens": 11716469.0,
"step": 717
},
{
"entropy": 0.5372938513755798,
"epoch": 2.6791044776119404,
"grad_norm": 0.15932703018188477,
"learning_rate": 0.0002,
"loss": 0.5302359461784363,
"mean_token_accuracy": 0.786151722073555,
"num_tokens": 11732748.0,
"step": 718
},
{
"entropy": 0.5596635788679123,
"epoch": 2.6828358208955225,
"grad_norm": 0.18202805519104004,
"learning_rate": 0.0002,
"loss": 0.5571697950363159,
"mean_token_accuracy": 0.7754980325698853,
"num_tokens": 11749150.0,
"step": 719
},
{
"entropy": 0.5210409909486771,
"epoch": 2.6865671641791042,
"grad_norm": 0.1875341236591339,
"learning_rate": 0.0002,
"loss": 0.5226970314979553,
"mean_token_accuracy": 0.7895162850618362,
"num_tokens": 11765442.0,
"step": 720
},
{
"entropy": 0.528057724237442,
"epoch": 2.6902985074626864,
"grad_norm": 0.16192083060741425,
"learning_rate": 0.0002,
"loss": 0.5281423330307007,
"mean_token_accuracy": 0.788543164730072,
"num_tokens": 11781875.0,
"step": 721
},
{
"entropy": 0.5093352198600769,
"epoch": 2.6940298507462686,
"grad_norm": 0.15824586153030396,
"learning_rate": 0.0002,
"loss": 0.5047670602798462,
"mean_token_accuracy": 0.7923571020364761,
"num_tokens": 11798168.0,
"step": 722
},
{
"entropy": 0.5319179147481918,
"epoch": 2.6977611940298507,
"grad_norm": 0.1545802354812622,
"learning_rate": 0.0002,
"loss": 0.5334397554397583,
"mean_token_accuracy": 0.7845843136310577,
"num_tokens": 11814632.0,
"step": 723
},
{
"entropy": 0.5133816972374916,
"epoch": 2.701492537313433,
"grad_norm": 0.16241911053657532,
"learning_rate": 0.0002,
"loss": 0.51878821849823,
"mean_token_accuracy": 0.7933190315961838,
"num_tokens": 11831088.0,
"step": 724
},
{
"entropy": 0.5164139419794083,
"epoch": 2.705223880597015,
"grad_norm": 0.14982916414737701,
"learning_rate": 0.0002,
"loss": 0.5140745639801025,
"mean_token_accuracy": 0.7934172451496124,
"num_tokens": 11847470.0,
"step": 725
},
{
"entropy": 0.521071195602417,
"epoch": 2.708955223880597,
"grad_norm": 0.17015258967876434,
"learning_rate": 0.0002,
"loss": 0.5232289433479309,
"mean_token_accuracy": 0.7887244522571564,
"num_tokens": 11863757.0,
"step": 726
},
{
"entropy": 0.5184628516435623,
"epoch": 2.7126865671641793,
"grad_norm": 0.1840510219335556,
"learning_rate": 0.0002,
"loss": 0.5194827318191528,
"mean_token_accuracy": 0.7879429012537003,
"num_tokens": 11880261.0,
"step": 727
},
{
"entropy": 0.5139294788241386,
"epoch": 2.716417910447761,
"grad_norm": 0.19588088989257812,
"learning_rate": 0.0002,
"loss": 0.5200832486152649,
"mean_token_accuracy": 0.7899386137723923,
"num_tokens": 11896585.0,
"step": 728
},
{
"entropy": 0.5239543169736862,
"epoch": 2.720149253731343,
"grad_norm": 0.20819295942783356,
"learning_rate": 0.0002,
"loss": 0.5261701345443726,
"mean_token_accuracy": 0.7911202013492584,
"num_tokens": 11912923.0,
"step": 729
},
{
"entropy": 0.5407283902168274,
"epoch": 2.7238805970149254,
"grad_norm": 0.17276515066623688,
"learning_rate": 0.0002,
"loss": 0.5370129942893982,
"mean_token_accuracy": 0.7848152667284012,
"num_tokens": 11929303.0,
"step": 730
},
{
"entropy": 0.542425274848938,
"epoch": 2.7276119402985075,
"grad_norm": 0.25132983922958374,
"learning_rate": 0.0002,
"loss": 0.5359519720077515,
"mean_token_accuracy": 0.7846331894397736,
"num_tokens": 11945440.0,
"step": 731
},
{
"entropy": 0.5357621908187866,
"epoch": 2.7313432835820897,
"grad_norm": 0.222070574760437,
"learning_rate": 0.0002,
"loss": 0.5348407626152039,
"mean_token_accuracy": 0.7818550020456314,
"num_tokens": 11961949.0,
"step": 732
},
{
"entropy": 0.5185696631669998,
"epoch": 2.7350746268656714,
"grad_norm": 0.19711528718471527,
"learning_rate": 0.0002,
"loss": 0.5264403223991394,
"mean_token_accuracy": 0.7884511202573776,
"num_tokens": 11978063.0,
"step": 733
},
{
"entropy": 0.516778826713562,
"epoch": 2.7388059701492535,
"grad_norm": 0.24369676411151886,
"learning_rate": 0.0002,
"loss": 0.5253380537033081,
"mean_token_accuracy": 0.7903653234243393,
"num_tokens": 11994278.0,
"step": 734
},
{
"entropy": 0.5164884254336357,
"epoch": 2.7425373134328357,
"grad_norm": 0.18417784571647644,
"learning_rate": 0.0002,
"loss": 0.5214477181434631,
"mean_token_accuracy": 0.789106622338295,
"num_tokens": 12010558.0,
"step": 735
},
{
"entropy": 0.5068091601133347,
"epoch": 2.746268656716418,
"grad_norm": 0.21942751109600067,
"learning_rate": 0.0002,
"loss": 0.513481855392456,
"mean_token_accuracy": 0.7899149656295776,
"num_tokens": 12026889.0,
"step": 736
},
{
"entropy": 0.5316798090934753,
"epoch": 2.75,
"grad_norm": 0.1581851989030838,
"learning_rate": 0.0002,
"loss": 0.5230653285980225,
"mean_token_accuracy": 0.7884569317102432,
"num_tokens": 12043341.0,
"step": 737
},
{
"entropy": 0.539380818605423,
"epoch": 2.753731343283582,
"grad_norm": 0.1578167974948883,
"learning_rate": 0.0002,
"loss": 0.5292148590087891,
"mean_token_accuracy": 0.7852563858032227,
"num_tokens": 12059848.0,
"step": 738
},
{
"entropy": 0.5343874096870422,
"epoch": 2.7574626865671643,
"grad_norm": 0.19632823765277863,
"learning_rate": 0.0002,
"loss": 0.5295359492301941,
"mean_token_accuracy": 0.783517986536026,
"num_tokens": 12076134.0,
"step": 739
},
{
"entropy": 0.5188475027680397,
"epoch": 2.7611940298507465,
"grad_norm": 0.16950450837612152,
"learning_rate": 0.0002,
"loss": 0.521928071975708,
"mean_token_accuracy": 0.7883510291576385,
"num_tokens": 12092406.0,
"step": 740
},
{
"entropy": 0.5121756568551064,
"epoch": 2.7649253731343286,
"grad_norm": 0.20061862468719482,
"learning_rate": 0.0002,
"loss": 0.5192751884460449,
"mean_token_accuracy": 0.7898274064064026,
"num_tokens": 12108773.0,
"step": 741
},
{
"entropy": 0.5244594514369965,
"epoch": 2.7686567164179103,
"grad_norm": 0.16218306124210358,
"learning_rate": 0.0002,
"loss": 0.5296685695648193,
"mean_token_accuracy": 0.7826414853334427,
"num_tokens": 12125082.0,
"step": 742
},
{
"entropy": 0.5244700759649277,
"epoch": 2.7723880597014925,
"grad_norm": 0.19114060699939728,
"learning_rate": 0.0002,
"loss": 0.5232917070388794,
"mean_token_accuracy": 0.7893050163984299,
"num_tokens": 12141570.0,
"step": 743
},
{
"entropy": 0.5299672707915306,
"epoch": 2.7761194029850746,
"grad_norm": 0.15443415939807892,
"learning_rate": 0.0002,
"loss": 0.5207250714302063,
"mean_token_accuracy": 0.7905602306127548,
"num_tokens": 12157874.0,
"step": 744
},
{
"entropy": 0.5345348864793777,
"epoch": 2.779850746268657,
"grad_norm": 0.1817025989294052,
"learning_rate": 0.0002,
"loss": 0.5311155319213867,
"mean_token_accuracy": 0.785017192363739,
"num_tokens": 12174053.0,
"step": 745
},
{
"entropy": 0.5195724815130234,
"epoch": 2.783582089552239,
"grad_norm": 0.157354474067688,
"learning_rate": 0.0002,
"loss": 0.5159887075424194,
"mean_token_accuracy": 0.790684700012207,
"num_tokens": 12190613.0,
"step": 746
},
{
"entropy": 0.5138278231024742,
"epoch": 2.7873134328358207,
"grad_norm": 0.16088353097438812,
"learning_rate": 0.0002,
"loss": 0.5184983611106873,
"mean_token_accuracy": 0.7899224907159805,
"num_tokens": 12206928.0,
"step": 747
},
{
"entropy": 0.5161465555429459,
"epoch": 2.791044776119403,
"grad_norm": 0.2099459022283554,
"learning_rate": 0.0002,
"loss": 0.5232690572738647,
"mean_token_accuracy": 0.7870688289403915,
"num_tokens": 12223267.0,
"step": 748
},
{
"entropy": 0.5158911049365997,
"epoch": 2.794776119402985,
"grad_norm": 0.15817788243293762,
"learning_rate": 0.0002,
"loss": 0.5168994665145874,
"mean_token_accuracy": 0.7899310439825058,
"num_tokens": 12239601.0,
"step": 749
},
{
"entropy": 0.5070392489433289,
"epoch": 2.798507462686567,
"grad_norm": 0.2228090614080429,
"learning_rate": 0.0002,
"loss": 0.5200591087341309,
"mean_token_accuracy": 0.7891372889280319,
"num_tokens": 12256032.0,
"step": 750
},
{
"entropy": 0.5438189208507538,
"epoch": 2.8022388059701493,
"grad_norm": 0.1719558835029602,
"learning_rate": 0.0002,
"loss": 0.5426724553108215,
"mean_token_accuracy": 0.7774887979030609,
"num_tokens": 12272514.0,
"step": 751
},
{
"entropy": 0.519834965467453,
"epoch": 2.8059701492537314,
"grad_norm": 0.18933889269828796,
"learning_rate": 0.0002,
"loss": 0.523102343082428,
"mean_token_accuracy": 0.7904316037893295,
"num_tokens": 12288877.0,
"step": 752
},
{
"entropy": 0.512350045144558,
"epoch": 2.8097014925373136,
"grad_norm": 0.1864548623561859,
"learning_rate": 0.0002,
"loss": 0.5090078115463257,
"mean_token_accuracy": 0.7945949882268906,
"num_tokens": 12305044.0,
"step": 753
},
{
"entropy": 0.5358164459466934,
"epoch": 2.8134328358208958,
"grad_norm": 0.17895784974098206,
"learning_rate": 0.0002,
"loss": 0.5349195003509521,
"mean_token_accuracy": 0.7841221541166306,
"num_tokens": 12321579.0,
"step": 754
},
{
"entropy": 0.5124004110693932,
"epoch": 2.8171641791044775,
"grad_norm": 0.17669007182121277,
"learning_rate": 0.0002,
"loss": 0.5126450657844543,
"mean_token_accuracy": 0.7929520756006241,
"num_tokens": 12338186.0,
"step": 755
},
{
"entropy": 0.5246561616659164,
"epoch": 2.8208955223880596,
"grad_norm": 0.19795700907707214,
"learning_rate": 0.0002,
"loss": 0.5288596153259277,
"mean_token_accuracy": 0.7869751006364822,
"num_tokens": 12354327.0,
"step": 756
},
{
"entropy": 0.5311583876609802,
"epoch": 2.824626865671642,
"grad_norm": 0.18146470189094543,
"learning_rate": 0.0002,
"loss": 0.5294592976570129,
"mean_token_accuracy": 0.7862387895584106,
"num_tokens": 12370923.0,
"step": 757
},
{
"entropy": 0.5319194048643112,
"epoch": 2.828358208955224,
"grad_norm": 0.19238857924938202,
"learning_rate": 0.0002,
"loss": 0.5317291617393494,
"mean_token_accuracy": 0.7854786366224289,
"num_tokens": 12387257.0,
"step": 758
},
{
"entropy": 0.526064857840538,
"epoch": 2.832089552238806,
"grad_norm": 0.1526212990283966,
"learning_rate": 0.0002,
"loss": 0.5222187042236328,
"mean_token_accuracy": 0.7932349592447281,
"num_tokens": 12403635.0,
"step": 759
},
{
"entropy": 0.5247229933738708,
"epoch": 2.835820895522388,
"grad_norm": 0.2871471047401428,
"learning_rate": 0.0002,
"loss": 0.5314409136772156,
"mean_token_accuracy": 0.7845473885536194,
"num_tokens": 12420097.0,
"step": 760
},
{
"entropy": 0.5259681046009064,
"epoch": 2.83955223880597,
"grad_norm": 0.1705760359764099,
"learning_rate": 0.0002,
"loss": 0.5313333868980408,
"mean_token_accuracy": 0.787728413939476,
"num_tokens": 12436382.0,
"step": 761
},
{
"entropy": 0.5318069308996201,
"epoch": 2.843283582089552,
"grad_norm": 0.20162752270698547,
"learning_rate": 0.0002,
"loss": 0.5359828472137451,
"mean_token_accuracy": 0.7834303081035614,
"num_tokens": 12452497.0,
"step": 762
},
{
"entropy": 0.5508353263139725,
"epoch": 2.8470149253731343,
"grad_norm": 0.161021888256073,
"learning_rate": 0.0002,
"loss": 0.5432707667350769,
"mean_token_accuracy": 0.7808051556348801,
"num_tokens": 12468969.0,
"step": 763
},
{
"entropy": 0.5287757962942123,
"epoch": 2.8507462686567164,
"grad_norm": 0.2050207257270813,
"learning_rate": 0.0002,
"loss": 0.5284628868103027,
"mean_token_accuracy": 0.7843392193317413,
"num_tokens": 12485354.0,
"step": 764
},
{
"entropy": 0.5344215333461761,
"epoch": 2.8544776119402986,
"grad_norm": 0.1695808321237564,
"learning_rate": 0.0002,
"loss": 0.535874605178833,
"mean_token_accuracy": 0.782726377248764,
"num_tokens": 12501936.0,
"step": 765
},
{
"entropy": 0.522572860121727,
"epoch": 2.8582089552238807,
"grad_norm": 0.19520296156406403,
"learning_rate": 0.0002,
"loss": 0.5247471332550049,
"mean_token_accuracy": 0.7886104881763458,
"num_tokens": 12518330.0,
"step": 766
},
{
"entropy": 0.5314962714910507,
"epoch": 2.861940298507463,
"grad_norm": 0.17423976957798004,
"learning_rate": 0.0002,
"loss": 0.5297841429710388,
"mean_token_accuracy": 0.7862118780612946,
"num_tokens": 12534665.0,
"step": 767
},
{
"entropy": 0.5281147062778473,
"epoch": 2.8656716417910446,
"grad_norm": 0.18605203926563263,
"learning_rate": 0.0002,
"loss": 0.5324077606201172,
"mean_token_accuracy": 0.787416860461235,
"num_tokens": 12551009.0,
"step": 768
},
{
"entropy": 0.5187551081180573,
"epoch": 2.8694029850746268,
"grad_norm": 0.1616411954164505,
"learning_rate": 0.0002,
"loss": 0.512826144695282,
"mean_token_accuracy": 0.7936854958534241,
"num_tokens": 12567387.0,
"step": 769
},
{
"entropy": 0.5136809647083282,
"epoch": 2.873134328358209,
"grad_norm": 0.17406195402145386,
"learning_rate": 0.0002,
"loss": 0.5155330300331116,
"mean_token_accuracy": 0.7908283174037933,
"num_tokens": 12583985.0,
"step": 770
},
{
"entropy": 0.5185445547103882,
"epoch": 2.876865671641791,
"grad_norm": 0.1833800971508026,
"learning_rate": 0.0002,
"loss": 0.5192956328392029,
"mean_token_accuracy": 0.7888920605182648,
"num_tokens": 12600193.0,
"step": 771
},
{
"entropy": 0.5310780256986618,
"epoch": 2.8805970149253732,
"grad_norm": 0.2313033938407898,
"learning_rate": 0.0002,
"loss": 0.5360459685325623,
"mean_token_accuracy": 0.7845909744501114,
"num_tokens": 12616559.0,
"step": 772
},
{
"entropy": 0.5207322463393211,
"epoch": 2.8843283582089554,
"grad_norm": 0.15698477625846863,
"learning_rate": 0.0002,
"loss": 0.514286994934082,
"mean_token_accuracy": 0.789328083395958,
"num_tokens": 12633075.0,
"step": 773
},
{
"entropy": 0.5343746095895767,
"epoch": 2.888059701492537,
"grad_norm": 0.21191926300525665,
"learning_rate": 0.0002,
"loss": 0.5408198833465576,
"mean_token_accuracy": 0.7812719643115997,
"num_tokens": 12649414.0,
"step": 774
},
{
"entropy": 0.5095183849334717,
"epoch": 2.8917910447761193,
"grad_norm": 0.1665944755077362,
"learning_rate": 0.0002,
"loss": 0.5066861510276794,
"mean_token_accuracy": 0.7972470223903656,
"num_tokens": 12665839.0,
"step": 775
},
{
"entropy": 0.5341623723506927,
"epoch": 2.8955223880597014,
"grad_norm": 0.19015316665172577,
"learning_rate": 0.0002,
"loss": 0.5310372114181519,
"mean_token_accuracy": 0.7861314713954926,
"num_tokens": 12682165.0,
"step": 776
},
{
"entropy": 0.5464024096727371,
"epoch": 2.8992537313432836,
"grad_norm": 0.19810722768306732,
"learning_rate": 0.0002,
"loss": 0.5392264723777771,
"mean_token_accuracy": 0.7843339294195175,
"num_tokens": 12698576.0,
"step": 777
},
{
"entropy": 0.5175042897462845,
"epoch": 2.9029850746268657,
"grad_norm": 0.16263291239738464,
"learning_rate": 0.0002,
"loss": 0.5172262191772461,
"mean_token_accuracy": 0.792850524187088,
"num_tokens": 12714766.0,
"step": 778
},
{
"entropy": 0.5199488997459412,
"epoch": 2.906716417910448,
"grad_norm": 0.2083202749490738,
"learning_rate": 0.0002,
"loss": 0.5252541899681091,
"mean_token_accuracy": 0.7852817475795746,
"num_tokens": 12731205.0,
"step": 779
},
{
"entropy": 0.4941527247428894,
"epoch": 2.91044776119403,
"grad_norm": 0.17050482332706451,
"learning_rate": 0.0002,
"loss": 0.4989524185657501,
"mean_token_accuracy": 0.7972326874732971,
"num_tokens": 12747594.0,
"step": 780
},
{
"entropy": 0.5078647658228874,
"epoch": 2.914179104477612,
"grad_norm": 0.23199598491191864,
"learning_rate": 0.0002,
"loss": 0.5211161971092224,
"mean_token_accuracy": 0.7884382009506226,
"num_tokens": 12763932.0,
"step": 781
},
{
"entropy": 0.5114319175481796,
"epoch": 2.917910447761194,
"grad_norm": 0.2023877501487732,
"learning_rate": 0.0002,
"loss": 0.5166995525360107,
"mean_token_accuracy": 0.7941331118345261,
"num_tokens": 12780023.0,
"step": 782
},
{
"entropy": 0.5287023633718491,
"epoch": 2.921641791044776,
"grad_norm": 0.21876347064971924,
"learning_rate": 0.0002,
"loss": 0.5263211727142334,
"mean_token_accuracy": 0.7864357531070709,
"num_tokens": 12796441.0,
"step": 783
},
{
"entropy": 0.5223046839237213,
"epoch": 2.925373134328358,
"grad_norm": 0.14650550484657288,
"learning_rate": 0.0002,
"loss": 0.5140995979309082,
"mean_token_accuracy": 0.7916091233491898,
"num_tokens": 12812793.0,
"step": 784
},
{
"entropy": 0.5247595310211182,
"epoch": 2.9291044776119404,
"grad_norm": 0.25079336762428284,
"learning_rate": 0.0002,
"loss": 0.5263584852218628,
"mean_token_accuracy": 0.786608412861824,
"num_tokens": 12829172.0,
"step": 785
},
{
"entropy": 0.5266484171152115,
"epoch": 2.9328358208955225,
"grad_norm": 0.16101892292499542,
"learning_rate": 0.0002,
"loss": 0.5220364928245544,
"mean_token_accuracy": 0.7872611582279205,
"num_tokens": 12845573.0,
"step": 786
},
{
"entropy": 0.5139588639140129,
"epoch": 2.9365671641791042,
"grad_norm": 0.21128332614898682,
"learning_rate": 0.0002,
"loss": 0.5196605920791626,
"mean_token_accuracy": 0.7880596816539764,
"num_tokens": 12861897.0,
"step": 787
},
{
"entropy": 0.5052976161241531,
"epoch": 2.9402985074626864,
"grad_norm": 0.1861787587404251,
"learning_rate": 0.0002,
"loss": 0.5119534134864807,
"mean_token_accuracy": 0.7939311414957047,
"num_tokens": 12878193.0,
"step": 788
},
{
"entropy": 0.5310614109039307,
"epoch": 2.9440298507462686,
"grad_norm": 0.1857159435749054,
"learning_rate": 0.0002,
"loss": 0.5301690101623535,
"mean_token_accuracy": 0.786168098449707,
"num_tokens": 12894935.0,
"step": 789
},
{
"entropy": 0.5310661867260933,
"epoch": 2.9477611940298507,
"grad_norm": 0.18339301645755768,
"learning_rate": 0.0002,
"loss": 0.5257419347763062,
"mean_token_accuracy": 0.788611650466919,
"num_tokens": 12911289.0,
"step": 790
},
{
"entropy": 0.5245337337255478,
"epoch": 2.951492537313433,
"grad_norm": 0.17652840912342072,
"learning_rate": 0.0002,
"loss": 0.5265839099884033,
"mean_token_accuracy": 0.7901091575622559,
"num_tokens": 12927670.0,
"step": 791
},
{
"entropy": 0.5245234072208405,
"epoch": 2.955223880597015,
"grad_norm": 0.17611214518547058,
"learning_rate": 0.0002,
"loss": 0.5243083834648132,
"mean_token_accuracy": 0.7856577485799789,
"num_tokens": 12944015.0,
"step": 792
},
{
"entropy": 0.5191880911588669,
"epoch": 2.958955223880597,
"grad_norm": 0.18345631659030914,
"learning_rate": 0.0002,
"loss": 0.5257253050804138,
"mean_token_accuracy": 0.7881710231304169,
"num_tokens": 12960131.0,
"step": 793
},
{
"entropy": 0.5140431523323059,
"epoch": 2.9626865671641793,
"grad_norm": 0.2098158448934555,
"learning_rate": 0.0002,
"loss": 0.5169271230697632,
"mean_token_accuracy": 0.786968320608139,
"num_tokens": 12976187.0,
"step": 794
},
{
"entropy": 0.5335211008787155,
"epoch": 2.966417910447761,
"grad_norm": 0.15838965773582458,
"learning_rate": 0.0002,
"loss": 0.5324181318283081,
"mean_token_accuracy": 0.7819865345954895,
"num_tokens": 12992461.0,
"step": 795
},
{
"entropy": 0.5252291113138199,
"epoch": 2.970149253731343,
"grad_norm": 0.19166119396686554,
"learning_rate": 0.0002,
"loss": 0.5205749869346619,
"mean_token_accuracy": 0.7911773473024368,
"num_tokens": 13008737.0,
"step": 796
},
{
"entropy": 0.5154759585857391,
"epoch": 2.9738805970149254,
"grad_norm": 0.16444922983646393,
"learning_rate": 0.0002,
"loss": 0.5141779184341431,
"mean_token_accuracy": 0.7922156006097794,
"num_tokens": 13025092.0,
"step": 797
},
{
"entropy": 0.5257436707615852,
"epoch": 2.9776119402985075,
"grad_norm": 0.19890975952148438,
"learning_rate": 0.0002,
"loss": 0.5353443622589111,
"mean_token_accuracy": 0.7844508290290833,
"num_tokens": 13041631.0,
"step": 798
},
{
"entropy": 0.5554878115653992,
"epoch": 2.9813432835820897,
"grad_norm": 0.19347697496414185,
"learning_rate": 0.0002,
"loss": 0.5568645596504211,
"mean_token_accuracy": 0.7741395682096481,
"num_tokens": 13058045.0,
"step": 799
},
{
"entropy": 0.5262391567230225,
"epoch": 2.9850746268656714,
"grad_norm": 0.17874093353748322,
"learning_rate": 0.0002,
"loss": 0.5202043056488037,
"mean_token_accuracy": 0.7870875149965286,
"num_tokens": 13074443.0,
"step": 800
},
{
"entropy": 0.5318054854869843,
"epoch": 2.9888059701492535,
"grad_norm": 0.182646706700325,
"learning_rate": 0.0002,
"loss": 0.5253685712814331,
"mean_token_accuracy": 0.786090537905693,
"num_tokens": 13090582.0,
"step": 801
},
{
"entropy": 0.5484406352043152,
"epoch": 2.9925373134328357,
"grad_norm": 0.15745747089385986,
"learning_rate": 0.0002,
"loss": 0.5452413558959961,
"mean_token_accuracy": 0.7798783183097839,
"num_tokens": 13106832.0,
"step": 802
},
{
"entropy": 0.527185246348381,
"epoch": 2.996268656716418,
"grad_norm": 0.1789730340242386,
"learning_rate": 0.0002,
"loss": 0.5218254923820496,
"mean_token_accuracy": 0.7895842045545578,
"num_tokens": 13123002.0,
"step": 803
},
{
"entropy": 0.5108470022678375,
"epoch": 3.0,
"grad_norm": 0.1871774047613144,
"learning_rate": 0.0002,
"loss": 0.5190352201461792,
"mean_token_accuracy": 0.7890540361404419,
"num_tokens": 13139156.0,
"step": 804
}
],
"logging_steps": 1,
"max_steps": 804,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2242940510926275e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}