{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 804, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.1324340403079987, "epoch": 0.0037313432835820895, "grad_norm": 1.6067556142807007, "learning_rate": 0.0002, "loss": 2.4804701805114746, "mean_token_accuracy": 0.5353229343891144, "num_tokens": 16370.0, "step": 1 }, { "entropy": 1.2276706099510193, "epoch": 0.007462686567164179, "grad_norm": 1.4987447261810303, "learning_rate": 0.0002, "loss": 2.135417938232422, "mean_token_accuracy": 0.5693617165088654, "num_tokens": 33043.0, "step": 2 }, { "entropy": 1.4045527577400208, "epoch": 0.011194029850746268, "grad_norm": 1.1359604597091675, "learning_rate": 0.0002, "loss": 1.72599196434021, "mean_token_accuracy": 0.5919849574565887, "num_tokens": 49458.0, "step": 3 }, { "entropy": 1.3863026201725006, "epoch": 0.014925373134328358, "grad_norm": 0.9200887084007263, "learning_rate": 0.0002, "loss": 1.4096770286560059, "mean_token_accuracy": 0.6369052678346634, "num_tokens": 65795.0, "step": 4 }, { "entropy": 1.331774890422821, "epoch": 0.018656716417910446, "grad_norm": 1.2737244367599487, "learning_rate": 0.0002, "loss": 1.2862391471862793, "mean_token_accuracy": 0.6422256380319595, "num_tokens": 82033.0, "step": 5 }, { "entropy": 1.2540993690490723, "epoch": 0.022388059701492536, "grad_norm": 0.6736201643943787, "learning_rate": 0.0002, "loss": 1.1756055355072021, "mean_token_accuracy": 0.6605449765920639, "num_tokens": 97997.0, "step": 6 }, { "entropy": 1.169641524553299, "epoch": 0.026119402985074626, "grad_norm": 0.3927549719810486, "learning_rate": 0.0002, "loss": 1.1019014120101929, "mean_token_accuracy": 0.6672378480434418, "num_tokens": 114186.0, "step": 7 }, { "entropy": 1.0887874066829681, "epoch": 0.029850746268656716, "grad_norm": 0.4364261329174042, "learning_rate": 0.0002, "loss": 1.0323972702026367, "mean_token_accuracy": 0.6782350987195969, "num_tokens": 130751.0, "step": 8 }, { "entropy": 1.0042430609464645, "epoch": 0.033582089552238806, "grad_norm": 0.5108282566070557, "learning_rate": 0.0002, "loss": 0.9582932591438293, "mean_token_accuracy": 0.692020371556282, "num_tokens": 147264.0, "step": 9 }, { "entropy": 0.9632741063833237, "epoch": 0.03731343283582089, "grad_norm": 0.4669722616672516, "learning_rate": 0.0002, "loss": 0.8919203877449036, "mean_token_accuracy": 0.7046539932489395, "num_tokens": 163507.0, "step": 10 }, { "entropy": 0.9305494576692581, "epoch": 0.041044776119402986, "grad_norm": 0.4794766902923584, "learning_rate": 0.0002, "loss": 0.8569780588150024, "mean_token_accuracy": 0.7103458344936371, "num_tokens": 179680.0, "step": 11 }, { "entropy": 0.8464002013206482, "epoch": 0.04477611940298507, "grad_norm": 0.396366685628891, "learning_rate": 0.0002, "loss": 0.7772667407989502, "mean_token_accuracy": 0.7248742878437042, "num_tokens": 196084.0, "step": 12 }, { "entropy": 0.8053079694509506, "epoch": 0.048507462686567165, "grad_norm": 3.4283485412597656, "learning_rate": 0.0002, "loss": 0.7701212763786316, "mean_token_accuracy": 0.7237996459007263, "num_tokens": 212421.0, "step": 13 }, { "entropy": 0.7701881229877472, "epoch": 0.05223880597014925, "grad_norm": 0.4621308147907257, "learning_rate": 0.0002, "loss": 0.7581663727760315, "mean_token_accuracy": 0.725386381149292, "num_tokens": 228835.0, "step": 14 }, { "entropy": 0.7058936208486557, "epoch": 0.055970149253731345, "grad_norm": 0.45394617319107056, "learning_rate": 0.0002, "loss": 0.7281949520111084, "mean_token_accuracy": 0.731869712471962, "num_tokens": 245106.0, "step": 15 }, { "entropy": 0.7007950246334076, "epoch": 0.05970149253731343, "grad_norm": 0.38048553466796875, "learning_rate": 0.0002, "loss": 0.6906558871269226, "mean_token_accuracy": 0.7422550022602081, "num_tokens": 261510.0, "step": 16 }, { "entropy": 0.6775622367858887, "epoch": 0.06343283582089553, "grad_norm": 0.3588451147079468, "learning_rate": 0.0002, "loss": 0.6660153865814209, "mean_token_accuracy": 0.7494668215513229, "num_tokens": 278002.0, "step": 17 }, { "entropy": 0.6844813376665115, "epoch": 0.06716417910447761, "grad_norm": 0.34310266375541687, "learning_rate": 0.0002, "loss": 0.6606006026268005, "mean_token_accuracy": 0.745672732591629, "num_tokens": 294482.0, "step": 18 }, { "entropy": 0.6752376109361649, "epoch": 0.0708955223880597, "grad_norm": 0.3563651740550995, "learning_rate": 0.0002, "loss": 0.6529812216758728, "mean_token_accuracy": 0.7467419356107712, "num_tokens": 310804.0, "step": 19 }, { "entropy": 0.655072346329689, "epoch": 0.07462686567164178, "grad_norm": 0.30358463525772095, "learning_rate": 0.0002, "loss": 0.6404100656509399, "mean_token_accuracy": 0.7505071759223938, "num_tokens": 327252.0, "step": 20 }, { "entropy": 0.6286358386278152, "epoch": 0.07835820895522388, "grad_norm": 0.30567091703414917, "learning_rate": 0.0002, "loss": 0.6207510232925415, "mean_token_accuracy": 0.7580177336931229, "num_tokens": 343737.0, "step": 21 }, { "entropy": 0.6086345314979553, "epoch": 0.08208955223880597, "grad_norm": 0.27747389674186707, "learning_rate": 0.0002, "loss": 0.6111672520637512, "mean_token_accuracy": 0.760840117931366, "num_tokens": 359961.0, "step": 22 }, { "entropy": 0.5925645977258682, "epoch": 0.08582089552238806, "grad_norm": 0.25484028458595276, "learning_rate": 0.0002, "loss": 0.5915433168411255, "mean_token_accuracy": 0.7686687558889389, "num_tokens": 376034.0, "step": 23 }, { "entropy": 0.6192648261785507, "epoch": 0.08955223880597014, "grad_norm": 0.2309548258781433, "learning_rate": 0.0002, "loss": 0.6154056787490845, "mean_token_accuracy": 0.7575328648090363, "num_tokens": 392454.0, "step": 24 }, { "entropy": 0.6046310663223267, "epoch": 0.09328358208955224, "grad_norm": 0.24919550120830536, "learning_rate": 0.0002, "loss": 0.5856317281723022, "mean_token_accuracy": 0.769055038690567, "num_tokens": 408673.0, "step": 25 }, { "entropy": 0.6073041707277298, "epoch": 0.09701492537313433, "grad_norm": 0.22897422313690186, "learning_rate": 0.0002, "loss": 0.6000080108642578, "mean_token_accuracy": 0.7657780200242996, "num_tokens": 425147.0, "step": 26 }, { "entropy": 0.5694791227579117, "epoch": 0.10074626865671642, "grad_norm": 0.26130226254463196, "learning_rate": 0.0002, "loss": 0.5651018619537354, "mean_token_accuracy": 0.7780718505382538, "num_tokens": 441676.0, "step": 27 }, { "entropy": 0.5705035477876663, "epoch": 0.1044776119402985, "grad_norm": 0.2569018304347992, "learning_rate": 0.0002, "loss": 0.5736910700798035, "mean_token_accuracy": 0.7736188471317291, "num_tokens": 457862.0, "step": 28 }, { "entropy": 0.5686106830835342, "epoch": 0.10820895522388059, "grad_norm": 0.24455995857715607, "learning_rate": 0.0002, "loss": 0.5789230465888977, "mean_token_accuracy": 0.7694863677024841, "num_tokens": 473929.0, "step": 29 }, { "entropy": 0.5674358904361725, "epoch": 0.11194029850746269, "grad_norm": 0.2457604557275772, "learning_rate": 0.0002, "loss": 0.581587553024292, "mean_token_accuracy": 0.7700542360544205, "num_tokens": 490261.0, "step": 30 }, { "entropy": 0.5924967974424362, "epoch": 0.11567164179104478, "grad_norm": 0.24704386293888092, "learning_rate": 0.0002, "loss": 0.5963209271430969, "mean_token_accuracy": 0.7627938687801361, "num_tokens": 506614.0, "step": 31 }, { "entropy": 0.5728770643472672, "epoch": 0.11940298507462686, "grad_norm": 0.24360406398773193, "learning_rate": 0.0002, "loss": 0.570555567741394, "mean_token_accuracy": 0.7713408023118973, "num_tokens": 523175.0, "step": 32 }, { "entropy": 0.5846883952617645, "epoch": 0.12313432835820895, "grad_norm": 0.20197518169879913, "learning_rate": 0.0002, "loss": 0.5723189115524292, "mean_token_accuracy": 0.7742884606122971, "num_tokens": 539383.0, "step": 33 }, { "entropy": 0.5598815232515335, "epoch": 0.12686567164179105, "grad_norm": 0.25282159447669983, "learning_rate": 0.0002, "loss": 0.5645520687103271, "mean_token_accuracy": 0.7759677618741989, "num_tokens": 555484.0, "step": 34 }, { "entropy": 0.5746279805898666, "epoch": 0.13059701492537312, "grad_norm": 0.20525087416172028, "learning_rate": 0.0002, "loss": 0.5774482488632202, "mean_token_accuracy": 0.7711690366268158, "num_tokens": 572050.0, "step": 35 }, { "entropy": 0.5689367800951004, "epoch": 0.13432835820895522, "grad_norm": 0.2016289383172989, "learning_rate": 0.0002, "loss": 0.5688468217849731, "mean_token_accuracy": 0.7752531915903091, "num_tokens": 588229.0, "step": 36 }, { "entropy": 0.5673371106386185, "epoch": 0.13805970149253732, "grad_norm": 0.20251700282096863, "learning_rate": 0.0002, "loss": 0.5676092505455017, "mean_token_accuracy": 0.7740599513053894, "num_tokens": 604842.0, "step": 37 }, { "entropy": 0.5538036525249481, "epoch": 0.1417910447761194, "grad_norm": 0.18855363130569458, "learning_rate": 0.0002, "loss": 0.5636182427406311, "mean_token_accuracy": 0.7732492536306381, "num_tokens": 621334.0, "step": 38 }, { "entropy": 0.5772293359041214, "epoch": 0.1455223880597015, "grad_norm": 0.1829119771718979, "learning_rate": 0.0002, "loss": 0.5749870538711548, "mean_token_accuracy": 0.7699291855096817, "num_tokens": 637861.0, "step": 39 }, { "entropy": 0.5583464652299881, "epoch": 0.14925373134328357, "grad_norm": 0.16470657289028168, "learning_rate": 0.0002, "loss": 0.5537322163581848, "mean_token_accuracy": 0.7790806740522385, "num_tokens": 653894.0, "step": 40 }, { "entropy": 0.5681058615446091, "epoch": 0.15298507462686567, "grad_norm": 0.17573200166225433, "learning_rate": 0.0002, "loss": 0.5643278360366821, "mean_token_accuracy": 0.7733141183853149, "num_tokens": 670015.0, "step": 41 }, { "entropy": 0.566686749458313, "epoch": 0.15671641791044777, "grad_norm": 0.16218754649162292, "learning_rate": 0.0002, "loss": 0.5597659945487976, "mean_token_accuracy": 0.7758253067731857, "num_tokens": 686056.0, "step": 42 }, { "entropy": 0.5558898448944092, "epoch": 0.16044776119402984, "grad_norm": 0.18278591334819794, "learning_rate": 0.0002, "loss": 0.558386504650116, "mean_token_accuracy": 0.7759624123573303, "num_tokens": 702659.0, "step": 43 }, { "entropy": 0.5585661381483078, "epoch": 0.16417910447761194, "grad_norm": 0.17696230113506317, "learning_rate": 0.0002, "loss": 0.5635029673576355, "mean_token_accuracy": 0.7751695066690445, "num_tokens": 718850.0, "step": 44 }, { "entropy": 0.5506571680307388, "epoch": 0.16791044776119404, "grad_norm": 0.1652524471282959, "learning_rate": 0.0002, "loss": 0.5565558671951294, "mean_token_accuracy": 0.7778312116861343, "num_tokens": 735246.0, "step": 45 }, { "entropy": 0.5514795780181885, "epoch": 0.17164179104477612, "grad_norm": 0.18487824499607086, "learning_rate": 0.0002, "loss": 0.5487773418426514, "mean_token_accuracy": 0.7793762385845184, "num_tokens": 751565.0, "step": 46 }, { "entropy": 0.5588273853063583, "epoch": 0.17537313432835822, "grad_norm": 0.19246406853199005, "learning_rate": 0.0002, "loss": 0.5596141219139099, "mean_token_accuracy": 0.7778225541114807, "num_tokens": 767932.0, "step": 47 }, { "entropy": 0.5591737627983093, "epoch": 0.1791044776119403, "grad_norm": 0.15891006588935852, "learning_rate": 0.0002, "loss": 0.5638841390609741, "mean_token_accuracy": 0.7727467268705368, "num_tokens": 784014.0, "step": 48 }, { "entropy": 0.5501811355352402, "epoch": 0.1828358208955224, "grad_norm": 0.16706983745098114, "learning_rate": 0.0002, "loss": 0.5501376986503601, "mean_token_accuracy": 0.7761423140764236, "num_tokens": 800374.0, "step": 49 }, { "entropy": 0.5606948286294937, "epoch": 0.1865671641791045, "grad_norm": 0.17230357229709625, "learning_rate": 0.0002, "loss": 0.5634580850601196, "mean_token_accuracy": 0.7727725654840469, "num_tokens": 816520.0, "step": 50 }, { "entropy": 0.5541675686836243, "epoch": 0.19029850746268656, "grad_norm": 0.1744348555803299, "learning_rate": 0.0002, "loss": 0.5664834380149841, "mean_token_accuracy": 0.7722806632518768, "num_tokens": 832574.0, "step": 51 }, { "entropy": 0.5447754859924316, "epoch": 0.19402985074626866, "grad_norm": 0.1993291825056076, "learning_rate": 0.0002, "loss": 0.5500599145889282, "mean_token_accuracy": 0.7812339067459106, "num_tokens": 848524.0, "step": 52 }, { "entropy": 0.55513696372509, "epoch": 0.19776119402985073, "grad_norm": 0.18667836487293243, "learning_rate": 0.0002, "loss": 0.5566352605819702, "mean_token_accuracy": 0.7776180505752563, "num_tokens": 864701.0, "step": 53 }, { "entropy": 0.5591137707233429, "epoch": 0.20149253731343283, "grad_norm": 0.1556427925825119, "learning_rate": 0.0002, "loss": 0.5615472197532654, "mean_token_accuracy": 0.7761439085006714, "num_tokens": 881019.0, "step": 54 }, { "entropy": 0.5678103417158127, "epoch": 0.20522388059701493, "grad_norm": 0.176001638174057, "learning_rate": 0.0002, "loss": 0.5604614615440369, "mean_token_accuracy": 0.7737350314855576, "num_tokens": 897731.0, "step": 55 }, { "entropy": 0.5736003369092941, "epoch": 0.208955223880597, "grad_norm": 0.17963656783103943, "learning_rate": 0.0002, "loss": 0.5741879940032959, "mean_token_accuracy": 0.7709980905056, "num_tokens": 914031.0, "step": 56 }, { "entropy": 0.5704395622014999, "epoch": 0.2126865671641791, "grad_norm": 0.15910783410072327, "learning_rate": 0.0002, "loss": 0.571160078048706, "mean_token_accuracy": 0.7722027599811554, "num_tokens": 930606.0, "step": 57 }, { "entropy": 0.5746669173240662, "epoch": 0.21641791044776118, "grad_norm": 0.15874247252941132, "learning_rate": 0.0002, "loss": 0.5674406886100769, "mean_token_accuracy": 0.7708650529384613, "num_tokens": 947244.0, "step": 58 }, { "entropy": 0.5582200437784195, "epoch": 0.22014925373134328, "grad_norm": 0.16829723119735718, "learning_rate": 0.0002, "loss": 0.5581406950950623, "mean_token_accuracy": 0.7757681459188461, "num_tokens": 963619.0, "step": 59 }, { "entropy": 0.5504408478736877, "epoch": 0.22388059701492538, "grad_norm": 0.14540037512779236, "learning_rate": 0.0002, "loss": 0.5557159781455994, "mean_token_accuracy": 0.776930645108223, "num_tokens": 980040.0, "step": 60 }, { "entropy": 0.5402641594409943, "epoch": 0.22761194029850745, "grad_norm": 0.14897902309894562, "learning_rate": 0.0002, "loss": 0.5523658394813538, "mean_token_accuracy": 0.7773705869913101, "num_tokens": 996383.0, "step": 61 }, { "entropy": 0.5391396135091782, "epoch": 0.23134328358208955, "grad_norm": 0.16873425245285034, "learning_rate": 0.0002, "loss": 0.5509910583496094, "mean_token_accuracy": 0.7777218073606491, "num_tokens": 1012664.0, "step": 62 }, { "entropy": 0.5582114011049271, "epoch": 0.23507462686567165, "grad_norm": 0.1502108871936798, "learning_rate": 0.0002, "loss": 0.5559942126274109, "mean_token_accuracy": 0.7745993584394455, "num_tokens": 1029022.0, "step": 63 }, { "entropy": 0.5812249481678009, "epoch": 0.23880597014925373, "grad_norm": 0.13852274417877197, "learning_rate": 0.0002, "loss": 0.5768259167671204, "mean_token_accuracy": 0.766035184264183, "num_tokens": 1045337.0, "step": 64 }, { "entropy": 0.555647611618042, "epoch": 0.24253731343283583, "grad_norm": 0.1643349826335907, "learning_rate": 0.0002, "loss": 0.5524765849113464, "mean_token_accuracy": 0.7790125608444214, "num_tokens": 1061843.0, "step": 65 }, { "entropy": 0.5712831914424896, "epoch": 0.2462686567164179, "grad_norm": 0.1458103060722351, "learning_rate": 0.0002, "loss": 0.5671954154968262, "mean_token_accuracy": 0.7726651430130005, "num_tokens": 1078313.0, "step": 66 }, { "entropy": 0.548685610294342, "epoch": 0.25, "grad_norm": 0.13704419136047363, "learning_rate": 0.0002, "loss": 0.5478826761245728, "mean_token_accuracy": 0.7788915038108826, "num_tokens": 1094803.0, "step": 67 }, { "entropy": 0.5427667200565338, "epoch": 0.2537313432835821, "grad_norm": 0.16616535186767578, "learning_rate": 0.0002, "loss": 0.5495492815971375, "mean_token_accuracy": 0.7795749753713608, "num_tokens": 1111058.0, "step": 68 }, { "entropy": 0.5463619232177734, "epoch": 0.2574626865671642, "grad_norm": 0.1541680544614792, "learning_rate": 0.0002, "loss": 0.5557973980903625, "mean_token_accuracy": 0.7797737270593643, "num_tokens": 1127187.0, "step": 69 }, { "entropy": 0.5503609925508499, "epoch": 0.26119402985074625, "grad_norm": 0.16344738006591797, "learning_rate": 0.0002, "loss": 0.5560310482978821, "mean_token_accuracy": 0.7764633148908615, "num_tokens": 1143517.0, "step": 70 }, { "entropy": 0.564177006483078, "epoch": 0.26492537313432835, "grad_norm": 0.1369864046573639, "learning_rate": 0.0002, "loss": 0.5619618892669678, "mean_token_accuracy": 0.774873822927475, "num_tokens": 1160191.0, "step": 71 }, { "entropy": 0.5624472498893738, "epoch": 0.26865671641791045, "grad_norm": 0.16099311411380768, "learning_rate": 0.0002, "loss": 0.5546153783798218, "mean_token_accuracy": 0.7775298207998276, "num_tokens": 1176379.0, "step": 72 }, { "entropy": 0.5442378669977188, "epoch": 0.27238805970149255, "grad_norm": 0.18382063508033752, "learning_rate": 0.0002, "loss": 0.5439026951789856, "mean_token_accuracy": 0.7808986604213715, "num_tokens": 1192611.0, "step": 73 }, { "entropy": 0.5539779812097549, "epoch": 0.27611940298507465, "grad_norm": 0.14527475833892822, "learning_rate": 0.0002, "loss": 0.5488794445991516, "mean_token_accuracy": 0.7770136892795563, "num_tokens": 1209218.0, "step": 74 }, { "entropy": 0.5399174243211746, "epoch": 0.2798507462686567, "grad_norm": 0.16744667291641235, "learning_rate": 0.0002, "loss": 0.5474289059638977, "mean_token_accuracy": 0.7779674381017685, "num_tokens": 1225760.0, "step": 75 }, { "entropy": 0.5410275682806969, "epoch": 0.2835820895522388, "grad_norm": 0.1709633320569992, "learning_rate": 0.0002, "loss": 0.548405110836029, "mean_token_accuracy": 0.7785314917564392, "num_tokens": 1242263.0, "step": 76 }, { "entropy": 0.5613621175289154, "epoch": 0.2873134328358209, "grad_norm": 0.13462653756141663, "learning_rate": 0.0002, "loss": 0.5592188835144043, "mean_token_accuracy": 0.7736580222845078, "num_tokens": 1258802.0, "step": 77 }, { "entropy": 0.5370856672525406, "epoch": 0.291044776119403, "grad_norm": 0.14010556042194366, "learning_rate": 0.0002, "loss": 0.5362333655357361, "mean_token_accuracy": 0.7829223275184631, "num_tokens": 1274985.0, "step": 78 }, { "entropy": 0.5476308465003967, "epoch": 0.2947761194029851, "grad_norm": 0.14489887654781342, "learning_rate": 0.0002, "loss": 0.549788236618042, "mean_token_accuracy": 0.7797223776578903, "num_tokens": 1291341.0, "step": 79 }, { "entropy": 0.5441256165504456, "epoch": 0.29850746268656714, "grad_norm": 0.14331087470054626, "learning_rate": 0.0002, "loss": 0.5457456111907959, "mean_token_accuracy": 0.7812238931655884, "num_tokens": 1307441.0, "step": 80 }, { "entropy": 0.5347439795732498, "epoch": 0.30223880597014924, "grad_norm": 0.13690398633480072, "learning_rate": 0.0002, "loss": 0.5451613068580627, "mean_token_accuracy": 0.7763567119836807, "num_tokens": 1323409.0, "step": 81 }, { "entropy": 0.5473417937755585, "epoch": 0.30597014925373134, "grad_norm": 0.16063734889030457, "learning_rate": 0.0002, "loss": 0.5565767288208008, "mean_token_accuracy": 0.7768999934196472, "num_tokens": 1339750.0, "step": 82 }, { "entropy": 0.5419514924287796, "epoch": 0.30970149253731344, "grad_norm": 0.16186301410198212, "learning_rate": 0.0002, "loss": 0.5480918288230896, "mean_token_accuracy": 0.7810427248477936, "num_tokens": 1355977.0, "step": 83 }, { "entropy": 0.5665269196033478, "epoch": 0.31343283582089554, "grad_norm": 0.14284147322177887, "learning_rate": 0.0002, "loss": 0.5600348711013794, "mean_token_accuracy": 0.7740004658699036, "num_tokens": 1372396.0, "step": 84 }, { "entropy": 0.5530648082494736, "epoch": 0.31716417910447764, "grad_norm": 0.1373152732849121, "learning_rate": 0.0002, "loss": 0.547944962978363, "mean_token_accuracy": 0.7793020755052567, "num_tokens": 1388474.0, "step": 85 }, { "entropy": 0.5625097453594208, "epoch": 0.3208955223880597, "grad_norm": 0.1248691976070404, "learning_rate": 0.0002, "loss": 0.5582663416862488, "mean_token_accuracy": 0.7758172750473022, "num_tokens": 1404880.0, "step": 86 }, { "entropy": 0.5460606664419174, "epoch": 0.3246268656716418, "grad_norm": 0.16231709718704224, "learning_rate": 0.0002, "loss": 0.5510202646255493, "mean_token_accuracy": 0.7779169529676437, "num_tokens": 1421168.0, "step": 87 }, { "entropy": 0.5403235554695129, "epoch": 0.3283582089552239, "grad_norm": 0.15352240204811096, "learning_rate": 0.0002, "loss": 0.5474361181259155, "mean_token_accuracy": 0.7786824256181717, "num_tokens": 1437433.0, "step": 88 }, { "entropy": 0.550665482878685, "epoch": 0.332089552238806, "grad_norm": 0.17033375799655914, "learning_rate": 0.0002, "loss": 0.5535221695899963, "mean_token_accuracy": 0.7792181968688965, "num_tokens": 1453476.0, "step": 89 }, { "entropy": 0.563551127910614, "epoch": 0.3358208955223881, "grad_norm": 0.13113154470920563, "learning_rate": 0.0002, "loss": 0.5608611106872559, "mean_token_accuracy": 0.7760418206453323, "num_tokens": 1469909.0, "step": 90 }, { "entropy": 0.5737572461366653, "epoch": 0.33955223880597013, "grad_norm": 0.12551374733448029, "learning_rate": 0.0002, "loss": 0.5643397569656372, "mean_token_accuracy": 0.7728746980428696, "num_tokens": 1486426.0, "step": 91 }, { "entropy": 0.5659501850605011, "epoch": 0.34328358208955223, "grad_norm": 0.15791846811771393, "learning_rate": 0.0002, "loss": 0.5704576969146729, "mean_token_accuracy": 0.7684866786003113, "num_tokens": 1502522.0, "step": 92 }, { "entropy": 0.5568918883800507, "epoch": 0.34701492537313433, "grad_norm": 0.14071005582809448, "learning_rate": 0.0002, "loss": 0.559943437576294, "mean_token_accuracy": 0.7734934538602829, "num_tokens": 1518718.0, "step": 93 }, { "entropy": 0.5584161728620529, "epoch": 0.35074626865671643, "grad_norm": 0.14257407188415527, "learning_rate": 0.0002, "loss": 0.5574990510940552, "mean_token_accuracy": 0.7743052095174789, "num_tokens": 1534997.0, "step": 94 }, { "entropy": 0.5583510845899582, "epoch": 0.35447761194029853, "grad_norm": 0.13653768599033356, "learning_rate": 0.0002, "loss": 0.5597235560417175, "mean_token_accuracy": 0.7758298218250275, "num_tokens": 1551457.0, "step": 95 }, { "entropy": 0.5537077486515045, "epoch": 0.3582089552238806, "grad_norm": 0.14674222469329834, "learning_rate": 0.0002, "loss": 0.5539477467536926, "mean_token_accuracy": 0.7744529694318771, "num_tokens": 1567731.0, "step": 96 }, { "entropy": 0.5472210198640823, "epoch": 0.3619402985074627, "grad_norm": 0.1276751160621643, "learning_rate": 0.0002, "loss": 0.5464935898780823, "mean_token_accuracy": 0.7826344817876816, "num_tokens": 1584021.0, "step": 97 }, { "entropy": 0.5479029715061188, "epoch": 0.3656716417910448, "grad_norm": 0.16119465231895447, "learning_rate": 0.0002, "loss": 0.5547060966491699, "mean_token_accuracy": 0.7760697901248932, "num_tokens": 1600533.0, "step": 98 }, { "entropy": 0.5536443293094635, "epoch": 0.3694029850746269, "grad_norm": 0.12991106510162354, "learning_rate": 0.0002, "loss": 0.5573412775993347, "mean_token_accuracy": 0.7744511961936951, "num_tokens": 1616690.0, "step": 99 }, { "entropy": 0.5505102574825287, "epoch": 0.373134328358209, "grad_norm": 0.1364317238330841, "learning_rate": 0.0002, "loss": 0.5571202635765076, "mean_token_accuracy": 0.7761907130479813, "num_tokens": 1632957.0, "step": 100 }, { "entropy": 0.5503265261650085, "epoch": 0.376865671641791, "grad_norm": 0.14918965101242065, "learning_rate": 0.0002, "loss": 0.5452536344528198, "mean_token_accuracy": 0.7773023992776871, "num_tokens": 1649397.0, "step": 101 }, { "entropy": 0.5523863285779953, "epoch": 0.3805970149253731, "grad_norm": 0.14225420355796814, "learning_rate": 0.0002, "loss": 0.5425117611885071, "mean_token_accuracy": 0.7800490856170654, "num_tokens": 1665876.0, "step": 102 }, { "entropy": 0.5518430918455124, "epoch": 0.3843283582089552, "grad_norm": 0.12764710187911987, "learning_rate": 0.0002, "loss": 0.5529345870018005, "mean_token_accuracy": 0.7768139094114304, "num_tokens": 1682296.0, "step": 103 }, { "entropy": 0.5581493228673935, "epoch": 0.3880597014925373, "grad_norm": 0.16170883178710938, "learning_rate": 0.0002, "loss": 0.5702566504478455, "mean_token_accuracy": 0.7671579420566559, "num_tokens": 1698550.0, "step": 104 }, { "entropy": 0.558798760175705, "epoch": 0.3917910447761194, "grad_norm": 0.14736565947532654, "learning_rate": 0.0002, "loss": 0.5634024143218994, "mean_token_accuracy": 0.7718724012374878, "num_tokens": 1714882.0, "step": 105 }, { "entropy": 0.5496668964624405, "epoch": 0.39552238805970147, "grad_norm": 0.150962695479393, "learning_rate": 0.0002, "loss": 0.5452749133110046, "mean_token_accuracy": 0.7789688110351562, "num_tokens": 1731436.0, "step": 106 }, { "entropy": 0.5397633910179138, "epoch": 0.39925373134328357, "grad_norm": 0.12951846420764923, "learning_rate": 0.0002, "loss": 0.5374678373336792, "mean_token_accuracy": 0.7823840379714966, "num_tokens": 1747667.0, "step": 107 }, { "entropy": 0.5504965782165527, "epoch": 0.40298507462686567, "grad_norm": 0.1469883769750595, "learning_rate": 0.0002, "loss": 0.5489968061447144, "mean_token_accuracy": 0.7779988348484039, "num_tokens": 1763956.0, "step": 108 }, { "entropy": 0.5401955544948578, "epoch": 0.40671641791044777, "grad_norm": 0.14114412665367126, "learning_rate": 0.0002, "loss": 0.5469740033149719, "mean_token_accuracy": 0.7791216820478439, "num_tokens": 1780050.0, "step": 109 }, { "entropy": 0.5623095035552979, "epoch": 0.41044776119402987, "grad_norm": 0.12923510372638702, "learning_rate": 0.0002, "loss": 0.5578881502151489, "mean_token_accuracy": 0.7777072787284851, "num_tokens": 1796820.0, "step": 110 }, { "entropy": 0.5413771942257881, "epoch": 0.4141791044776119, "grad_norm": 0.1528160274028778, "learning_rate": 0.0002, "loss": 0.5452436208724976, "mean_token_accuracy": 0.7776108086109161, "num_tokens": 1813232.0, "step": 111 }, { "entropy": 0.5609131902456284, "epoch": 0.417910447761194, "grad_norm": 0.12400584667921066, "learning_rate": 0.0002, "loss": 0.5644053816795349, "mean_token_accuracy": 0.7719212174415588, "num_tokens": 1829542.0, "step": 112 }, { "entropy": 0.543258398771286, "epoch": 0.4216417910447761, "grad_norm": 0.11892957985401154, "learning_rate": 0.0002, "loss": 0.5409727692604065, "mean_token_accuracy": 0.7800008654594421, "num_tokens": 1845855.0, "step": 113 }, { "entropy": 0.5490185469388962, "epoch": 0.4253731343283582, "grad_norm": 0.1497296690940857, "learning_rate": 0.0002, "loss": 0.5536864995956421, "mean_token_accuracy": 0.7792476564645767, "num_tokens": 1862087.0, "step": 114 }, { "entropy": 0.53768490254879, "epoch": 0.4291044776119403, "grad_norm": 0.13764707744121552, "learning_rate": 0.0002, "loss": 0.5394353866577148, "mean_token_accuracy": 0.7829310894012451, "num_tokens": 1878496.0, "step": 115 }, { "entropy": 0.548382118344307, "epoch": 0.43283582089552236, "grad_norm": 0.1350480020046234, "learning_rate": 0.0002, "loss": 0.5588696002960205, "mean_token_accuracy": 0.773399829864502, "num_tokens": 1894649.0, "step": 116 }, { "entropy": 0.5273909568786621, "epoch": 0.43656716417910446, "grad_norm": 0.1509886085987091, "learning_rate": 0.0002, "loss": 0.5329999923706055, "mean_token_accuracy": 0.7835660129785538, "num_tokens": 1910828.0, "step": 117 }, { "entropy": 0.5727127343416214, "epoch": 0.44029850746268656, "grad_norm": 0.12369527667760849, "learning_rate": 0.0002, "loss": 0.5647591948509216, "mean_token_accuracy": 0.7721648663282394, "num_tokens": 1927319.0, "step": 118 }, { "entropy": 0.5657652169466019, "epoch": 0.44402985074626866, "grad_norm": 0.14263150095939636, "learning_rate": 0.0002, "loss": 0.5616084337234497, "mean_token_accuracy": 0.7732421457767487, "num_tokens": 1943783.0, "step": 119 }, { "entropy": 0.5638687461614609, "epoch": 0.44776119402985076, "grad_norm": 0.11849121749401093, "learning_rate": 0.0002, "loss": 0.5577123165130615, "mean_token_accuracy": 0.7739600390195847, "num_tokens": 1960125.0, "step": 120 }, { "entropy": 0.5605282336473465, "epoch": 0.45149253731343286, "grad_norm": 0.1323515772819519, "learning_rate": 0.0002, "loss": 0.557800829410553, "mean_token_accuracy": 0.7727965116500854, "num_tokens": 1976458.0, "step": 121 }, { "entropy": 0.5336878746747971, "epoch": 0.4552238805970149, "grad_norm": 0.14154070615768433, "learning_rate": 0.0002, "loss": 0.5429147481918335, "mean_token_accuracy": 0.7805563360452652, "num_tokens": 1992835.0, "step": 122 }, { "entropy": 0.5291022211313248, "epoch": 0.458955223880597, "grad_norm": 0.15199723839759827, "learning_rate": 0.0002, "loss": 0.5432179570198059, "mean_token_accuracy": 0.7801262736320496, "num_tokens": 2008972.0, "step": 123 }, { "entropy": 0.551175132393837, "epoch": 0.4626865671641791, "grad_norm": 0.11983563005924225, "learning_rate": 0.0002, "loss": 0.5541180968284607, "mean_token_accuracy": 0.7762188464403152, "num_tokens": 2025359.0, "step": 124 }, { "entropy": 0.5533900856971741, "epoch": 0.4664179104477612, "grad_norm": 0.11737282574176788, "learning_rate": 0.0002, "loss": 0.5463876724243164, "mean_token_accuracy": 0.7790547609329224, "num_tokens": 2041643.0, "step": 125 }, { "entropy": 0.5509413182735443, "epoch": 0.4701492537313433, "grad_norm": 0.13276953995227814, "learning_rate": 0.0002, "loss": 0.5425540208816528, "mean_token_accuracy": 0.7806166559457779, "num_tokens": 2057820.0, "step": 126 }, { "entropy": 0.5531751215457916, "epoch": 0.47388059701492535, "grad_norm": 0.12553741037845612, "learning_rate": 0.0002, "loss": 0.5523180961608887, "mean_token_accuracy": 0.7784822881221771, "num_tokens": 2074179.0, "step": 127 }, { "entropy": 0.546363577246666, "epoch": 0.47761194029850745, "grad_norm": 0.13337954878807068, "learning_rate": 0.0002, "loss": 0.5551460981369019, "mean_token_accuracy": 0.7742737084627151, "num_tokens": 2090654.0, "step": 128 }, { "entropy": 0.5285965204238892, "epoch": 0.48134328358208955, "grad_norm": 0.13400429487228394, "learning_rate": 0.0002, "loss": 0.5407966375350952, "mean_token_accuracy": 0.7815738469362259, "num_tokens": 2107063.0, "step": 129 }, { "entropy": 0.5335082858800888, "epoch": 0.48507462686567165, "grad_norm": 0.13302984833717346, "learning_rate": 0.0002, "loss": 0.5388374328613281, "mean_token_accuracy": 0.7839466333389282, "num_tokens": 2123452.0, "step": 130 }, { "entropy": 0.557282879948616, "epoch": 0.48880597014925375, "grad_norm": 0.13119758665561676, "learning_rate": 0.0002, "loss": 0.5534148812294006, "mean_token_accuracy": 0.7738241106271744, "num_tokens": 2139585.0, "step": 131 }, { "entropy": 0.5428808927536011, "epoch": 0.4925373134328358, "grad_norm": 0.12375836819410324, "learning_rate": 0.0002, "loss": 0.5381428003311157, "mean_token_accuracy": 0.7813713997602463, "num_tokens": 2155902.0, "step": 132 }, { "entropy": 0.5618433207273483, "epoch": 0.4962686567164179, "grad_norm": 0.13146650791168213, "learning_rate": 0.0002, "loss": 0.552733838558197, "mean_token_accuracy": 0.7768221199512482, "num_tokens": 2172496.0, "step": 133 }, { "entropy": 0.5565268397331238, "epoch": 0.5, "grad_norm": 0.11766450107097626, "learning_rate": 0.0002, "loss": 0.5559637546539307, "mean_token_accuracy": 0.7758495062589645, "num_tokens": 2188987.0, "step": 134 }, { "entropy": 0.5205433219671249, "epoch": 0.503731343283582, "grad_norm": 0.12712325155735016, "learning_rate": 0.0002, "loss": 0.5280570387840271, "mean_token_accuracy": 0.7863014787435532, "num_tokens": 2205010.0, "step": 135 }, { "entropy": 0.5373736917972565, "epoch": 0.5074626865671642, "grad_norm": 0.13094842433929443, "learning_rate": 0.0002, "loss": 0.5430901050567627, "mean_token_accuracy": 0.780227467417717, "num_tokens": 2221474.0, "step": 136 }, { "entropy": 0.5688028186559677, "epoch": 0.5111940298507462, "grad_norm": 0.1379985511302948, "learning_rate": 0.0002, "loss": 0.5740535855293274, "mean_token_accuracy": 0.7692983150482178, "num_tokens": 2238030.0, "step": 137 }, { "entropy": 0.5621554553508759, "epoch": 0.5149253731343284, "grad_norm": 0.13305246829986572, "learning_rate": 0.0002, "loss": 0.5573163032531738, "mean_token_accuracy": 0.7748852521181107, "num_tokens": 2254436.0, "step": 138 }, { "entropy": 0.5507737994194031, "epoch": 0.5186567164179104, "grad_norm": 0.12606868147850037, "learning_rate": 0.0002, "loss": 0.5473536849021912, "mean_token_accuracy": 0.7785522937774658, "num_tokens": 2270806.0, "step": 139 }, { "entropy": 0.5534549057483673, "epoch": 0.5223880597014925, "grad_norm": 0.14390718936920166, "learning_rate": 0.0002, "loss": 0.5571063756942749, "mean_token_accuracy": 0.7750511020421982, "num_tokens": 2286975.0, "step": 140 }, { "entropy": 0.5419649630784988, "epoch": 0.5261194029850746, "grad_norm": 0.13526654243469238, "learning_rate": 0.0002, "loss": 0.5507834553718567, "mean_token_accuracy": 0.7767505496740341, "num_tokens": 2303373.0, "step": 141 }, { "entropy": 0.5532436519861221, "epoch": 0.5298507462686567, "grad_norm": 0.1307537853717804, "learning_rate": 0.0002, "loss": 0.5537344813346863, "mean_token_accuracy": 0.7779698222875595, "num_tokens": 2319833.0, "step": 142 }, { "entropy": 0.5443145930767059, "epoch": 0.5335820895522388, "grad_norm": 0.12360236793756485, "learning_rate": 0.0002, "loss": 0.5414459109306335, "mean_token_accuracy": 0.7796581238508224, "num_tokens": 2336100.0, "step": 143 }, { "entropy": 0.5436644405126572, "epoch": 0.5373134328358209, "grad_norm": 0.13813567161560059, "learning_rate": 0.0002, "loss": 0.5399284362792969, "mean_token_accuracy": 0.781887099146843, "num_tokens": 2352431.0, "step": 144 }, { "entropy": 0.554161787033081, "epoch": 0.5410447761194029, "grad_norm": 0.1234111338853836, "learning_rate": 0.0002, "loss": 0.5504522323608398, "mean_token_accuracy": 0.7768333554267883, "num_tokens": 2368781.0, "step": 145 }, { "entropy": 0.540039673447609, "epoch": 0.5447761194029851, "grad_norm": 0.12760984897613525, "learning_rate": 0.0002, "loss": 0.5470931529998779, "mean_token_accuracy": 0.7785885185003281, "num_tokens": 2385030.0, "step": 146 }, { "entropy": 0.538455605506897, "epoch": 0.5485074626865671, "grad_norm": 0.11708244681358337, "learning_rate": 0.0002, "loss": 0.540416419506073, "mean_token_accuracy": 0.782222330570221, "num_tokens": 2401529.0, "step": 147 }, { "entropy": 0.5445697456598282, "epoch": 0.5522388059701493, "grad_norm": 0.11756740510463715, "learning_rate": 0.0002, "loss": 0.5511283278465271, "mean_token_accuracy": 0.7760586440563202, "num_tokens": 2417920.0, "step": 148 }, { "entropy": 0.5568743199110031, "epoch": 0.5559701492537313, "grad_norm": 0.1262131929397583, "learning_rate": 0.0002, "loss": 0.5587324500083923, "mean_token_accuracy": 0.7755658030509949, "num_tokens": 2434402.0, "step": 149 }, { "entropy": 0.5476635098457336, "epoch": 0.5597014925373134, "grad_norm": 0.14212746918201447, "learning_rate": 0.0002, "loss": 0.5485654473304749, "mean_token_accuracy": 0.7787987738847733, "num_tokens": 2450648.0, "step": 150 }, { "entropy": 0.5328710079193115, "epoch": 0.5634328358208955, "grad_norm": 0.1456608921289444, "learning_rate": 0.0002, "loss": 0.5320286750793457, "mean_token_accuracy": 0.7839557826519012, "num_tokens": 2466701.0, "step": 151 }, { "entropy": 0.5372531861066818, "epoch": 0.5671641791044776, "grad_norm": 0.11793923377990723, "learning_rate": 0.0002, "loss": 0.5379877090454102, "mean_token_accuracy": 0.7800156623125076, "num_tokens": 2482627.0, "step": 152 }, { "entropy": 0.5532563626766205, "epoch": 0.5708955223880597, "grad_norm": 0.13809776306152344, "learning_rate": 0.0002, "loss": 0.551555871963501, "mean_token_accuracy": 0.7761517316102982, "num_tokens": 2499250.0, "step": 153 }, { "entropy": 0.5471682995557785, "epoch": 0.5746268656716418, "grad_norm": 0.1408306509256363, "learning_rate": 0.0002, "loss": 0.5491219758987427, "mean_token_accuracy": 0.7767983973026276, "num_tokens": 2515443.0, "step": 154 }, { "entropy": 0.571009948849678, "epoch": 0.5783582089552238, "grad_norm": 0.1486109346151352, "learning_rate": 0.0002, "loss": 0.5713759660720825, "mean_token_accuracy": 0.7713276296854019, "num_tokens": 2531761.0, "step": 155 }, { "entropy": 0.5617386847734451, "epoch": 0.582089552238806, "grad_norm": 0.15764987468719482, "learning_rate": 0.0002, "loss": 0.5562607645988464, "mean_token_accuracy": 0.7755531519651413, "num_tokens": 2548176.0, "step": 156 }, { "entropy": 0.5492932498455048, "epoch": 0.585820895522388, "grad_norm": 0.153673455119133, "learning_rate": 0.0002, "loss": 0.5581745505332947, "mean_token_accuracy": 0.7730790227651596, "num_tokens": 2564448.0, "step": 157 }, { "entropy": 0.555228590965271, "epoch": 0.5895522388059702, "grad_norm": 0.1345115751028061, "learning_rate": 0.0002, "loss": 0.5605562329292297, "mean_token_accuracy": 0.7717746198177338, "num_tokens": 2580905.0, "step": 158 }, { "entropy": 0.5399526059627533, "epoch": 0.5932835820895522, "grad_norm": 0.11657729744911194, "learning_rate": 0.0002, "loss": 0.5369132161140442, "mean_token_accuracy": 0.7842999547719955, "num_tokens": 2597180.0, "step": 159 }, { "entropy": 0.5353947132825851, "epoch": 0.5970149253731343, "grad_norm": 0.1333966851234436, "learning_rate": 0.0002, "loss": 0.5362208485603333, "mean_token_accuracy": 0.7827091217041016, "num_tokens": 2613444.0, "step": 160 }, { "entropy": 0.5535644590854645, "epoch": 0.6007462686567164, "grad_norm": 0.13608874380588531, "learning_rate": 0.0002, "loss": 0.5567671656608582, "mean_token_accuracy": 0.7774695008993149, "num_tokens": 2629983.0, "step": 161 }, { "entropy": 0.5560604184865952, "epoch": 0.6044776119402985, "grad_norm": 0.1163283959031105, "learning_rate": 0.0002, "loss": 0.5636521577835083, "mean_token_accuracy": 0.7745625525712967, "num_tokens": 2646578.0, "step": 162 }, { "entropy": 0.5764736235141754, "epoch": 0.6082089552238806, "grad_norm": 0.1255754828453064, "learning_rate": 0.0002, "loss": 0.578213632106781, "mean_token_accuracy": 0.7662594020366669, "num_tokens": 2663032.0, "step": 163 }, { "entropy": 0.5460716336965561, "epoch": 0.6119402985074627, "grad_norm": 0.13686135411262512, "learning_rate": 0.0002, "loss": 0.5406862497329712, "mean_token_accuracy": 0.7790546417236328, "num_tokens": 2679368.0, "step": 164 }, { "entropy": 0.5340383723378181, "epoch": 0.6156716417910447, "grad_norm": 0.12064651399850845, "learning_rate": 0.0002, "loss": 0.5316583514213562, "mean_token_accuracy": 0.7829991579055786, "num_tokens": 2695866.0, "step": 165 }, { "entropy": 0.5442641973495483, "epoch": 0.6194029850746269, "grad_norm": 0.12049891799688339, "learning_rate": 0.0002, "loss": 0.5513224005699158, "mean_token_accuracy": 0.7753165811300278, "num_tokens": 2712061.0, "step": 166 }, { "entropy": 0.5361381322145462, "epoch": 0.6231343283582089, "grad_norm": 0.13572274148464203, "learning_rate": 0.0002, "loss": 0.5410642623901367, "mean_token_accuracy": 0.7834690064191818, "num_tokens": 2728405.0, "step": 167 }, { "entropy": 0.542312353849411, "epoch": 0.6268656716417911, "grad_norm": 0.12791581451892853, "learning_rate": 0.0002, "loss": 0.5421413779258728, "mean_token_accuracy": 0.7781463712453842, "num_tokens": 2744612.0, "step": 168 }, { "entropy": 0.5568868666887283, "epoch": 0.6305970149253731, "grad_norm": 0.12156295031309128, "learning_rate": 0.0002, "loss": 0.5577100515365601, "mean_token_accuracy": 0.7726946324110031, "num_tokens": 2761047.0, "step": 169 }, { "entropy": 0.5537672489881516, "epoch": 0.6343283582089553, "grad_norm": 0.1293496936559677, "learning_rate": 0.0002, "loss": 0.5571946501731873, "mean_token_accuracy": 0.7751306742429733, "num_tokens": 2777250.0, "step": 170 }, { "entropy": 0.5509191900491714, "epoch": 0.6380597014925373, "grad_norm": 0.1272898018360138, "learning_rate": 0.0002, "loss": 0.5516744256019592, "mean_token_accuracy": 0.7766414433717728, "num_tokens": 2793605.0, "step": 171 }, { "entropy": 0.5510837286710739, "epoch": 0.6417910447761194, "grad_norm": 0.14305925369262695, "learning_rate": 0.0002, "loss": 0.5544188618659973, "mean_token_accuracy": 0.7760672718286514, "num_tokens": 2809948.0, "step": 172 }, { "entropy": 0.5232614651322365, "epoch": 0.6455223880597015, "grad_norm": 0.1384088695049286, "learning_rate": 0.0002, "loss": 0.5274964570999146, "mean_token_accuracy": 0.7859550416469574, "num_tokens": 2826128.0, "step": 173 }, { "entropy": 0.5601816028356552, "epoch": 0.6492537313432836, "grad_norm": 0.1388508826494217, "learning_rate": 0.0002, "loss": 0.5543120503425598, "mean_token_accuracy": 0.7758214622735977, "num_tokens": 2842612.0, "step": 174 }, { "entropy": 0.5437414795160294, "epoch": 0.6529850746268657, "grad_norm": 0.11655397713184357, "learning_rate": 0.0002, "loss": 0.5404227375984192, "mean_token_accuracy": 0.7822663187980652, "num_tokens": 2859123.0, "step": 175 }, { "entropy": 0.55133356153965, "epoch": 0.6567164179104478, "grad_norm": 0.1398521363735199, "learning_rate": 0.0002, "loss": 0.5518021583557129, "mean_token_accuracy": 0.7771210372447968, "num_tokens": 2875360.0, "step": 176 }, { "entropy": 0.5468268245458603, "epoch": 0.6604477611940298, "grad_norm": 0.12005320936441422, "learning_rate": 0.0002, "loss": 0.5481685996055603, "mean_token_accuracy": 0.7786961048841476, "num_tokens": 2891626.0, "step": 177 }, { "entropy": 0.5444129258394241, "epoch": 0.664179104477612, "grad_norm": 0.16883929073810577, "learning_rate": 0.0002, "loss": 0.5526378750801086, "mean_token_accuracy": 0.7768739610910416, "num_tokens": 2907939.0, "step": 178 }, { "entropy": 0.5393242985010147, "epoch": 0.667910447761194, "grad_norm": 0.1297578513622284, "learning_rate": 0.0002, "loss": 0.5451361536979675, "mean_token_accuracy": 0.7800205200910568, "num_tokens": 2924294.0, "step": 179 }, { "entropy": 0.5417011380195618, "epoch": 0.6716417910447762, "grad_norm": 0.12030332535505295, "learning_rate": 0.0002, "loss": 0.5440862774848938, "mean_token_accuracy": 0.7813349515199661, "num_tokens": 2940716.0, "step": 180 }, { "entropy": 0.5521986186504364, "epoch": 0.6753731343283582, "grad_norm": 0.11406023800373077, "learning_rate": 0.0002, "loss": 0.5487515926361084, "mean_token_accuracy": 0.7764244675636292, "num_tokens": 2956993.0, "step": 181 }, { "entropy": 0.5547273755073547, "epoch": 0.6791044776119403, "grad_norm": 0.13328734040260315, "learning_rate": 0.0002, "loss": 0.552635669708252, "mean_token_accuracy": 0.7759450674057007, "num_tokens": 2973622.0, "step": 182 }, { "entropy": 0.5548880398273468, "epoch": 0.6828358208955224, "grad_norm": 0.11328119784593582, "learning_rate": 0.0002, "loss": 0.5517279505729675, "mean_token_accuracy": 0.7757984399795532, "num_tokens": 2989995.0, "step": 183 }, { "entropy": 0.5576671957969666, "epoch": 0.6865671641791045, "grad_norm": 0.1849256306886673, "learning_rate": 0.0002, "loss": 0.5650368332862854, "mean_token_accuracy": 0.7731626927852631, "num_tokens": 3006538.0, "step": 184 }, { "entropy": 0.537109300494194, "epoch": 0.6902985074626866, "grad_norm": 0.1240711435675621, "learning_rate": 0.0002, "loss": 0.5376191139221191, "mean_token_accuracy": 0.7854040563106537, "num_tokens": 3022770.0, "step": 185 }, { "entropy": 0.5537560731172562, "epoch": 0.6940298507462687, "grad_norm": 0.1654159426689148, "learning_rate": 0.0002, "loss": 0.5570691227912903, "mean_token_accuracy": 0.7766956984996796, "num_tokens": 3039407.0, "step": 186 }, { "entropy": 0.5552389770746231, "epoch": 0.6977611940298507, "grad_norm": 0.10993515700101852, "learning_rate": 0.0002, "loss": 0.5586962103843689, "mean_token_accuracy": 0.7749262005090714, "num_tokens": 3055780.0, "step": 187 }, { "entropy": 0.5666979551315308, "epoch": 0.7014925373134329, "grad_norm": 0.11159558594226837, "learning_rate": 0.0002, "loss": 0.5667304992675781, "mean_token_accuracy": 0.7695165723562241, "num_tokens": 3072362.0, "step": 188 }, { "entropy": 0.5639722347259521, "epoch": 0.7052238805970149, "grad_norm": 0.14158234000205994, "learning_rate": 0.0002, "loss": 0.5614078044891357, "mean_token_accuracy": 0.7733878195285797, "num_tokens": 3088887.0, "step": 189 }, { "entropy": 0.5518735945224762, "epoch": 0.7089552238805971, "grad_norm": 0.12406881153583527, "learning_rate": 0.0002, "loss": 0.5611676573753357, "mean_token_accuracy": 0.7746167629957199, "num_tokens": 3105332.0, "step": 190 }, { "entropy": 0.5349650382995605, "epoch": 0.7126865671641791, "grad_norm": 0.13473471999168396, "learning_rate": 0.0002, "loss": 0.54412841796875, "mean_token_accuracy": 0.7769501060247421, "num_tokens": 3121582.0, "step": 191 }, { "entropy": 0.5316546410322189, "epoch": 0.7164179104477612, "grad_norm": 0.11828400939702988, "learning_rate": 0.0002, "loss": 0.530936062335968, "mean_token_accuracy": 0.7848189175128937, "num_tokens": 3137920.0, "step": 192 }, { "entropy": 0.556887611746788, "epoch": 0.7201492537313433, "grad_norm": 0.1256878823041916, "learning_rate": 0.0002, "loss": 0.555519700050354, "mean_token_accuracy": 0.7738869190216064, "num_tokens": 3154339.0, "step": 193 }, { "entropy": 0.5477663427591324, "epoch": 0.7238805970149254, "grad_norm": 0.11984176933765411, "learning_rate": 0.0002, "loss": 0.5489908456802368, "mean_token_accuracy": 0.7780539244413376, "num_tokens": 3170574.0, "step": 194 }, { "entropy": 0.5371970534324646, "epoch": 0.7276119402985075, "grad_norm": 0.11440598219633102, "learning_rate": 0.0002, "loss": 0.5346511602401733, "mean_token_accuracy": 0.7856602966785431, "num_tokens": 3187140.0, "step": 195 }, { "entropy": 0.5374069362878799, "epoch": 0.7313432835820896, "grad_norm": 0.1220874935388565, "learning_rate": 0.0002, "loss": 0.5448272228240967, "mean_token_accuracy": 0.7792176902294159, "num_tokens": 3203454.0, "step": 196 }, { "entropy": 0.5373833179473877, "epoch": 0.7350746268656716, "grad_norm": 0.14692658185958862, "learning_rate": 0.0002, "loss": 0.547886312007904, "mean_token_accuracy": 0.7767521291971207, "num_tokens": 3219558.0, "step": 197 }, { "entropy": 0.554410994052887, "epoch": 0.7388059701492538, "grad_norm": 0.12380608916282654, "learning_rate": 0.0002, "loss": 0.550884485244751, "mean_token_accuracy": 0.7776724547147751, "num_tokens": 3235877.0, "step": 198 }, { "entropy": 0.5471773892641068, "epoch": 0.7425373134328358, "grad_norm": 0.11140885949134827, "learning_rate": 0.0002, "loss": 0.5401238799095154, "mean_token_accuracy": 0.7774412035942078, "num_tokens": 3252209.0, "step": 199 }, { "entropy": 0.5380608141422272, "epoch": 0.746268656716418, "grad_norm": 0.1454455554485321, "learning_rate": 0.0002, "loss": 0.5387637615203857, "mean_token_accuracy": 0.7800891399383545, "num_tokens": 3268329.0, "step": 200 }, { "entropy": 0.5308581739664078, "epoch": 0.75, "grad_norm": 0.1361016035079956, "learning_rate": 0.0002, "loss": 0.5343608260154724, "mean_token_accuracy": 0.7855110317468643, "num_tokens": 3284338.0, "step": 201 }, { "entropy": 0.5632822811603546, "epoch": 0.753731343283582, "grad_norm": 0.13291221857070923, "learning_rate": 0.0002, "loss": 0.5640154480934143, "mean_token_accuracy": 0.767445370554924, "num_tokens": 3300776.0, "step": 202 }, { "entropy": 0.554180920124054, "epoch": 0.7574626865671642, "grad_norm": 0.12478666007518768, "learning_rate": 0.0002, "loss": 0.5525573492050171, "mean_token_accuracy": 0.774932399392128, "num_tokens": 3317196.0, "step": 203 }, { "entropy": 0.5349105298519135, "epoch": 0.7611940298507462, "grad_norm": 0.12442342936992645, "learning_rate": 0.0002, "loss": 0.5401512980461121, "mean_token_accuracy": 0.7819676995277405, "num_tokens": 3333516.0, "step": 204 }, { "entropy": 0.5417488664388657, "epoch": 0.7649253731343284, "grad_norm": 0.12787121534347534, "learning_rate": 0.0002, "loss": 0.5460774302482605, "mean_token_accuracy": 0.7793125957250595, "num_tokens": 3349860.0, "step": 205 }, { "entropy": 0.5238666534423828, "epoch": 0.7686567164179104, "grad_norm": 0.14022648334503174, "learning_rate": 0.0002, "loss": 0.5336724519729614, "mean_token_accuracy": 0.7843347638845444, "num_tokens": 3365954.0, "step": 206 }, { "entropy": 0.5506514012813568, "epoch": 0.7723880597014925, "grad_norm": 0.10952670127153397, "learning_rate": 0.0002, "loss": 0.5459721684455872, "mean_token_accuracy": 0.7809877097606659, "num_tokens": 3382344.0, "step": 207 }, { "entropy": 0.5601198077201843, "epoch": 0.7761194029850746, "grad_norm": 0.14921848475933075, "learning_rate": 0.0002, "loss": 0.5593782663345337, "mean_token_accuracy": 0.7718043476343155, "num_tokens": 3398687.0, "step": 208 }, { "entropy": 0.5334768891334534, "epoch": 0.7798507462686567, "grad_norm": 0.11596426367759705, "learning_rate": 0.0002, "loss": 0.5338318943977356, "mean_token_accuracy": 0.783938467502594, "num_tokens": 3414913.0, "step": 209 }, { "entropy": 0.5415135025978088, "epoch": 0.7835820895522388, "grad_norm": 0.13524818420410156, "learning_rate": 0.0002, "loss": 0.5422087907791138, "mean_token_accuracy": 0.7810906171798706, "num_tokens": 3431071.0, "step": 210 }, { "entropy": 0.5562594383955002, "epoch": 0.7873134328358209, "grad_norm": 0.14714977145195007, "learning_rate": 0.0002, "loss": 0.5575138926506042, "mean_token_accuracy": 0.7743899971246719, "num_tokens": 3447417.0, "step": 211 }, { "entropy": 0.536840409040451, "epoch": 0.7910447761194029, "grad_norm": 0.1191772073507309, "learning_rate": 0.0002, "loss": 0.539043664932251, "mean_token_accuracy": 0.7791986167430878, "num_tokens": 3463951.0, "step": 212 }, { "entropy": 0.5601708441972733, "epoch": 0.7947761194029851, "grad_norm": 0.14285218715667725, "learning_rate": 0.0002, "loss": 0.5604355931282043, "mean_token_accuracy": 0.7729564011096954, "num_tokens": 3480303.0, "step": 213 }, { "entropy": 0.5470457077026367, "epoch": 0.7985074626865671, "grad_norm": 0.13420677185058594, "learning_rate": 0.0002, "loss": 0.554261326789856, "mean_token_accuracy": 0.7758394628763199, "num_tokens": 3496665.0, "step": 214 }, { "entropy": 0.5595335066318512, "epoch": 0.8022388059701493, "grad_norm": 0.12468434125185013, "learning_rate": 0.0002, "loss": 0.5626363158226013, "mean_token_accuracy": 0.7708792388439178, "num_tokens": 3512987.0, "step": 215 }, { "entropy": 0.5410265326499939, "epoch": 0.8059701492537313, "grad_norm": 0.1368313878774643, "learning_rate": 0.0002, "loss": 0.5424209237098694, "mean_token_accuracy": 0.780338704586029, "num_tokens": 3529322.0, "step": 216 }, { "entropy": 0.5611067861318588, "epoch": 0.8097014925373134, "grad_norm": 0.12065284699201584, "learning_rate": 0.0002, "loss": 0.5554131269454956, "mean_token_accuracy": 0.775262787938118, "num_tokens": 3545541.0, "step": 217 }, { "entropy": 0.5451776385307312, "epoch": 0.8134328358208955, "grad_norm": 0.13018189370632172, "learning_rate": 0.0002, "loss": 0.5477407574653625, "mean_token_accuracy": 0.7790820002555847, "num_tokens": 3562081.0, "step": 218 }, { "entropy": 0.5475118607282639, "epoch": 0.8171641791044776, "grad_norm": 0.1309870183467865, "learning_rate": 0.0002, "loss": 0.548214852809906, "mean_token_accuracy": 0.7790254205465317, "num_tokens": 3578349.0, "step": 219 }, { "entropy": 0.5216370671987534, "epoch": 0.8208955223880597, "grad_norm": 0.1223544329404831, "learning_rate": 0.0002, "loss": 0.5256963968276978, "mean_token_accuracy": 0.787861168384552, "num_tokens": 3594724.0, "step": 220 }, { "entropy": 0.5441537946462631, "epoch": 0.8246268656716418, "grad_norm": 0.1324274092912674, "learning_rate": 0.0002, "loss": 0.5496052503585815, "mean_token_accuracy": 0.7781362533569336, "num_tokens": 3611250.0, "step": 221 }, { "entropy": 0.5336802899837494, "epoch": 0.8283582089552238, "grad_norm": 0.15294679999351501, "learning_rate": 0.0002, "loss": 0.5427975654602051, "mean_token_accuracy": 0.7801742255687714, "num_tokens": 3627526.0, "step": 222 }, { "entropy": 0.5635577589273453, "epoch": 0.832089552238806, "grad_norm": 0.1364123523235321, "learning_rate": 0.0002, "loss": 0.5619288682937622, "mean_token_accuracy": 0.768532395362854, "num_tokens": 3643553.0, "step": 223 }, { "entropy": 0.5576212853193283, "epoch": 0.835820895522388, "grad_norm": 0.1353282779455185, "learning_rate": 0.0002, "loss": 0.5438153147697449, "mean_token_accuracy": 0.779265359044075, "num_tokens": 3660133.0, "step": 224 }, { "entropy": 0.5412103980779648, "epoch": 0.8395522388059702, "grad_norm": 0.12540455162525177, "learning_rate": 0.0002, "loss": 0.5397533774375916, "mean_token_accuracy": 0.7794700264930725, "num_tokens": 3676295.0, "step": 225 }, { "entropy": 0.5455985218286514, "epoch": 0.8432835820895522, "grad_norm": 0.13320018351078033, "learning_rate": 0.0002, "loss": 0.5485510230064392, "mean_token_accuracy": 0.778446152806282, "num_tokens": 3692894.0, "step": 226 }, { "entropy": 0.5248135328292847, "epoch": 0.8470149253731343, "grad_norm": 0.13709791004657745, "learning_rate": 0.0002, "loss": 0.536843478679657, "mean_token_accuracy": 0.7809243649244308, "num_tokens": 3709122.0, "step": 227 }, { "entropy": 0.53542160987854, "epoch": 0.8507462686567164, "grad_norm": 0.12484195083379745, "learning_rate": 0.0002, "loss": 0.5407888293266296, "mean_token_accuracy": 0.7803395837545395, "num_tokens": 3725461.0, "step": 228 }, { "entropy": 0.5458493530750275, "epoch": 0.8544776119402985, "grad_norm": 0.13020864129066467, "learning_rate": 0.0002, "loss": 0.5498859882354736, "mean_token_accuracy": 0.7766377329826355, "num_tokens": 3741717.0, "step": 229 }, { "entropy": 0.5359915047883987, "epoch": 0.8582089552238806, "grad_norm": 0.11409227550029755, "learning_rate": 0.0002, "loss": 0.5289561748504639, "mean_token_accuracy": 0.7882120311260223, "num_tokens": 3757988.0, "step": 230 }, { "entropy": 0.5659278780221939, "epoch": 0.8619402985074627, "grad_norm": 0.10721168667078018, "learning_rate": 0.0002, "loss": 0.5621720552444458, "mean_token_accuracy": 0.7705938816070557, "num_tokens": 3774220.0, "step": 231 }, { "entropy": 0.5599822998046875, "epoch": 0.8656716417910447, "grad_norm": 0.12365678697824478, "learning_rate": 0.0002, "loss": 0.5598929524421692, "mean_token_accuracy": 0.7715335041284561, "num_tokens": 3790653.0, "step": 232 }, { "entropy": 0.54929418861866, "epoch": 0.8694029850746269, "grad_norm": 0.12949936091899872, "learning_rate": 0.0002, "loss": 0.5555176734924316, "mean_token_accuracy": 0.7733278125524521, "num_tokens": 3807110.0, "step": 233 }, { "entropy": 0.5474081933498383, "epoch": 0.8731343283582089, "grad_norm": 0.12146537750959396, "learning_rate": 0.0002, "loss": 0.5511813759803772, "mean_token_accuracy": 0.7766411751508713, "num_tokens": 3823486.0, "step": 234 }, { "entropy": 0.5372883975505829, "epoch": 0.8768656716417911, "grad_norm": 0.12444064766168594, "learning_rate": 0.0002, "loss": 0.5384877324104309, "mean_token_accuracy": 0.7811126857995987, "num_tokens": 3839856.0, "step": 235 }, { "entropy": 0.5574021190404892, "epoch": 0.8805970149253731, "grad_norm": 0.11953511834144592, "learning_rate": 0.0002, "loss": 0.5613345503807068, "mean_token_accuracy": 0.7729752510786057, "num_tokens": 3856362.0, "step": 236 }, { "entropy": 0.5452482104301453, "epoch": 0.8843283582089553, "grad_norm": 0.11208797991275787, "learning_rate": 0.0002, "loss": 0.5457064509391785, "mean_token_accuracy": 0.7782498598098755, "num_tokens": 3872666.0, "step": 237 }, { "entropy": 0.5534125864505768, "epoch": 0.8880597014925373, "grad_norm": 0.15453441441059113, "learning_rate": 0.0002, "loss": 0.5572060346603394, "mean_token_accuracy": 0.7716512382030487, "num_tokens": 3888939.0, "step": 238 }, { "entropy": 0.547100231051445, "epoch": 0.8917910447761194, "grad_norm": 0.12707094848155975, "learning_rate": 0.0002, "loss": 0.5511140823364258, "mean_token_accuracy": 0.7789764106273651, "num_tokens": 3905243.0, "step": 239 }, { "entropy": 0.544873908162117, "epoch": 0.8955223880597015, "grad_norm": 0.13703206181526184, "learning_rate": 0.0002, "loss": 0.5423987507820129, "mean_token_accuracy": 0.7779188007116318, "num_tokens": 3921866.0, "step": 240 }, { "entropy": 0.5453302264213562, "epoch": 0.8992537313432836, "grad_norm": 0.11689020693302155, "learning_rate": 0.0002, "loss": 0.5460352301597595, "mean_token_accuracy": 0.7779721468687057, "num_tokens": 3938407.0, "step": 241 }, { "entropy": 0.5635591447353363, "epoch": 0.9029850746268657, "grad_norm": 0.13040713965892792, "learning_rate": 0.0002, "loss": 0.5655105113983154, "mean_token_accuracy": 0.768951028585434, "num_tokens": 3954812.0, "step": 242 }, { "entropy": 0.5287201702594757, "epoch": 0.9067164179104478, "grad_norm": 0.11932681500911713, "learning_rate": 0.0002, "loss": 0.5290012359619141, "mean_token_accuracy": 0.7868975102901459, "num_tokens": 3970722.0, "step": 243 }, { "entropy": 0.5399811267852783, "epoch": 0.9104477611940298, "grad_norm": 0.15166425704956055, "learning_rate": 0.0002, "loss": 0.5475818514823914, "mean_token_accuracy": 0.7782254964113235, "num_tokens": 3986919.0, "step": 244 }, { "entropy": 0.5479171127080917, "epoch": 0.914179104477612, "grad_norm": 0.13205286860466003, "learning_rate": 0.0002, "loss": 0.5506084561347961, "mean_token_accuracy": 0.7769028395414352, "num_tokens": 4003718.0, "step": 245 }, { "entropy": 0.5506049394607544, "epoch": 0.917910447761194, "grad_norm": 0.1079086884856224, "learning_rate": 0.0002, "loss": 0.5398848056793213, "mean_token_accuracy": 0.7830533385276794, "num_tokens": 4020063.0, "step": 246 }, { "entropy": 0.5654618889093399, "epoch": 0.9216417910447762, "grad_norm": 0.1322406679391861, "learning_rate": 0.0002, "loss": 0.5590391755104065, "mean_token_accuracy": 0.7732941806316376, "num_tokens": 4036681.0, "step": 247 }, { "entropy": 0.546074166893959, "epoch": 0.9253731343283582, "grad_norm": 0.12490007281303406, "learning_rate": 0.0002, "loss": 0.5554251670837402, "mean_token_accuracy": 0.7764608860015869, "num_tokens": 4052971.0, "step": 248 }, { "entropy": 0.5580905228853226, "epoch": 0.9291044776119403, "grad_norm": 0.11980146169662476, "learning_rate": 0.0002, "loss": 0.5676828622817993, "mean_token_accuracy": 0.7696985453367233, "num_tokens": 4069338.0, "step": 249 }, { "entropy": 0.5355470329523087, "epoch": 0.9328358208955224, "grad_norm": 0.12107004970312119, "learning_rate": 0.0002, "loss": 0.5405516028404236, "mean_token_accuracy": 0.7829477041959763, "num_tokens": 4085750.0, "step": 250 }, { "entropy": 0.5567673444747925, "epoch": 0.9365671641791045, "grad_norm": 0.12893939018249512, "learning_rate": 0.0002, "loss": 0.5650359988212585, "mean_token_accuracy": 0.7712520509958267, "num_tokens": 4102118.0, "step": 251 }, { "entropy": 0.5410316288471222, "epoch": 0.9402985074626866, "grad_norm": 0.11652866750955582, "learning_rate": 0.0002, "loss": 0.5460695028305054, "mean_token_accuracy": 0.7774221301078796, "num_tokens": 4118568.0, "step": 252 }, { "entropy": 0.5609200298786163, "epoch": 0.9440298507462687, "grad_norm": 0.11244899779558182, "learning_rate": 0.0002, "loss": 0.5490402579307556, "mean_token_accuracy": 0.7748613804578781, "num_tokens": 4135123.0, "step": 253 }, { "entropy": 0.5497269034385681, "epoch": 0.9477611940298507, "grad_norm": 0.14016613364219666, "learning_rate": 0.0002, "loss": 0.5342196822166443, "mean_token_accuracy": 0.7829579263925552, "num_tokens": 4151216.0, "step": 254 }, { "entropy": 0.5376796424388885, "epoch": 0.9514925373134329, "grad_norm": 0.11261948943138123, "learning_rate": 0.0002, "loss": 0.5384314656257629, "mean_token_accuracy": 0.779564619064331, "num_tokens": 4167504.0, "step": 255 }, { "entropy": 0.5369044691324234, "epoch": 0.9552238805970149, "grad_norm": 0.1335015743970871, "learning_rate": 0.0002, "loss": 0.5465540885925293, "mean_token_accuracy": 0.7757421284914017, "num_tokens": 4183799.0, "step": 256 }, { "entropy": 0.5567403733730316, "epoch": 0.9589552238805971, "grad_norm": 0.14907455444335938, "learning_rate": 0.0002, "loss": 0.567619800567627, "mean_token_accuracy": 0.770223930478096, "num_tokens": 4200155.0, "step": 257 }, { "entropy": 0.5468429028987885, "epoch": 0.9626865671641791, "grad_norm": 0.11520266532897949, "learning_rate": 0.0002, "loss": 0.5453846454620361, "mean_token_accuracy": 0.7773052304983139, "num_tokens": 4216435.0, "step": 258 }, { "entropy": 0.5431469082832336, "epoch": 0.9664179104477612, "grad_norm": 0.13169828057289124, "learning_rate": 0.0002, "loss": 0.5401536822319031, "mean_token_accuracy": 0.7807234972715378, "num_tokens": 4232685.0, "step": 259 }, { "entropy": 0.5463652908802032, "epoch": 0.9701492537313433, "grad_norm": 0.1208634227514267, "learning_rate": 0.0002, "loss": 0.539630115032196, "mean_token_accuracy": 0.7806746661663055, "num_tokens": 4248983.0, "step": 260 }, { "entropy": 0.5373689532279968, "epoch": 0.9738805970149254, "grad_norm": 0.1322765052318573, "learning_rate": 0.0002, "loss": 0.5365580916404724, "mean_token_accuracy": 0.7808263897895813, "num_tokens": 4265223.0, "step": 261 }, { "entropy": 0.5479995906352997, "epoch": 0.9776119402985075, "grad_norm": 0.12395796924829483, "learning_rate": 0.0002, "loss": 0.5560559630393982, "mean_token_accuracy": 0.7720989733934402, "num_tokens": 4281420.0, "step": 262 }, { "entropy": 0.5320831388235092, "epoch": 0.9813432835820896, "grad_norm": 0.15233781933784485, "learning_rate": 0.0002, "loss": 0.5420798659324646, "mean_token_accuracy": 0.7780148983001709, "num_tokens": 4297933.0, "step": 263 }, { "entropy": 0.5410943180322647, "epoch": 0.9850746268656716, "grad_norm": 0.11531079560518265, "learning_rate": 0.0002, "loss": 0.5476459264755249, "mean_token_accuracy": 0.7788786739110947, "num_tokens": 4314320.0, "step": 264 }, { "entropy": 0.5516358613967896, "epoch": 0.9888059701492538, "grad_norm": 0.11947735399007797, "learning_rate": 0.0002, "loss": 0.5536230206489563, "mean_token_accuracy": 0.7767823338508606, "num_tokens": 4330601.0, "step": 265 }, { "entropy": 0.5500903576612473, "epoch": 0.9925373134328358, "grad_norm": 0.12315159291028976, "learning_rate": 0.0002, "loss": 0.5529444813728333, "mean_token_accuracy": 0.7752810269594193, "num_tokens": 4347043.0, "step": 266 }, { "entropy": 0.5517779290676117, "epoch": 0.996268656716418, "grad_norm": 0.11137247085571289, "learning_rate": 0.0002, "loss": 0.5534829497337341, "mean_token_accuracy": 0.7717059701681137, "num_tokens": 4363391.0, "step": 267 }, { "entropy": 0.5500383973121643, "epoch": 1.0, "grad_norm": 0.1438470184803009, "learning_rate": 0.0002, "loss": 0.5475767850875854, "mean_token_accuracy": 0.7807454466819763, "num_tokens": 4379703.0, "step": 268 }, { "entropy": 0.5567186176776886, "epoch": 1.0037313432835822, "grad_norm": 0.12165568768978119, "learning_rate": 0.0002, "loss": 0.5443229079246521, "mean_token_accuracy": 0.7788188308477402, "num_tokens": 4395979.0, "step": 269 }, { "entropy": 0.5200136750936508, "epoch": 1.007462686567164, "grad_norm": 0.11453047394752502, "learning_rate": 0.0002, "loss": 0.5096794962882996, "mean_token_accuracy": 0.7945292145013809, "num_tokens": 4412227.0, "step": 270 }, { "entropy": 0.5380017757415771, "epoch": 1.0111940298507462, "grad_norm": 0.15120473504066467, "learning_rate": 0.0002, "loss": 0.5425546169281006, "mean_token_accuracy": 0.781953439116478, "num_tokens": 4428611.0, "step": 271 }, { "entropy": 0.5208772569894791, "epoch": 1.0149253731343284, "grad_norm": 0.1341351717710495, "learning_rate": 0.0002, "loss": 0.5326657295227051, "mean_token_accuracy": 0.7831600904464722, "num_tokens": 4444927.0, "step": 272 }, { "entropy": 0.5214353799819946, "epoch": 1.0186567164179103, "grad_norm": 0.14984826743602753, "learning_rate": 0.0002, "loss": 0.5280492901802063, "mean_token_accuracy": 0.786370187997818, "num_tokens": 4460991.0, "step": 273 }, { "entropy": 0.5258834809064865, "epoch": 1.0223880597014925, "grad_norm": 0.13014522194862366, "learning_rate": 0.0002, "loss": 0.5271875858306885, "mean_token_accuracy": 0.7869210243225098, "num_tokens": 4477645.0, "step": 274 }, { "entropy": 0.5273120403289795, "epoch": 1.0261194029850746, "grad_norm": 0.1311647742986679, "learning_rate": 0.0002, "loss": 0.5195775032043457, "mean_token_accuracy": 0.7897085547447205, "num_tokens": 4493809.0, "step": 275 }, { "entropy": 0.5415386855602264, "epoch": 1.0298507462686568, "grad_norm": 0.11555178463459015, "learning_rate": 0.0002, "loss": 0.5413332581520081, "mean_token_accuracy": 0.7796304523944855, "num_tokens": 4510212.0, "step": 276 }, { "entropy": 0.5370220988988876, "epoch": 1.0335820895522387, "grad_norm": 0.13971680402755737, "learning_rate": 0.0002, "loss": 0.5396295785903931, "mean_token_accuracy": 0.7788214385509491, "num_tokens": 4526435.0, "step": 277 }, { "entropy": 0.5435305833816528, "epoch": 1.037313432835821, "grad_norm": 0.10762611031532288, "learning_rate": 0.0002, "loss": 0.5435919761657715, "mean_token_accuracy": 0.7784401625394821, "num_tokens": 4542952.0, "step": 278 }, { "entropy": 0.5561162084341049, "epoch": 1.041044776119403, "grad_norm": 0.1305421143770218, "learning_rate": 0.0002, "loss": 0.5544913411140442, "mean_token_accuracy": 0.7771686464548111, "num_tokens": 4559371.0, "step": 279 }, { "entropy": 0.5161843150854111, "epoch": 1.044776119402985, "grad_norm": 0.13184338808059692, "learning_rate": 0.0002, "loss": 0.511843204498291, "mean_token_accuracy": 0.7913843542337418, "num_tokens": 4575731.0, "step": 280 }, { "entropy": 0.52925243973732, "epoch": 1.0485074626865671, "grad_norm": 0.1287873089313507, "learning_rate": 0.0002, "loss": 0.5263785719871521, "mean_token_accuracy": 0.7861436605453491, "num_tokens": 4592056.0, "step": 281 }, { "entropy": 0.5253249853849411, "epoch": 1.0522388059701493, "grad_norm": 0.12661200761795044, "learning_rate": 0.0002, "loss": 0.5272859334945679, "mean_token_accuracy": 0.7849764674901962, "num_tokens": 4608326.0, "step": 282 }, { "entropy": 0.5225464850664139, "epoch": 1.0559701492537314, "grad_norm": 0.11925826221704483, "learning_rate": 0.0002, "loss": 0.5287873148918152, "mean_token_accuracy": 0.7825718820095062, "num_tokens": 4624408.0, "step": 283 }, { "entropy": 0.5239171385765076, "epoch": 1.0597014925373134, "grad_norm": 0.12639594078063965, "learning_rate": 0.0002, "loss": 0.5275134444236755, "mean_token_accuracy": 0.784866139292717, "num_tokens": 4640897.0, "step": 284 }, { "entropy": 0.5350142568349838, "epoch": 1.0634328358208955, "grad_norm": 0.13742367923259735, "learning_rate": 0.0002, "loss": 0.5391872525215149, "mean_token_accuracy": 0.7813242971897125, "num_tokens": 4657487.0, "step": 285 }, { "entropy": 0.5414403080940247, "epoch": 1.0671641791044777, "grad_norm": 0.12273678928613663, "learning_rate": 0.0002, "loss": 0.538042426109314, "mean_token_accuracy": 0.7844662219285965, "num_tokens": 4674009.0, "step": 286 }, { "entropy": 0.5556955337524414, "epoch": 1.0708955223880596, "grad_norm": 0.11591946333646774, "learning_rate": 0.0002, "loss": 0.5542109608650208, "mean_token_accuracy": 0.7758783847093582, "num_tokens": 4690230.0, "step": 287 }, { "entropy": 0.5334881544113159, "epoch": 1.0746268656716418, "grad_norm": 0.11168122291564941, "learning_rate": 0.0002, "loss": 0.5347651243209839, "mean_token_accuracy": 0.7833859175443649, "num_tokens": 4706362.0, "step": 288 }, { "entropy": 0.5315591096878052, "epoch": 1.078358208955224, "grad_norm": 0.13917559385299683, "learning_rate": 0.0002, "loss": 0.5380789041519165, "mean_token_accuracy": 0.7812001705169678, "num_tokens": 4722595.0, "step": 289 }, { "entropy": 0.5346228331327438, "epoch": 1.0820895522388059, "grad_norm": 0.13478422164916992, "learning_rate": 0.0002, "loss": 0.5455847978591919, "mean_token_accuracy": 0.7781703919172287, "num_tokens": 4738887.0, "step": 290 }, { "entropy": 0.5461715310811996, "epoch": 1.085820895522388, "grad_norm": 0.13396981358528137, "learning_rate": 0.0002, "loss": 0.5379023551940918, "mean_token_accuracy": 0.7827265560626984, "num_tokens": 4755212.0, "step": 291 }, { "entropy": 0.5389465689659119, "epoch": 1.0895522388059702, "grad_norm": 0.12781155109405518, "learning_rate": 0.0002, "loss": 0.5376452803611755, "mean_token_accuracy": 0.7828295826911926, "num_tokens": 4771644.0, "step": 292 }, { "entropy": 0.5441965609788895, "epoch": 1.0932835820895523, "grad_norm": 0.13662317395210266, "learning_rate": 0.0002, "loss": 0.53973788022995, "mean_token_accuracy": 0.781336709856987, "num_tokens": 4787994.0, "step": 293 }, { "entropy": 0.557211622595787, "epoch": 1.0970149253731343, "grad_norm": 0.13968485593795776, "learning_rate": 0.0002, "loss": 0.5545478463172913, "mean_token_accuracy": 0.7766687870025635, "num_tokens": 4804240.0, "step": 294 }, { "entropy": 0.5415647476911545, "epoch": 1.1007462686567164, "grad_norm": 0.14245721697807312, "learning_rate": 0.0002, "loss": 0.5388385653495789, "mean_token_accuracy": 0.7829283177852631, "num_tokens": 4820711.0, "step": 295 }, { "entropy": 0.5286812037229538, "epoch": 1.1044776119402986, "grad_norm": 0.14483948051929474, "learning_rate": 0.0002, "loss": 0.5349111557006836, "mean_token_accuracy": 0.7845683097839355, "num_tokens": 4836959.0, "step": 296 }, { "entropy": 0.5258732736110687, "epoch": 1.1082089552238805, "grad_norm": 0.13696761429309845, "learning_rate": 0.0002, "loss": 0.529443085193634, "mean_token_accuracy": 0.7867940962314606, "num_tokens": 4853067.0, "step": 297 }, { "entropy": 0.5512303709983826, "epoch": 1.1119402985074627, "grad_norm": 0.15340439975261688, "learning_rate": 0.0002, "loss": 0.552986741065979, "mean_token_accuracy": 0.7754423469305038, "num_tokens": 4869588.0, "step": 298 }, { "entropy": 0.5339537411928177, "epoch": 1.1156716417910448, "grad_norm": 0.15107926726341248, "learning_rate": 0.0002, "loss": 0.5356568694114685, "mean_token_accuracy": 0.7815524339675903, "num_tokens": 4885904.0, "step": 299 }, { "entropy": 0.5544896274805069, "epoch": 1.1194029850746268, "grad_norm": 0.13157761096954346, "learning_rate": 0.0002, "loss": 0.5553483366966248, "mean_token_accuracy": 0.7737178802490234, "num_tokens": 4902327.0, "step": 300 }, { "entropy": 0.5695160180330276, "epoch": 1.123134328358209, "grad_norm": 0.1447787880897522, "learning_rate": 0.0002, "loss": 0.5667352676391602, "mean_token_accuracy": 0.7724233418703079, "num_tokens": 4918857.0, "step": 301 }, { "entropy": 0.5424528568983078, "epoch": 1.126865671641791, "grad_norm": 0.130395770072937, "learning_rate": 0.0002, "loss": 0.54450523853302, "mean_token_accuracy": 0.7784540206193924, "num_tokens": 4935469.0, "step": 302 }, { "entropy": 0.537494882941246, "epoch": 1.1305970149253732, "grad_norm": 0.1572721302509308, "learning_rate": 0.0002, "loss": 0.539937436580658, "mean_token_accuracy": 0.7787607908248901, "num_tokens": 4951497.0, "step": 303 }, { "entropy": 0.5239665806293488, "epoch": 1.1343283582089552, "grad_norm": 0.14227941632270813, "learning_rate": 0.0002, "loss": 0.5174288153648376, "mean_token_accuracy": 0.7907485216856003, "num_tokens": 4967826.0, "step": 304 }, { "entropy": 0.5226030200719833, "epoch": 1.1380597014925373, "grad_norm": 0.13234300911426544, "learning_rate": 0.0002, "loss": 0.5237756967544556, "mean_token_accuracy": 0.7902256399393082, "num_tokens": 4984247.0, "step": 305 }, { "entropy": 0.5070921406149864, "epoch": 1.1417910447761195, "grad_norm": 0.15718795359134674, "learning_rate": 0.0002, "loss": 0.520646333694458, "mean_token_accuracy": 0.7865647524595261, "num_tokens": 5000320.0, "step": 306 }, { "entropy": 0.5070105642080307, "epoch": 1.1455223880597014, "grad_norm": 0.20183522999286652, "learning_rate": 0.0002, "loss": 0.528045654296875, "mean_token_accuracy": 0.7873903512954712, "num_tokens": 5016226.0, "step": 307 }, { "entropy": 0.5490072518587112, "epoch": 1.1492537313432836, "grad_norm": 0.12259556353092194, "learning_rate": 0.0002, "loss": 0.5465996861457825, "mean_token_accuracy": 0.7795770764350891, "num_tokens": 5032435.0, "step": 308 }, { "entropy": 0.5369555801153183, "epoch": 1.1529850746268657, "grad_norm": 0.17033320665359497, "learning_rate": 0.0002, "loss": 0.5238630175590515, "mean_token_accuracy": 0.7864966690540314, "num_tokens": 5048673.0, "step": 309 }, { "entropy": 0.5474718064069748, "epoch": 1.1567164179104479, "grad_norm": 0.15336251258850098, "learning_rate": 0.0002, "loss": 0.5351282358169556, "mean_token_accuracy": 0.7832874804735184, "num_tokens": 5064889.0, "step": 310 }, { "entropy": 0.5407518595457077, "epoch": 1.1604477611940298, "grad_norm": 0.1288745403289795, "learning_rate": 0.0002, "loss": 0.532909631729126, "mean_token_accuracy": 0.7854967713356018, "num_tokens": 5081181.0, "step": 311 }, { "entropy": 0.5553453862667084, "epoch": 1.164179104477612, "grad_norm": 0.17325082421302795, "learning_rate": 0.0002, "loss": 0.5650225877761841, "mean_token_accuracy": 0.7709382623434067, "num_tokens": 5097695.0, "step": 312 }, { "entropy": 0.5312155932188034, "epoch": 1.1679104477611941, "grad_norm": 0.14813978970050812, "learning_rate": 0.0002, "loss": 0.5398642420768738, "mean_token_accuracy": 0.7819912135601044, "num_tokens": 5114124.0, "step": 313 }, { "entropy": 0.5393004268407822, "epoch": 1.171641791044776, "grad_norm": 0.13244624435901642, "learning_rate": 0.0002, "loss": 0.5397657155990601, "mean_token_accuracy": 0.7833016067743301, "num_tokens": 5130526.0, "step": 314 }, { "entropy": 0.5356107205152512, "epoch": 1.1753731343283582, "grad_norm": 0.1546393185853958, "learning_rate": 0.0002, "loss": 0.5278767347335815, "mean_token_accuracy": 0.7873012572526932, "num_tokens": 5146786.0, "step": 315 }, { "entropy": 0.5360458493232727, "epoch": 1.1791044776119404, "grad_norm": 0.14604224264621735, "learning_rate": 0.0002, "loss": 0.5378543138504028, "mean_token_accuracy": 0.7808638215065002, "num_tokens": 5163157.0, "step": 316 }, { "entropy": 0.5358310341835022, "epoch": 1.1828358208955223, "grad_norm": 0.11514927446842194, "learning_rate": 0.0002, "loss": 0.5323253273963928, "mean_token_accuracy": 0.7850612699985504, "num_tokens": 5179759.0, "step": 317 }, { "entropy": 0.5336421579122543, "epoch": 1.1865671641791045, "grad_norm": 0.14939743280410767, "learning_rate": 0.0002, "loss": 0.5399504899978638, "mean_token_accuracy": 0.7822477370500565, "num_tokens": 5195772.0, "step": 318 }, { "entropy": 0.5196461454033852, "epoch": 1.1902985074626866, "grad_norm": 0.16364845633506775, "learning_rate": 0.0002, "loss": 0.5318784117698669, "mean_token_accuracy": 0.7826407551765442, "num_tokens": 5212049.0, "step": 319 }, { "entropy": 0.5297210067510605, "epoch": 1.1940298507462686, "grad_norm": 0.1340930312871933, "learning_rate": 0.0002, "loss": 0.5342279672622681, "mean_token_accuracy": 0.7825554758310318, "num_tokens": 5228387.0, "step": 320 }, { "entropy": 0.5374090075492859, "epoch": 1.1977611940298507, "grad_norm": 0.13523836433887482, "learning_rate": 0.0002, "loss": 0.5342003107070923, "mean_token_accuracy": 0.7829677164554596, "num_tokens": 5244798.0, "step": 321 }, { "entropy": 0.5403262600302696, "epoch": 1.2014925373134329, "grad_norm": 0.11974834650754929, "learning_rate": 0.0002, "loss": 0.5366995334625244, "mean_token_accuracy": 0.7828448265790939, "num_tokens": 5261240.0, "step": 322 }, { "entropy": 0.5380197167396545, "epoch": 1.205223880597015, "grad_norm": 0.154353529214859, "learning_rate": 0.0002, "loss": 0.533047080039978, "mean_token_accuracy": 0.7859889715909958, "num_tokens": 5277554.0, "step": 323 }, { "entropy": 0.5303442776203156, "epoch": 1.208955223880597, "grad_norm": 0.14264924824237823, "learning_rate": 0.0002, "loss": 0.5314475893974304, "mean_token_accuracy": 0.7831806391477585, "num_tokens": 5293949.0, "step": 324 }, { "entropy": 0.5252211391925812, "epoch": 1.212686567164179, "grad_norm": 0.1556359827518463, "learning_rate": 0.0002, "loss": 0.5285252928733826, "mean_token_accuracy": 0.783245861530304, "num_tokens": 5310026.0, "step": 325 }, { "entropy": 0.5328008607029915, "epoch": 1.2164179104477613, "grad_norm": 0.13450154662132263, "learning_rate": 0.0002, "loss": 0.5320917367935181, "mean_token_accuracy": 0.7842745780944824, "num_tokens": 5326386.0, "step": 326 }, { "entropy": 0.5319949090480804, "epoch": 1.2201492537313432, "grad_norm": 0.12143786996603012, "learning_rate": 0.0002, "loss": 0.5349273681640625, "mean_token_accuracy": 0.7820626497268677, "num_tokens": 5342658.0, "step": 327 }, { "entropy": 0.5234760195016861, "epoch": 1.2238805970149254, "grad_norm": 0.16645972430706024, "learning_rate": 0.0002, "loss": 0.5320586562156677, "mean_token_accuracy": 0.7844817489385605, "num_tokens": 5358974.0, "step": 328 }, { "entropy": 0.5378956496715546, "epoch": 1.2276119402985075, "grad_norm": 0.13522404432296753, "learning_rate": 0.0002, "loss": 0.5357790589332581, "mean_token_accuracy": 0.7823758125305176, "num_tokens": 5375371.0, "step": 329 }, { "entropy": 0.5387023985385895, "epoch": 1.2313432835820897, "grad_norm": 0.1315094530582428, "learning_rate": 0.0002, "loss": 0.5362842082977295, "mean_token_accuracy": 0.7809555679559708, "num_tokens": 5391896.0, "step": 330 }, { "entropy": 0.5072716027498245, "epoch": 1.2350746268656716, "grad_norm": 0.13498196005821228, "learning_rate": 0.0002, "loss": 0.507161021232605, "mean_token_accuracy": 0.7966707944869995, "num_tokens": 5408354.0, "step": 331 }, { "entropy": 0.5260337740182877, "epoch": 1.2388059701492538, "grad_norm": 0.13349276781082153, "learning_rate": 0.0002, "loss": 0.5276508331298828, "mean_token_accuracy": 0.7871510088443756, "num_tokens": 5424531.0, "step": 332 }, { "entropy": 0.5349582731723785, "epoch": 1.242537313432836, "grad_norm": 0.13890203833580017, "learning_rate": 0.0002, "loss": 0.5371206402778625, "mean_token_accuracy": 0.7821635603904724, "num_tokens": 5440815.0, "step": 333 }, { "entropy": 0.5346423760056496, "epoch": 1.2462686567164178, "grad_norm": 0.1553906500339508, "learning_rate": 0.0002, "loss": 0.5395735502243042, "mean_token_accuracy": 0.7817864269018173, "num_tokens": 5457072.0, "step": 334 }, { "entropy": 0.5478692203760147, "epoch": 1.25, "grad_norm": 0.15934403240680695, "learning_rate": 0.0002, "loss": 0.5516626834869385, "mean_token_accuracy": 0.7753347009420395, "num_tokens": 5473422.0, "step": 335 }, { "entropy": 0.5378739535808563, "epoch": 1.2537313432835822, "grad_norm": 0.12844312191009521, "learning_rate": 0.0002, "loss": 0.5326632261276245, "mean_token_accuracy": 0.7827756106853485, "num_tokens": 5489671.0, "step": 336 }, { "entropy": 0.5409121513366699, "epoch": 1.2574626865671643, "grad_norm": 0.1285056471824646, "learning_rate": 0.0002, "loss": 0.5452673435211182, "mean_token_accuracy": 0.7786683291196823, "num_tokens": 5506084.0, "step": 337 }, { "entropy": 0.5422088652849197, "epoch": 1.2611940298507462, "grad_norm": 0.14476130902767181, "learning_rate": 0.0002, "loss": 0.5416613817214966, "mean_token_accuracy": 0.7791768312454224, "num_tokens": 5522548.0, "step": 338 }, { "entropy": 0.5449076443910599, "epoch": 1.2649253731343284, "grad_norm": 0.13138490915298462, "learning_rate": 0.0002, "loss": 0.5395404696464539, "mean_token_accuracy": 0.7813031673431396, "num_tokens": 5539208.0, "step": 339 }, { "entropy": 0.5443570464849472, "epoch": 1.2686567164179103, "grad_norm": 0.15328356623649597, "learning_rate": 0.0002, "loss": 0.5410760641098022, "mean_token_accuracy": 0.7822384089231491, "num_tokens": 5555492.0, "step": 340 }, { "entropy": 0.5302190482616425, "epoch": 1.2723880597014925, "grad_norm": 0.15014180541038513, "learning_rate": 0.0002, "loss": 0.5311694145202637, "mean_token_accuracy": 0.7823975682258606, "num_tokens": 5571999.0, "step": 341 }, { "entropy": 0.5198534801602364, "epoch": 1.2761194029850746, "grad_norm": 0.13281527161598206, "learning_rate": 0.0002, "loss": 0.5303924083709717, "mean_token_accuracy": 0.7844155579805374, "num_tokens": 5588098.0, "step": 342 }, { "entropy": 0.5089417994022369, "epoch": 1.2798507462686568, "grad_norm": 0.1406290978193283, "learning_rate": 0.0002, "loss": 0.5175491571426392, "mean_token_accuracy": 0.7906824499368668, "num_tokens": 5604254.0, "step": 343 }, { "entropy": 0.5032122731208801, "epoch": 1.2835820895522387, "grad_norm": 0.15877749025821686, "learning_rate": 0.0002, "loss": 0.5124095678329468, "mean_token_accuracy": 0.790567934513092, "num_tokens": 5620363.0, "step": 344 }, { "entropy": 0.5435033291578293, "epoch": 1.287313432835821, "grad_norm": 0.1633625328540802, "learning_rate": 0.0002, "loss": 0.553101658821106, "mean_token_accuracy": 0.7757033556699753, "num_tokens": 5636720.0, "step": 345 }, { "entropy": 0.5401125550270081, "epoch": 1.291044776119403, "grad_norm": 0.14126214385032654, "learning_rate": 0.0002, "loss": 0.5362418293952942, "mean_token_accuracy": 0.7848408222198486, "num_tokens": 5653198.0, "step": 346 }, { "entropy": 0.5514497756958008, "epoch": 1.294776119402985, "grad_norm": 0.12672948837280273, "learning_rate": 0.0002, "loss": 0.5441724061965942, "mean_token_accuracy": 0.7795091718435287, "num_tokens": 5669516.0, "step": 347 }, { "entropy": 0.5293784886598587, "epoch": 1.2985074626865671, "grad_norm": 0.11630003899335861, "learning_rate": 0.0002, "loss": 0.5298827886581421, "mean_token_accuracy": 0.783647358417511, "num_tokens": 5685856.0, "step": 348 }, { "entropy": 0.5244417935609818, "epoch": 1.3022388059701493, "grad_norm": 0.14798091351985931, "learning_rate": 0.0002, "loss": 0.5307499170303345, "mean_token_accuracy": 0.7859917134046555, "num_tokens": 5702057.0, "step": 349 }, { "entropy": 0.5323777049779892, "epoch": 1.3059701492537314, "grad_norm": 0.12870146334171295, "learning_rate": 0.0002, "loss": 0.5365279912948608, "mean_token_accuracy": 0.7816431373357773, "num_tokens": 5718688.0, "step": 350 }, { "entropy": 0.5243604183197021, "epoch": 1.3097014925373134, "grad_norm": 0.12391035258769989, "learning_rate": 0.0002, "loss": 0.5227367281913757, "mean_token_accuracy": 0.7866858392953873, "num_tokens": 5734891.0, "step": 351 }, { "entropy": 0.5347918272018433, "epoch": 1.3134328358208955, "grad_norm": 0.145299032330513, "learning_rate": 0.0002, "loss": 0.5310446619987488, "mean_token_accuracy": 0.7831001132726669, "num_tokens": 5751328.0, "step": 352 }, { "entropy": 0.5411982387304306, "epoch": 1.3171641791044777, "grad_norm": 0.1532508134841919, "learning_rate": 0.0002, "loss": 0.5382261276245117, "mean_token_accuracy": 0.7814776748418808, "num_tokens": 5767612.0, "step": 353 }, { "entropy": 0.5384319573640823, "epoch": 1.3208955223880596, "grad_norm": 0.12034327536821365, "learning_rate": 0.0002, "loss": 0.5356577038764954, "mean_token_accuracy": 0.7809152156114578, "num_tokens": 5783823.0, "step": 354 }, { "entropy": 0.5378035828471184, "epoch": 1.3246268656716418, "grad_norm": 0.17426501214504242, "learning_rate": 0.0002, "loss": 0.54035884141922, "mean_token_accuracy": 0.781380295753479, "num_tokens": 5800149.0, "step": 355 }, { "entropy": 0.5415401831269264, "epoch": 1.328358208955224, "grad_norm": 0.1543213427066803, "learning_rate": 0.0002, "loss": 0.5499249696731567, "mean_token_accuracy": 0.7782198786735535, "num_tokens": 5816367.0, "step": 356 }, { "entropy": 0.5541952252388, "epoch": 1.332089552238806, "grad_norm": 0.1483956277370453, "learning_rate": 0.0002, "loss": 0.5502984523773193, "mean_token_accuracy": 0.7760822772979736, "num_tokens": 5832681.0, "step": 357 }, { "entropy": 0.5343631953001022, "epoch": 1.335820895522388, "grad_norm": 0.1370651125907898, "learning_rate": 0.0002, "loss": 0.531204879283905, "mean_token_accuracy": 0.7847591787576675, "num_tokens": 5848778.0, "step": 358 }, { "entropy": 0.5292060524225235, "epoch": 1.3395522388059702, "grad_norm": 0.13134512305259705, "learning_rate": 0.0002, "loss": 0.5340976119041443, "mean_token_accuracy": 0.7800851762294769, "num_tokens": 5864821.0, "step": 359 }, { "entropy": 0.5334947407245636, "epoch": 1.3432835820895521, "grad_norm": 0.1279117912054062, "learning_rate": 0.0002, "loss": 0.5352479815483093, "mean_token_accuracy": 0.7832343429327011, "num_tokens": 5881116.0, "step": 360 }, { "entropy": 0.5323592573404312, "epoch": 1.3470149253731343, "grad_norm": 0.28604868054389954, "learning_rate": 0.0002, "loss": 0.5301060080528259, "mean_token_accuracy": 0.7850496172904968, "num_tokens": 5897810.0, "step": 361 }, { "entropy": 0.5503924041986465, "epoch": 1.3507462686567164, "grad_norm": 0.34482085704803467, "learning_rate": 0.0002, "loss": 0.5528603196144104, "mean_token_accuracy": 0.7764434367418289, "num_tokens": 5914260.0, "step": 362 }, { "entropy": 0.5227297842502594, "epoch": 1.3544776119402986, "grad_norm": 0.12345509976148605, "learning_rate": 0.0002, "loss": 0.5238011479377747, "mean_token_accuracy": 0.7891107350587845, "num_tokens": 5930444.0, "step": 363 }, { "entropy": 0.5462608188390732, "epoch": 1.3582089552238805, "grad_norm": 0.1688961386680603, "learning_rate": 0.0002, "loss": 0.5603306293487549, "mean_token_accuracy": 0.771704226732254, "num_tokens": 5946741.0, "step": 364 }, { "entropy": 0.5538459420204163, "epoch": 1.3619402985074627, "grad_norm": 0.14098992943763733, "learning_rate": 0.0002, "loss": 0.5526646375656128, "mean_token_accuracy": 0.7749083191156387, "num_tokens": 5963128.0, "step": 365 }, { "entropy": 0.5297324359416962, "epoch": 1.3656716417910448, "grad_norm": 0.12920008599758148, "learning_rate": 0.0002, "loss": 0.5280593633651733, "mean_token_accuracy": 0.784359410405159, "num_tokens": 5979218.0, "step": 366 }, { "entropy": 0.5375068634748459, "epoch": 1.3694029850746268, "grad_norm": 0.1362897753715515, "learning_rate": 0.0002, "loss": 0.5373224020004272, "mean_token_accuracy": 0.7841860055923462, "num_tokens": 5995687.0, "step": 367 }, { "entropy": 0.5355936139822006, "epoch": 1.373134328358209, "grad_norm": 0.14052827656269073, "learning_rate": 0.0002, "loss": 0.5387214422225952, "mean_token_accuracy": 0.7806743085384369, "num_tokens": 6012035.0, "step": 368 }, { "entropy": 0.5435226261615753, "epoch": 1.376865671641791, "grad_norm": 0.1556740403175354, "learning_rate": 0.0002, "loss": 0.5441159009933472, "mean_token_accuracy": 0.7787201553583145, "num_tokens": 6028365.0, "step": 369 }, { "entropy": 0.5268312245607376, "epoch": 1.3805970149253732, "grad_norm": 0.15513257682323456, "learning_rate": 0.0002, "loss": 0.5291861891746521, "mean_token_accuracy": 0.7877073138952255, "num_tokens": 6044796.0, "step": 370 }, { "entropy": 0.5517646074295044, "epoch": 1.3843283582089552, "grad_norm": 0.1265048235654831, "learning_rate": 0.0002, "loss": 0.5546433925628662, "mean_token_accuracy": 0.7754338979721069, "num_tokens": 6061487.0, "step": 371 }, { "entropy": 0.5410579442977905, "epoch": 1.3880597014925373, "grad_norm": 0.13882151246070862, "learning_rate": 0.0002, "loss": 0.5375149846076965, "mean_token_accuracy": 0.7817846387624741, "num_tokens": 6077933.0, "step": 372 }, { "entropy": 0.5343161523342133, "epoch": 1.3917910447761195, "grad_norm": 0.1435064971446991, "learning_rate": 0.0002, "loss": 0.5308974981307983, "mean_token_accuracy": 0.7849253863096237, "num_tokens": 6094407.0, "step": 373 }, { "entropy": 0.5472413003444672, "epoch": 1.3955223880597014, "grad_norm": 0.1254650354385376, "learning_rate": 0.0002, "loss": 0.5410266518592834, "mean_token_accuracy": 0.7794545590877533, "num_tokens": 6110923.0, "step": 374 }, { "entropy": 0.5365632474422455, "epoch": 1.3992537313432836, "grad_norm": 0.13213133811950684, "learning_rate": 0.0002, "loss": 0.5404695868492126, "mean_token_accuracy": 0.7813301384449005, "num_tokens": 6127219.0, "step": 375 }, { "entropy": 0.5322464108467102, "epoch": 1.4029850746268657, "grad_norm": 0.1703079640865326, "learning_rate": 0.0002, "loss": 0.5420417189598083, "mean_token_accuracy": 0.7813734114170074, "num_tokens": 6143418.0, "step": 376 }, { "entropy": 0.5500752478837967, "epoch": 1.4067164179104479, "grad_norm": 0.1431417018175125, "learning_rate": 0.0002, "loss": 0.5511533617973328, "mean_token_accuracy": 0.7758170068264008, "num_tokens": 6159747.0, "step": 377 }, { "entropy": 0.5427335649728775, "epoch": 1.4104477611940298, "grad_norm": 0.1817740648984909, "learning_rate": 0.0002, "loss": 0.5414767861366272, "mean_token_accuracy": 0.7784233242273331, "num_tokens": 6176317.0, "step": 378 }, { "entropy": 0.5470531731843948, "epoch": 1.414179104477612, "grad_norm": 0.1422269493341446, "learning_rate": 0.0002, "loss": 0.5472888946533203, "mean_token_accuracy": 0.7780141085386276, "num_tokens": 6192737.0, "step": 379 }, { "entropy": 0.5464377701282501, "epoch": 1.417910447761194, "grad_norm": 0.17506512999534607, "learning_rate": 0.0002, "loss": 0.5490654706954956, "mean_token_accuracy": 0.7765569537878036, "num_tokens": 6208852.0, "step": 380 }, { "entropy": 0.5500655770301819, "epoch": 1.421641791044776, "grad_norm": 0.13887247443199158, "learning_rate": 0.0002, "loss": 0.5514895915985107, "mean_token_accuracy": 0.7774574458599091, "num_tokens": 6225069.0, "step": 381 }, { "entropy": 0.5438679605722427, "epoch": 1.4253731343283582, "grad_norm": 0.19045118987560272, "learning_rate": 0.0002, "loss": 0.5430073738098145, "mean_token_accuracy": 0.7802658081054688, "num_tokens": 6241528.0, "step": 382 }, { "entropy": 0.5306290239095688, "epoch": 1.4291044776119404, "grad_norm": 0.160585418343544, "learning_rate": 0.0002, "loss": 0.5361081957817078, "mean_token_accuracy": 0.7803311944007874, "num_tokens": 6257867.0, "step": 383 }, { "entropy": 0.5401095002889633, "epoch": 1.4328358208955223, "grad_norm": 0.1656486541032791, "learning_rate": 0.0002, "loss": 0.5400689244270325, "mean_token_accuracy": 0.780994102358818, "num_tokens": 6274155.0, "step": 384 }, { "entropy": 0.5327940136194229, "epoch": 1.4365671641791045, "grad_norm": 0.1317523568868637, "learning_rate": 0.0002, "loss": 0.5320010185241699, "mean_token_accuracy": 0.7850325703620911, "num_tokens": 6290558.0, "step": 385 }, { "entropy": 0.5441479384899139, "epoch": 1.4402985074626866, "grad_norm": 0.17623504996299744, "learning_rate": 0.0002, "loss": 0.5384020209312439, "mean_token_accuracy": 0.7846230715513229, "num_tokens": 6306878.0, "step": 386 }, { "entropy": 0.5452490895986557, "epoch": 1.4440298507462686, "grad_norm": 0.16240645945072174, "learning_rate": 0.0002, "loss": 0.5443468689918518, "mean_token_accuracy": 0.7802695333957672, "num_tokens": 6323446.0, "step": 387 }, { "entropy": 0.5221313908696175, "epoch": 1.4477611940298507, "grad_norm": 0.1463281661272049, "learning_rate": 0.0002, "loss": 0.5281410813331604, "mean_token_accuracy": 0.7816678881645203, "num_tokens": 6339949.0, "step": 388 }, { "entropy": 0.548899233341217, "epoch": 1.4514925373134329, "grad_norm": 0.22850677371025085, "learning_rate": 0.0002, "loss": 0.5660842657089233, "mean_token_accuracy": 0.7699355781078339, "num_tokens": 6356385.0, "step": 389 }, { "entropy": 0.5538987964391708, "epoch": 1.455223880597015, "grad_norm": 0.14064767956733704, "learning_rate": 0.0002, "loss": 0.5418739318847656, "mean_token_accuracy": 0.7807578295469284, "num_tokens": 6372804.0, "step": 390 }, { "entropy": 0.5599593967199326, "epoch": 1.458955223880597, "grad_norm": 0.18051759898662567, "learning_rate": 0.0002, "loss": 0.5524702072143555, "mean_token_accuracy": 0.776346430182457, "num_tokens": 6389040.0, "step": 391 }, { "entropy": 0.5202420800924301, "epoch": 1.462686567164179, "grad_norm": 0.14325307309627533, "learning_rate": 0.0002, "loss": 0.519583523273468, "mean_token_accuracy": 0.7894969880580902, "num_tokens": 6405365.0, "step": 392 }, { "entropy": 0.5261730998754501, "epoch": 1.4664179104477613, "grad_norm": 0.1525595486164093, "learning_rate": 0.0002, "loss": 0.5307163596153259, "mean_token_accuracy": 0.7871128022670746, "num_tokens": 6421868.0, "step": 393 }, { "entropy": 0.5307900905609131, "epoch": 1.4701492537313432, "grad_norm": 0.19890250265598297, "learning_rate": 0.0002, "loss": 0.5441185832023621, "mean_token_accuracy": 0.7786047160625458, "num_tokens": 6438616.0, "step": 394 }, { "entropy": 0.5521271824836731, "epoch": 1.4738805970149254, "grad_norm": 0.14049610495567322, "learning_rate": 0.0002, "loss": 0.5551049113273621, "mean_token_accuracy": 0.7755014002323151, "num_tokens": 6455024.0, "step": 395 }, { "entropy": 0.539069190621376, "epoch": 1.4776119402985075, "grad_norm": 0.1545083075761795, "learning_rate": 0.0002, "loss": 0.5353712439537048, "mean_token_accuracy": 0.78336501121521, "num_tokens": 6471293.0, "step": 396 }, { "entropy": 0.5550021678209305, "epoch": 1.4813432835820897, "grad_norm": 0.18578873574733734, "learning_rate": 0.0002, "loss": 0.5472472310066223, "mean_token_accuracy": 0.7796825766563416, "num_tokens": 6487641.0, "step": 397 }, { "entropy": 0.5490831285715103, "epoch": 1.4850746268656716, "grad_norm": 0.1240464299917221, "learning_rate": 0.0002, "loss": 0.5474961400032043, "mean_token_accuracy": 0.7774344980716705, "num_tokens": 6503822.0, "step": 398 }, { "entropy": 0.5393417626619339, "epoch": 1.4888059701492538, "grad_norm": 0.1891254484653473, "learning_rate": 0.0002, "loss": 0.5524366497993469, "mean_token_accuracy": 0.7745344191789627, "num_tokens": 6520011.0, "step": 399 }, { "entropy": 0.513459712266922, "epoch": 1.4925373134328357, "grad_norm": 0.2974206805229187, "learning_rate": 0.0002, "loss": 0.5200244188308716, "mean_token_accuracy": 0.7888158708810806, "num_tokens": 6536205.0, "step": 400 }, { "entropy": 0.5186173021793365, "epoch": 1.4962686567164178, "grad_norm": 0.15046866238117218, "learning_rate": 0.0002, "loss": 0.5207955241203308, "mean_token_accuracy": 0.7867278605699539, "num_tokens": 6552440.0, "step": 401 }, { "entropy": 0.5499364733695984, "epoch": 1.5, "grad_norm": 0.4020411968231201, "learning_rate": 0.0002, "loss": 0.5530084371566772, "mean_token_accuracy": 0.7796496748924255, "num_tokens": 6568961.0, "step": 402 }, { "entropy": 0.5427668243646622, "epoch": 1.5037313432835822, "grad_norm": 0.11850416660308838, "learning_rate": 0.0002, "loss": 0.533820629119873, "mean_token_accuracy": 0.7840306162834167, "num_tokens": 6585550.0, "step": 403 }, { "entropy": 0.5325792133808136, "epoch": 1.5074626865671643, "grad_norm": 0.18302492797374725, "learning_rate": 0.0002, "loss": 0.534012496471405, "mean_token_accuracy": 0.7814914137125015, "num_tokens": 6601942.0, "step": 404 }, { "entropy": 0.5354548320174217, "epoch": 1.5111940298507462, "grad_norm": 0.15404394268989563, "learning_rate": 0.0002, "loss": 0.538587749004364, "mean_token_accuracy": 0.7822761088609695, "num_tokens": 6618440.0, "step": 405 }, { "entropy": 0.5441371351480484, "epoch": 1.5149253731343284, "grad_norm": 0.13057801127433777, "learning_rate": 0.0002, "loss": 0.542742908000946, "mean_token_accuracy": 0.7798959463834763, "num_tokens": 6634866.0, "step": 406 }, { "entropy": 0.542233407497406, "epoch": 1.5186567164179103, "grad_norm": 0.14343421161174774, "learning_rate": 0.0002, "loss": 0.5447250008583069, "mean_token_accuracy": 0.7802796810865402, "num_tokens": 6651150.0, "step": 407 }, { "entropy": 0.5407950282096863, "epoch": 1.5223880597014925, "grad_norm": 0.14996956288814545, "learning_rate": 0.0002, "loss": 0.5389798879623413, "mean_token_accuracy": 0.7809374779462814, "num_tokens": 6667674.0, "step": 408 }, { "entropy": 0.5433390289545059, "epoch": 1.5261194029850746, "grad_norm": 0.1311637908220291, "learning_rate": 0.0002, "loss": 0.5383128523826599, "mean_token_accuracy": 0.7790700197219849, "num_tokens": 6684068.0, "step": 409 }, { "entropy": 0.527245432138443, "epoch": 1.5298507462686568, "grad_norm": 0.16411243379116058, "learning_rate": 0.0002, "loss": 0.5319215059280396, "mean_token_accuracy": 0.7840736508369446, "num_tokens": 6700752.0, "step": 410 }, { "entropy": 0.5146678760647774, "epoch": 1.533582089552239, "grad_norm": 0.1607578545808792, "learning_rate": 0.0002, "loss": 0.5198485851287842, "mean_token_accuracy": 0.7882288843393326, "num_tokens": 6716857.0, "step": 411 }, { "entropy": 0.5308386236429214, "epoch": 1.537313432835821, "grad_norm": 0.166807621717453, "learning_rate": 0.0002, "loss": 0.5419335961341858, "mean_token_accuracy": 0.7812209129333496, "num_tokens": 6732981.0, "step": 412 }, { "entropy": 0.5488767176866531, "epoch": 1.5410447761194028, "grad_norm": 0.14006908237934113, "learning_rate": 0.0002, "loss": 0.5508508086204529, "mean_token_accuracy": 0.7769163995981216, "num_tokens": 6749307.0, "step": 413 }, { "entropy": 0.5410346239805222, "epoch": 1.544776119402985, "grad_norm": 0.13224521279335022, "learning_rate": 0.0002, "loss": 0.5321468710899353, "mean_token_accuracy": 0.7842406779527664, "num_tokens": 6765688.0, "step": 414 }, { "entropy": 0.5605396628379822, "epoch": 1.5485074626865671, "grad_norm": 0.1389547735452652, "learning_rate": 0.0002, "loss": 0.5529029369354248, "mean_token_accuracy": 0.7745459079742432, "num_tokens": 6782015.0, "step": 415 }, { "entropy": 0.5347004532814026, "epoch": 1.5522388059701493, "grad_norm": 0.1258436143398285, "learning_rate": 0.0002, "loss": 0.5315224528312683, "mean_token_accuracy": 0.7851130068302155, "num_tokens": 6798206.0, "step": 416 }, { "entropy": 0.5425236374139786, "epoch": 1.5559701492537314, "grad_norm": 0.16927701234817505, "learning_rate": 0.0002, "loss": 0.5464774370193481, "mean_token_accuracy": 0.7801399230957031, "num_tokens": 6814725.0, "step": 417 }, { "entropy": 0.5187622159719467, "epoch": 1.5597014925373134, "grad_norm": 0.13987842202186584, "learning_rate": 0.0002, "loss": 0.5246447920799255, "mean_token_accuracy": 0.7894206643104553, "num_tokens": 6831232.0, "step": 418 }, { "entropy": 0.5316571593284607, "epoch": 1.5634328358208955, "grad_norm": 0.15650241076946259, "learning_rate": 0.0002, "loss": 0.538478434085846, "mean_token_accuracy": 0.7800242900848389, "num_tokens": 6847650.0, "step": 419 }, { "entropy": 0.5246055871248245, "epoch": 1.5671641791044775, "grad_norm": 0.13061542809009552, "learning_rate": 0.0002, "loss": 0.5321829319000244, "mean_token_accuracy": 0.7838113605976105, "num_tokens": 6864019.0, "step": 420 }, { "entropy": 0.5212045907974243, "epoch": 1.5708955223880596, "grad_norm": 0.13846127688884735, "learning_rate": 0.0002, "loss": 0.5200290679931641, "mean_token_accuracy": 0.7883654683828354, "num_tokens": 6880204.0, "step": 421 }, { "entropy": 0.542250782251358, "epoch": 1.5746268656716418, "grad_norm": 0.12467647343873978, "learning_rate": 0.0002, "loss": 0.5380762815475464, "mean_token_accuracy": 0.7811442613601685, "num_tokens": 6896430.0, "step": 422 }, { "entropy": 0.5405887067317963, "epoch": 1.578358208955224, "grad_norm": 0.1305769383907318, "learning_rate": 0.0002, "loss": 0.5357393026351929, "mean_token_accuracy": 0.7828609347343445, "num_tokens": 6912971.0, "step": 423 }, { "entropy": 0.5287357568740845, "epoch": 1.582089552238806, "grad_norm": 0.17313086986541748, "learning_rate": 0.0002, "loss": 0.5329744219779968, "mean_token_accuracy": 0.782240018248558, "num_tokens": 6929204.0, "step": 424 }, { "entropy": 0.5423530340194702, "epoch": 1.585820895522388, "grad_norm": 0.1359935700893402, "learning_rate": 0.0002, "loss": 0.5377368330955505, "mean_token_accuracy": 0.7828396558761597, "num_tokens": 6945791.0, "step": 425 }, { "entropy": 0.5215180069208145, "epoch": 1.5895522388059702, "grad_norm": 0.1547544300556183, "learning_rate": 0.0002, "loss": 0.5314459800720215, "mean_token_accuracy": 0.7837548702955246, "num_tokens": 6961875.0, "step": 426 }, { "entropy": 0.5231145992875099, "epoch": 1.5932835820895521, "grad_norm": 0.13578681647777557, "learning_rate": 0.0002, "loss": 0.5277360677719116, "mean_token_accuracy": 0.7842715680599213, "num_tokens": 6978198.0, "step": 427 }, { "entropy": 0.5486603379249573, "epoch": 1.5970149253731343, "grad_norm": 0.15189069509506226, "learning_rate": 0.0002, "loss": 0.549156129360199, "mean_token_accuracy": 0.7768438756465912, "num_tokens": 6994444.0, "step": 428 }, { "entropy": 0.54026959836483, "epoch": 1.6007462686567164, "grad_norm": 0.13162657618522644, "learning_rate": 0.0002, "loss": 0.5345808863639832, "mean_token_accuracy": 0.7827611416578293, "num_tokens": 7010461.0, "step": 429 }, { "entropy": 0.53890560567379, "epoch": 1.6044776119402986, "grad_norm": 0.133237823843956, "learning_rate": 0.0002, "loss": 0.5350275635719299, "mean_token_accuracy": 0.7830039262771606, "num_tokens": 7026813.0, "step": 430 }, { "entropy": 0.5518313944339752, "epoch": 1.6082089552238807, "grad_norm": 0.14963583648204803, "learning_rate": 0.0002, "loss": 0.5478031039237976, "mean_token_accuracy": 0.7780435979366302, "num_tokens": 7043301.0, "step": 431 }, { "entropy": 0.5414951294660568, "epoch": 1.6119402985074627, "grad_norm": 0.12772321701049805, "learning_rate": 0.0002, "loss": 0.5401883125305176, "mean_token_accuracy": 0.782444417476654, "num_tokens": 7059646.0, "step": 432 }, { "entropy": 0.5394223630428314, "epoch": 1.6156716417910446, "grad_norm": 0.13813580572605133, "learning_rate": 0.0002, "loss": 0.5405031442642212, "mean_token_accuracy": 0.7798984050750732, "num_tokens": 7076271.0, "step": 433 }, { "entropy": 0.5429421365261078, "epoch": 1.6194029850746268, "grad_norm": 0.15601246058940887, "learning_rate": 0.0002, "loss": 0.5516016483306885, "mean_token_accuracy": 0.775258257985115, "num_tokens": 7092578.0, "step": 434 }, { "entropy": 0.5521349459886551, "epoch": 1.623134328358209, "grad_norm": 0.14428818225860596, "learning_rate": 0.0002, "loss": 0.5492872595787048, "mean_token_accuracy": 0.7768293768167496, "num_tokens": 7109046.0, "step": 435 }, { "entropy": 0.5354936867952347, "epoch": 1.626865671641791, "grad_norm": 0.15073303878307343, "learning_rate": 0.0002, "loss": 0.5428034663200378, "mean_token_accuracy": 0.780666396021843, "num_tokens": 7125466.0, "step": 436 }, { "entropy": 0.5443413555622101, "epoch": 1.6305970149253732, "grad_norm": 0.14848864078521729, "learning_rate": 0.0002, "loss": 0.5486512780189514, "mean_token_accuracy": 0.7806312739849091, "num_tokens": 7141898.0, "step": 437 }, { "entropy": 0.5337215662002563, "epoch": 1.6343283582089554, "grad_norm": 0.15302547812461853, "learning_rate": 0.0002, "loss": 0.5392454862594604, "mean_token_accuracy": 0.7822044789791107, "num_tokens": 7158167.0, "step": 438 }, { "entropy": 0.5586158037185669, "epoch": 1.6380597014925373, "grad_norm": 0.17401555180549622, "learning_rate": 0.0002, "loss": 0.557881772518158, "mean_token_accuracy": 0.7756661027669907, "num_tokens": 7174477.0, "step": 439 }, { "entropy": 0.5406471788883209, "epoch": 1.6417910447761193, "grad_norm": 0.14608509838581085, "learning_rate": 0.0002, "loss": 0.5353439450263977, "mean_token_accuracy": 0.7812080383300781, "num_tokens": 7190694.0, "step": 440 }, { "entropy": 0.5237606167793274, "epoch": 1.6455223880597014, "grad_norm": 0.1542704850435257, "learning_rate": 0.0002, "loss": 0.5290042161941528, "mean_token_accuracy": 0.7855716645717621, "num_tokens": 7207153.0, "step": 441 }, { "entropy": 0.5269318968057632, "epoch": 1.6492537313432836, "grad_norm": 0.1659008413553238, "learning_rate": 0.0002, "loss": 0.530527651309967, "mean_token_accuracy": 0.7846795618534088, "num_tokens": 7223109.0, "step": 442 }, { "entropy": 0.5195682793855667, "epoch": 1.6529850746268657, "grad_norm": 0.14120091497898102, "learning_rate": 0.0002, "loss": 0.5263478755950928, "mean_token_accuracy": 0.7843965291976929, "num_tokens": 7239499.0, "step": 443 }, { "entropy": 0.5257822424173355, "epoch": 1.6567164179104479, "grad_norm": 0.1643773764371872, "learning_rate": 0.0002, "loss": 0.5316389203071594, "mean_token_accuracy": 0.7851150333881378, "num_tokens": 7255730.0, "step": 444 }, { "entropy": 0.5377429872751236, "epoch": 1.6604477611940298, "grad_norm": 0.14926724135875702, "learning_rate": 0.0002, "loss": 0.5427424907684326, "mean_token_accuracy": 0.7824969440698624, "num_tokens": 7272167.0, "step": 445 }, { "entropy": 0.538849800825119, "epoch": 1.664179104477612, "grad_norm": 0.13225945830345154, "learning_rate": 0.0002, "loss": 0.5327820181846619, "mean_token_accuracy": 0.783388078212738, "num_tokens": 7288421.0, "step": 446 }, { "entropy": 0.5399289578199387, "epoch": 1.667910447761194, "grad_norm": 0.1308569759130478, "learning_rate": 0.0002, "loss": 0.5292877554893494, "mean_token_accuracy": 0.7878285944461823, "num_tokens": 7304880.0, "step": 447 }, { "entropy": 0.5436895489692688, "epoch": 1.671641791044776, "grad_norm": 0.16895835101604462, "learning_rate": 0.0002, "loss": 0.5451297163963318, "mean_token_accuracy": 0.7789509892463684, "num_tokens": 7321256.0, "step": 448 }, { "entropy": 0.5504481792449951, "epoch": 1.6753731343283582, "grad_norm": 0.13614578545093536, "learning_rate": 0.0002, "loss": 0.5539385080337524, "mean_token_accuracy": 0.7752430438995361, "num_tokens": 7337589.0, "step": 449 }, { "entropy": 0.5513797849416733, "epoch": 1.6791044776119404, "grad_norm": 0.15195772051811218, "learning_rate": 0.0002, "loss": 0.5530341267585754, "mean_token_accuracy": 0.7749580442905426, "num_tokens": 7353883.0, "step": 450 }, { "entropy": 0.5413680523633957, "epoch": 1.6828358208955225, "grad_norm": 0.15170808136463165, "learning_rate": 0.0002, "loss": 0.543311357498169, "mean_token_accuracy": 0.7790023237466812, "num_tokens": 7370160.0, "step": 451 }, { "entropy": 0.5648334920406342, "epoch": 1.6865671641791045, "grad_norm": 0.1327073723077774, "learning_rate": 0.0002, "loss": 0.5623019933700562, "mean_token_accuracy": 0.7708193957805634, "num_tokens": 7386478.0, "step": 452 }, { "entropy": 0.517740860581398, "epoch": 1.6902985074626866, "grad_norm": 0.13745424151420593, "learning_rate": 0.0002, "loss": 0.5170730352401733, "mean_token_accuracy": 0.7882706969976425, "num_tokens": 7402645.0, "step": 453 }, { "entropy": 0.5524223297834396, "epoch": 1.6940298507462686, "grad_norm": 0.1598864197731018, "learning_rate": 0.0002, "loss": 0.5490080714225769, "mean_token_accuracy": 0.7766116112470627, "num_tokens": 7419124.0, "step": 454 }, { "entropy": 0.5260176658630371, "epoch": 1.6977611940298507, "grad_norm": 0.13257424533367157, "learning_rate": 0.0002, "loss": 0.5297276973724365, "mean_token_accuracy": 0.7853291928768158, "num_tokens": 7435508.0, "step": 455 }, { "entropy": 0.5325040817260742, "epoch": 1.7014925373134329, "grad_norm": 0.18319375813007355, "learning_rate": 0.0002, "loss": 0.543100118637085, "mean_token_accuracy": 0.7803790718317032, "num_tokens": 7451755.0, "step": 456 }, { "entropy": 0.5267694145441055, "epoch": 1.705223880597015, "grad_norm": 0.1554267704486847, "learning_rate": 0.0002, "loss": 0.5240468978881836, "mean_token_accuracy": 0.7871411740779877, "num_tokens": 7467919.0, "step": 457 }, { "entropy": 0.5426032692193985, "epoch": 1.7089552238805972, "grad_norm": 0.13706867396831512, "learning_rate": 0.0002, "loss": 0.5412613749504089, "mean_token_accuracy": 0.778879314661026, "num_tokens": 7484289.0, "step": 458 }, { "entropy": 0.5340660065412521, "epoch": 1.712686567164179, "grad_norm": 0.16726213693618774, "learning_rate": 0.0002, "loss": 0.5392245650291443, "mean_token_accuracy": 0.7805332094430923, "num_tokens": 7500611.0, "step": 459 }, { "entropy": 0.5553819835186005, "epoch": 1.716417910447761, "grad_norm": 0.16255703568458557, "learning_rate": 0.0002, "loss": 0.5517896413803101, "mean_token_accuracy": 0.7731162905693054, "num_tokens": 7517206.0, "step": 460 }, { "entropy": 0.5343479365110397, "epoch": 1.7201492537313432, "grad_norm": 0.13407304883003235, "learning_rate": 0.0002, "loss": 0.5380552411079407, "mean_token_accuracy": 0.778910294175148, "num_tokens": 7533459.0, "step": 461 }, { "entropy": 0.5323963612318039, "epoch": 1.7238805970149254, "grad_norm": 0.1650952398777008, "learning_rate": 0.0002, "loss": 0.5314269661903381, "mean_token_accuracy": 0.7864300310611725, "num_tokens": 7549589.0, "step": 462 }, { "entropy": 0.5433520078659058, "epoch": 1.7276119402985075, "grad_norm": 0.1429263949394226, "learning_rate": 0.0002, "loss": 0.540563702583313, "mean_token_accuracy": 0.7819092869758606, "num_tokens": 7566158.0, "step": 463 }, { "entropy": 0.5436968952417374, "epoch": 1.7313432835820897, "grad_norm": 0.14086155593395233, "learning_rate": 0.0002, "loss": 0.5398205518722534, "mean_token_accuracy": 0.7809909284114838, "num_tokens": 7582422.0, "step": 464 }, { "entropy": 0.5534437447786331, "epoch": 1.7350746268656716, "grad_norm": 0.14618556201457977, "learning_rate": 0.0002, "loss": 0.5561552047729492, "mean_token_accuracy": 0.7724596560001373, "num_tokens": 7598771.0, "step": 465 }, { "entropy": 0.5396170765161514, "epoch": 1.7388059701492538, "grad_norm": 0.1190977543592453, "learning_rate": 0.0002, "loss": 0.5389412641525269, "mean_token_accuracy": 0.7812270224094391, "num_tokens": 7615418.0, "step": 466 }, { "entropy": 0.5390318781137466, "epoch": 1.7425373134328357, "grad_norm": 0.15372450649738312, "learning_rate": 0.0002, "loss": 0.5436992645263672, "mean_token_accuracy": 0.7814512252807617, "num_tokens": 7631840.0, "step": 467 }, { "entropy": 0.5206413939595222, "epoch": 1.7462686567164178, "grad_norm": 0.13495191931724548, "learning_rate": 0.0002, "loss": 0.5253979563713074, "mean_token_accuracy": 0.7877579927444458, "num_tokens": 7648131.0, "step": 468 }, { "entropy": 0.5223769247531891, "epoch": 1.75, "grad_norm": 0.15382781624794006, "learning_rate": 0.0002, "loss": 0.5363397002220154, "mean_token_accuracy": 0.7828211337327957, "num_tokens": 7664453.0, "step": 469 }, { "entropy": 0.5333149433135986, "epoch": 1.7537313432835822, "grad_norm": 0.13387013971805573, "learning_rate": 0.0002, "loss": 0.5351001620292664, "mean_token_accuracy": 0.7830037176609039, "num_tokens": 7680781.0, "step": 470 }, { "entropy": 0.5429620742797852, "epoch": 1.7574626865671643, "grad_norm": 0.13604114949703217, "learning_rate": 0.0002, "loss": 0.5358593463897705, "mean_token_accuracy": 0.7837422788143158, "num_tokens": 7697310.0, "step": 471 }, { "entropy": 0.5731407701969147, "epoch": 1.7611940298507462, "grad_norm": 0.1410369724035263, "learning_rate": 0.0002, "loss": 0.5635945796966553, "mean_token_accuracy": 0.7718209028244019, "num_tokens": 7713558.0, "step": 472 }, { "entropy": 0.5679037570953369, "epoch": 1.7649253731343284, "grad_norm": 0.14904598891735077, "learning_rate": 0.0002, "loss": 0.5656334161758423, "mean_token_accuracy": 0.7714496552944183, "num_tokens": 7730117.0, "step": 473 }, { "entropy": 0.5429675132036209, "epoch": 1.7686567164179103, "grad_norm": 0.1564645618200302, "learning_rate": 0.0002, "loss": 0.5466417670249939, "mean_token_accuracy": 0.7782974392175674, "num_tokens": 7746633.0, "step": 474 }, { "entropy": 0.5362623929977417, "epoch": 1.7723880597014925, "grad_norm": 0.14919337630271912, "learning_rate": 0.0002, "loss": 0.5442617535591125, "mean_token_accuracy": 0.778479665517807, "num_tokens": 7762813.0, "step": 475 }, { "entropy": 0.5283475816249847, "epoch": 1.7761194029850746, "grad_norm": 0.14363890886306763, "learning_rate": 0.0002, "loss": 0.5296353101730347, "mean_token_accuracy": 0.7861494719982147, "num_tokens": 7778873.0, "step": 476 }, { "entropy": 0.5252759754657745, "epoch": 1.7798507462686568, "grad_norm": 0.17697355151176453, "learning_rate": 0.0002, "loss": 0.5262605547904968, "mean_token_accuracy": 0.7861870229244232, "num_tokens": 7795362.0, "step": 477 }, { "entropy": 0.5341710150241852, "epoch": 1.783582089552239, "grad_norm": 0.13914838433265686, "learning_rate": 0.0002, "loss": 0.5387526750564575, "mean_token_accuracy": 0.7779033482074738, "num_tokens": 7811639.0, "step": 478 }, { "entropy": 0.5409186482429504, "epoch": 1.787313432835821, "grad_norm": 0.14785298705101013, "learning_rate": 0.0002, "loss": 0.5428853034973145, "mean_token_accuracy": 0.7777274399995804, "num_tokens": 7828116.0, "step": 479 }, { "entropy": 0.5548221617937088, "epoch": 1.7910447761194028, "grad_norm": 0.1457030326128006, "learning_rate": 0.0002, "loss": 0.5512540340423584, "mean_token_accuracy": 0.7757317572832108, "num_tokens": 7844457.0, "step": 480 }, { "entropy": 0.5340719819068909, "epoch": 1.794776119402985, "grad_norm": 0.13429081439971924, "learning_rate": 0.0002, "loss": 0.5289599299430847, "mean_token_accuracy": 0.7837049216032028, "num_tokens": 7860611.0, "step": 481 }, { "entropy": 0.5379914194345474, "epoch": 1.7985074626865671, "grad_norm": 0.13006342947483063, "learning_rate": 0.0002, "loss": 0.5363917350769043, "mean_token_accuracy": 0.7821543663740158, "num_tokens": 7876837.0, "step": 482 }, { "entropy": 0.5481665432453156, "epoch": 1.8022388059701493, "grad_norm": 0.14950798451900482, "learning_rate": 0.0002, "loss": 0.5466524362564087, "mean_token_accuracy": 0.7806346863508224, "num_tokens": 7893152.0, "step": 483 }, { "entropy": 0.5473506450653076, "epoch": 1.8059701492537314, "grad_norm": 0.14105349779129028, "learning_rate": 0.0002, "loss": 0.5428904891014099, "mean_token_accuracy": 0.778725266456604, "num_tokens": 7909608.0, "step": 484 }, { "entropy": 0.5446173995733261, "epoch": 1.8097014925373134, "grad_norm": 0.15689605474472046, "learning_rate": 0.0002, "loss": 0.5529049634933472, "mean_token_accuracy": 0.7787118703126907, "num_tokens": 7926042.0, "step": 485 }, { "entropy": 0.5260195583105087, "epoch": 1.8134328358208955, "grad_norm": 0.15744158625602722, "learning_rate": 0.0002, "loss": 0.5373381972312927, "mean_token_accuracy": 0.7849460244178772, "num_tokens": 7942407.0, "step": 486 }, { "entropy": 0.5418536812067032, "epoch": 1.8171641791044775, "grad_norm": 0.14664271473884583, "learning_rate": 0.0002, "loss": 0.5412867069244385, "mean_token_accuracy": 0.7811890542507172, "num_tokens": 7958995.0, "step": 487 }, { "entropy": 0.5519318580627441, "epoch": 1.8208955223880596, "grad_norm": 0.15384623408317566, "learning_rate": 0.0002, "loss": 0.5512985587120056, "mean_token_accuracy": 0.7755472809076309, "num_tokens": 7975615.0, "step": 488 }, { "entropy": 0.5366766899824142, "epoch": 1.8246268656716418, "grad_norm": 0.17651750147342682, "learning_rate": 0.0002, "loss": 0.5435804128646851, "mean_token_accuracy": 0.7781522572040558, "num_tokens": 7991932.0, "step": 489 }, { "entropy": 0.5274553596973419, "epoch": 1.828358208955224, "grad_norm": 0.13903461396694183, "learning_rate": 0.0002, "loss": 0.5304480195045471, "mean_token_accuracy": 0.7822371274232864, "num_tokens": 8008268.0, "step": 490 }, { "entropy": 0.5359211266040802, "epoch": 1.832089552238806, "grad_norm": 0.1657918393611908, "learning_rate": 0.0002, "loss": 0.5305460095405579, "mean_token_accuracy": 0.7854030579328537, "num_tokens": 8024551.0, "step": 491 }, { "entropy": 0.5484016537666321, "epoch": 1.835820895522388, "grad_norm": 0.16684608161449432, "learning_rate": 0.0002, "loss": 0.5452835559844971, "mean_token_accuracy": 0.7772976756095886, "num_tokens": 8040823.0, "step": 492 }, { "entropy": 0.5474873930215836, "epoch": 1.8395522388059702, "grad_norm": 0.151128351688385, "learning_rate": 0.0002, "loss": 0.5493411421775818, "mean_token_accuracy": 0.7793968617916107, "num_tokens": 8057509.0, "step": 493 }, { "entropy": 0.526735208928585, "epoch": 1.8432835820895521, "grad_norm": 0.1347130686044693, "learning_rate": 0.0002, "loss": 0.5294213891029358, "mean_token_accuracy": 0.783684104681015, "num_tokens": 8073599.0, "step": 494 }, { "entropy": 0.5525032877922058, "epoch": 1.8470149253731343, "grad_norm": 0.14043265581130981, "learning_rate": 0.0002, "loss": 0.5447618961334229, "mean_token_accuracy": 0.7783424258232117, "num_tokens": 8089819.0, "step": 495 }, { "entropy": 0.5403036177158356, "epoch": 1.8507462686567164, "grad_norm": 0.13459749519824982, "learning_rate": 0.0002, "loss": 0.543724775314331, "mean_token_accuracy": 0.7801337391138077, "num_tokens": 8106320.0, "step": 496 }, { "entropy": 0.5121283084154129, "epoch": 1.8544776119402986, "grad_norm": 0.13925622403621674, "learning_rate": 0.0002, "loss": 0.5182461142539978, "mean_token_accuracy": 0.7902320176362991, "num_tokens": 8122590.0, "step": 497 }, { "entropy": 0.5341223925352097, "epoch": 1.8582089552238807, "grad_norm": 0.1333732157945633, "learning_rate": 0.0002, "loss": 0.5352264642715454, "mean_token_accuracy": 0.7827399671077728, "num_tokens": 8138922.0, "step": 498 }, { "entropy": 0.5457236468791962, "epoch": 1.8619402985074627, "grad_norm": 0.13741785287857056, "learning_rate": 0.0002, "loss": 0.5454993844032288, "mean_token_accuracy": 0.7798125892877579, "num_tokens": 8155306.0, "step": 499 }, { "entropy": 0.5553978830575943, "epoch": 1.8656716417910446, "grad_norm": 0.12911130487918854, "learning_rate": 0.0002, "loss": 0.5489829778671265, "mean_token_accuracy": 0.7798224687576294, "num_tokens": 8171560.0, "step": 500 }, { "entropy": 0.5366699695587158, "epoch": 1.8694029850746268, "grad_norm": 0.14433807134628296, "learning_rate": 0.0002, "loss": 0.5305231213569641, "mean_token_accuracy": 0.7864150553941727, "num_tokens": 8188037.0, "step": 501 }, { "entropy": 0.5387077182531357, "epoch": 1.873134328358209, "grad_norm": 0.14472654461860657, "learning_rate": 0.0002, "loss": 0.5373876094818115, "mean_token_accuracy": 0.7849652767181396, "num_tokens": 8204628.0, "step": 502 }, { "entropy": 0.5305859744548798, "epoch": 1.876865671641791, "grad_norm": 0.16016830503940582, "learning_rate": 0.0002, "loss": 0.5409325361251831, "mean_token_accuracy": 0.7806791961193085, "num_tokens": 8220902.0, "step": 503 }, { "entropy": 0.5299341380596161, "epoch": 1.8805970149253732, "grad_norm": 0.15263962745666504, "learning_rate": 0.0002, "loss": 0.5375992655754089, "mean_token_accuracy": 0.781559944152832, "num_tokens": 8237185.0, "step": 504 }, { "entropy": 0.5437009185552597, "epoch": 1.8843283582089554, "grad_norm": 0.15553534030914307, "learning_rate": 0.0002, "loss": 0.5443401336669922, "mean_token_accuracy": 0.7812230437994003, "num_tokens": 8253677.0, "step": 505 }, { "entropy": 0.5481602549552917, "epoch": 1.8880597014925373, "grad_norm": 0.14724990725517273, "learning_rate": 0.0002, "loss": 0.540518581867218, "mean_token_accuracy": 0.7784458547830582, "num_tokens": 8270080.0, "step": 506 }, { "entropy": 0.5473358333110809, "epoch": 1.8917910447761193, "grad_norm": 0.13046710193157196, "learning_rate": 0.0002, "loss": 0.5379562973976135, "mean_token_accuracy": 0.7840885818004608, "num_tokens": 8286417.0, "step": 507 }, { "entropy": 0.5339422821998596, "epoch": 1.8955223880597014, "grad_norm": 0.11970847100019455, "learning_rate": 0.0002, "loss": 0.531002402305603, "mean_token_accuracy": 0.7831601500511169, "num_tokens": 8302558.0, "step": 508 }, { "entropy": 0.5296764224767685, "epoch": 1.8992537313432836, "grad_norm": 0.1354552060365677, "learning_rate": 0.0002, "loss": 0.5331873893737793, "mean_token_accuracy": 0.7870133370161057, "num_tokens": 8318741.0, "step": 509 }, { "entropy": 0.52724589407444, "epoch": 1.9029850746268657, "grad_norm": 0.1636589914560318, "learning_rate": 0.0002, "loss": 0.5382875800132751, "mean_token_accuracy": 0.7812641561031342, "num_tokens": 8335074.0, "step": 510 }, { "entropy": 0.5487582981586456, "epoch": 1.9067164179104479, "grad_norm": 0.15405811369419098, "learning_rate": 0.0002, "loss": 0.5569562315940857, "mean_token_accuracy": 0.775174006819725, "num_tokens": 8351357.0, "step": 511 }, { "entropy": 0.5199541226029396, "epoch": 1.9104477611940298, "grad_norm": 0.13167649507522583, "learning_rate": 0.0002, "loss": 0.5217406749725342, "mean_token_accuracy": 0.788948193192482, "num_tokens": 8367452.0, "step": 512 }, { "entropy": 0.5357903987169266, "epoch": 1.914179104477612, "grad_norm": 0.12568941712379456, "learning_rate": 0.0002, "loss": 0.5307230949401855, "mean_token_accuracy": 0.7828755676746368, "num_tokens": 8383786.0, "step": 513 }, { "entropy": 0.5289642512798309, "epoch": 1.917910447761194, "grad_norm": 0.130939319729805, "learning_rate": 0.0002, "loss": 0.5241107940673828, "mean_token_accuracy": 0.786993533372879, "num_tokens": 8400005.0, "step": 514 }, { "entropy": 0.5548314303159714, "epoch": 1.921641791044776, "grad_norm": 0.1255977749824524, "learning_rate": 0.0002, "loss": 0.5506734848022461, "mean_token_accuracy": 0.7779561877250671, "num_tokens": 8416502.0, "step": 515 }, { "entropy": 0.5388498157262802, "epoch": 1.9253731343283582, "grad_norm": 0.13658908009529114, "learning_rate": 0.0002, "loss": 0.5440253615379333, "mean_token_accuracy": 0.7802704125642776, "num_tokens": 8432771.0, "step": 516 }, { "entropy": 0.5444848537445068, "epoch": 1.9291044776119404, "grad_norm": 0.1361331045627594, "learning_rate": 0.0002, "loss": 0.5464693903923035, "mean_token_accuracy": 0.7777076661586761, "num_tokens": 8449261.0, "step": 517 }, { "entropy": 0.545665979385376, "epoch": 1.9328358208955225, "grad_norm": 0.1317397505044937, "learning_rate": 0.0002, "loss": 0.5444501638412476, "mean_token_accuracy": 0.7814345061779022, "num_tokens": 8465832.0, "step": 518 }, { "entropy": 0.5405286103487015, "epoch": 1.9365671641791045, "grad_norm": 0.13252875208854675, "learning_rate": 0.0002, "loss": 0.5404050946235657, "mean_token_accuracy": 0.780963346362114, "num_tokens": 8482176.0, "step": 519 }, { "entropy": 0.5433270484209061, "epoch": 1.9402985074626866, "grad_norm": 0.13105268776416779, "learning_rate": 0.0002, "loss": 0.5479311943054199, "mean_token_accuracy": 0.7770702540874481, "num_tokens": 8498438.0, "step": 520 }, { "entropy": 0.5341716408729553, "epoch": 1.9440298507462686, "grad_norm": 0.14269208908081055, "learning_rate": 0.0002, "loss": 0.535066545009613, "mean_token_accuracy": 0.7825455218553543, "num_tokens": 8514674.0, "step": 521 }, { "entropy": 0.5395411849021912, "epoch": 1.9477611940298507, "grad_norm": 0.13277186453342438, "learning_rate": 0.0002, "loss": 0.5376089215278625, "mean_token_accuracy": 0.7824221551418304, "num_tokens": 8530963.0, "step": 522 }, { "entropy": 0.5529618561267853, "epoch": 1.9514925373134329, "grad_norm": 0.1381501704454422, "learning_rate": 0.0002, "loss": 0.5493215918540955, "mean_token_accuracy": 0.779175415635109, "num_tokens": 8547488.0, "step": 523 }, { "entropy": 0.5260922610759735, "epoch": 1.955223880597015, "grad_norm": 0.1598714143037796, "learning_rate": 0.0002, "loss": 0.5309720039367676, "mean_token_accuracy": 0.7842647433280945, "num_tokens": 8564003.0, "step": 524 }, { "entropy": 0.5258769541978836, "epoch": 1.9589552238805972, "grad_norm": 0.1397145837545395, "learning_rate": 0.0002, "loss": 0.533185601234436, "mean_token_accuracy": 0.7819601446390152, "num_tokens": 8580280.0, "step": 525 }, { "entropy": 0.5250103250145912, "epoch": 1.962686567164179, "grad_norm": 0.19406840205192566, "learning_rate": 0.0002, "loss": 0.5373009443283081, "mean_token_accuracy": 0.7827760279178619, "num_tokens": 8596181.0, "step": 526 }, { "entropy": 0.556450217962265, "epoch": 1.966417910447761, "grad_norm": 0.13848020136356354, "learning_rate": 0.0002, "loss": 0.5526891946792603, "mean_token_accuracy": 0.7767400592565536, "num_tokens": 8612545.0, "step": 527 }, { "entropy": 0.5524493604898453, "epoch": 1.9701492537313432, "grad_norm": 0.13262905180454254, "learning_rate": 0.0002, "loss": 0.5456893444061279, "mean_token_accuracy": 0.7794637978076935, "num_tokens": 8628708.0, "step": 528 }, { "entropy": 0.5483785569667816, "epoch": 1.9738805970149254, "grad_norm": 0.13305608928203583, "learning_rate": 0.0002, "loss": 0.5419108271598816, "mean_token_accuracy": 0.7776815295219421, "num_tokens": 8645353.0, "step": 529 }, { "entropy": 0.5357464104890823, "epoch": 1.9776119402985075, "grad_norm": 0.18632404506206512, "learning_rate": 0.0002, "loss": 0.538067102432251, "mean_token_accuracy": 0.7834661602973938, "num_tokens": 8661338.0, "step": 530 }, { "entropy": 0.5424002707004547, "epoch": 1.9813432835820897, "grad_norm": 0.14013341069221497, "learning_rate": 0.0002, "loss": 0.5498412251472473, "mean_token_accuracy": 0.7779710739850998, "num_tokens": 8677558.0, "step": 531 }, { "entropy": 0.5473677217960358, "epoch": 1.9850746268656716, "grad_norm": 0.16677168011665344, "learning_rate": 0.0002, "loss": 0.5508783459663391, "mean_token_accuracy": 0.7754979729652405, "num_tokens": 8693871.0, "step": 532 }, { "entropy": 0.5417899936437607, "epoch": 1.9888059701492538, "grad_norm": 0.13049523532390594, "learning_rate": 0.0002, "loss": 0.5387138724327087, "mean_token_accuracy": 0.7801752388477325, "num_tokens": 8710295.0, "step": 533 }, { "entropy": 0.539973795413971, "epoch": 1.9925373134328357, "grad_norm": 0.13125836849212646, "learning_rate": 0.0002, "loss": 0.5384909510612488, "mean_token_accuracy": 0.7825180888175964, "num_tokens": 8726574.0, "step": 534 }, { "entropy": 0.5503594130277634, "epoch": 1.9962686567164178, "grad_norm": 0.13576547801494598, "learning_rate": 0.0002, "loss": 0.5558905005455017, "mean_token_accuracy": 0.7731243073940277, "num_tokens": 8742903.0, "step": 535 }, { "entropy": 0.5420230776071548, "epoch": 2.0, "grad_norm": 0.13022863864898682, "learning_rate": 0.0002, "loss": 0.5468026399612427, "mean_token_accuracy": 0.7781520336866379, "num_tokens": 8759542.0, "step": 536 }, { "entropy": 0.5381979197263718, "epoch": 2.003731343283582, "grad_norm": 0.14043375849723816, "learning_rate": 0.0002, "loss": 0.527134358882904, "mean_token_accuracy": 0.7864610850811005, "num_tokens": 8775884.0, "step": 537 }, { "entropy": 0.5298552364110947, "epoch": 2.0074626865671643, "grad_norm": 0.15086792409420013, "learning_rate": 0.0002, "loss": 0.525084912776947, "mean_token_accuracy": 0.7869725525379181, "num_tokens": 8792092.0, "step": 538 }, { "entropy": 0.5192188173532486, "epoch": 2.0111940298507465, "grad_norm": 0.19961106777191162, "learning_rate": 0.0002, "loss": 0.5296894907951355, "mean_token_accuracy": 0.7826270759105682, "num_tokens": 8808558.0, "step": 539 }, { "entropy": 0.5123308524489403, "epoch": 2.014925373134328, "grad_norm": 0.19111908972263336, "learning_rate": 0.0002, "loss": 0.5212836265563965, "mean_token_accuracy": 0.789938747882843, "num_tokens": 8824957.0, "step": 540 }, { "entropy": 0.5178431421518326, "epoch": 2.0186567164179103, "grad_norm": 0.19028709828853607, "learning_rate": 0.0002, "loss": 0.5238035917282104, "mean_token_accuracy": 0.7860684394836426, "num_tokens": 8841440.0, "step": 541 }, { "entropy": 0.531784176826477, "epoch": 2.0223880597014925, "grad_norm": 0.15052154660224915, "learning_rate": 0.0002, "loss": 0.5242434144020081, "mean_token_accuracy": 0.7872632443904877, "num_tokens": 8857544.0, "step": 542 }, { "entropy": 0.523473396897316, "epoch": 2.0261194029850746, "grad_norm": 0.16107355058193207, "learning_rate": 0.0002, "loss": 0.5132102966308594, "mean_token_accuracy": 0.7902694642543793, "num_tokens": 8873855.0, "step": 543 }, { "entropy": 0.5190383419394493, "epoch": 2.029850746268657, "grad_norm": 0.1708311289548874, "learning_rate": 0.0002, "loss": 0.5148621797561646, "mean_token_accuracy": 0.7895102798938751, "num_tokens": 8890117.0, "step": 544 }, { "entropy": 0.529280424118042, "epoch": 2.033582089552239, "grad_norm": 0.16680803894996643, "learning_rate": 0.0002, "loss": 0.5307912826538086, "mean_token_accuracy": 0.7853487432003021, "num_tokens": 8906392.0, "step": 545 }, { "entropy": 0.49614501744508743, "epoch": 2.0373134328358207, "grad_norm": 0.1503826081752777, "learning_rate": 0.0002, "loss": 0.5012757182121277, "mean_token_accuracy": 0.7970542311668396, "num_tokens": 8922509.0, "step": 546 }, { "entropy": 0.509469673037529, "epoch": 2.041044776119403, "grad_norm": 0.15220946073532104, "learning_rate": 0.0002, "loss": 0.5193155407905579, "mean_token_accuracy": 0.7900224179029465, "num_tokens": 8938730.0, "step": 547 }, { "entropy": 0.5206529274582863, "epoch": 2.044776119402985, "grad_norm": 0.15667758882045746, "learning_rate": 0.0002, "loss": 0.5237014293670654, "mean_token_accuracy": 0.7895828038454056, "num_tokens": 8955181.0, "step": 548 }, { "entropy": 0.5195223838090897, "epoch": 2.048507462686567, "grad_norm": 0.1412286013364792, "learning_rate": 0.0002, "loss": 0.5065000653266907, "mean_token_accuracy": 0.7948807328939438, "num_tokens": 8971652.0, "step": 549 }, { "entropy": 0.5343464240431786, "epoch": 2.0522388059701493, "grad_norm": 0.17040982842445374, "learning_rate": 0.0002, "loss": 0.5262223482131958, "mean_token_accuracy": 0.7864163517951965, "num_tokens": 8987886.0, "step": 550 }, { "entropy": 0.5151650607585907, "epoch": 2.0559701492537314, "grad_norm": 0.18324047327041626, "learning_rate": 0.0002, "loss": 0.5181486010551453, "mean_token_accuracy": 0.7915034592151642, "num_tokens": 9004065.0, "step": 551 }, { "entropy": 0.5399871617555618, "epoch": 2.0597014925373136, "grad_norm": 0.18549422919750214, "learning_rate": 0.0002, "loss": 0.5452507138252258, "mean_token_accuracy": 0.7797505408525467, "num_tokens": 9020548.0, "step": 552 }, { "entropy": 0.5106882750988007, "epoch": 2.0634328358208953, "grad_norm": 0.18570005893707275, "learning_rate": 0.0002, "loss": 0.5167975425720215, "mean_token_accuracy": 0.7912678271532059, "num_tokens": 9036842.0, "step": 553 }, { "entropy": 0.5242500603199005, "epoch": 2.0671641791044775, "grad_norm": 0.16008509695529938, "learning_rate": 0.0002, "loss": 0.5222814083099365, "mean_token_accuracy": 0.7895151823759079, "num_tokens": 9053207.0, "step": 554 }, { "entropy": 0.5202578157186508, "epoch": 2.0708955223880596, "grad_norm": 0.158061683177948, "learning_rate": 0.0002, "loss": 0.510570228099823, "mean_token_accuracy": 0.7938546240329742, "num_tokens": 9069710.0, "step": 555 }, { "entropy": 0.5159406885504723, "epoch": 2.074626865671642, "grad_norm": 0.1673257201910019, "learning_rate": 0.0002, "loss": 0.5130877494812012, "mean_token_accuracy": 0.7952297329902649, "num_tokens": 9085896.0, "step": 556 }, { "entropy": 0.5333143472671509, "epoch": 2.078358208955224, "grad_norm": 0.1610044240951538, "learning_rate": 0.0002, "loss": 0.534683108329773, "mean_token_accuracy": 0.7838889360427856, "num_tokens": 9102330.0, "step": 557 }, { "entropy": 0.5199142321944237, "epoch": 2.082089552238806, "grad_norm": 0.18822608888149261, "learning_rate": 0.0002, "loss": 0.5304499864578247, "mean_token_accuracy": 0.7855323851108551, "num_tokens": 9118702.0, "step": 558 }, { "entropy": 0.5128015950322151, "epoch": 2.0858208955223883, "grad_norm": 0.16853775084018707, "learning_rate": 0.0002, "loss": 0.5243670344352722, "mean_token_accuracy": 0.7870570570230484, "num_tokens": 9135161.0, "step": 559 }, { "entropy": 0.5174604654312134, "epoch": 2.08955223880597, "grad_norm": 0.1812400370836258, "learning_rate": 0.0002, "loss": 0.5177437663078308, "mean_token_accuracy": 0.7915796935558319, "num_tokens": 9151704.0, "step": 560 }, { "entropy": 0.5173925012350082, "epoch": 2.093283582089552, "grad_norm": 0.1714162975549698, "learning_rate": 0.0002, "loss": 0.5103091597557068, "mean_token_accuracy": 0.7926450222730637, "num_tokens": 9167936.0, "step": 561 }, { "entropy": 0.5338417440652847, "epoch": 2.0970149253731343, "grad_norm": 0.18883411586284637, "learning_rate": 0.0002, "loss": 0.5264431834220886, "mean_token_accuracy": 0.7850892692804337, "num_tokens": 9184252.0, "step": 562 }, { "entropy": 0.5227560251951218, "epoch": 2.1007462686567164, "grad_norm": 0.16431209444999695, "learning_rate": 0.0002, "loss": 0.5194032192230225, "mean_token_accuracy": 0.7891248762607574, "num_tokens": 9200663.0, "step": 563 }, { "entropy": 0.5161062777042389, "epoch": 2.1044776119402986, "grad_norm": 0.19406329095363617, "learning_rate": 0.0002, "loss": 0.5161796808242798, "mean_token_accuracy": 0.7907394468784332, "num_tokens": 9216947.0, "step": 564 }, { "entropy": 0.5179730951786041, "epoch": 2.1082089552238807, "grad_norm": 0.1819450706243515, "learning_rate": 0.0002, "loss": 0.5243360996246338, "mean_token_accuracy": 0.7889621257781982, "num_tokens": 9233374.0, "step": 565 }, { "entropy": 0.5069013833999634, "epoch": 2.111940298507463, "grad_norm": 0.18256594240665436, "learning_rate": 0.0002, "loss": 0.5135838389396667, "mean_token_accuracy": 0.7917103320360184, "num_tokens": 9249879.0, "step": 566 }, { "entropy": 0.5135505869984627, "epoch": 2.1156716417910446, "grad_norm": 0.20573152601718903, "learning_rate": 0.0002, "loss": 0.5165933966636658, "mean_token_accuracy": 0.7909833937883377, "num_tokens": 9266246.0, "step": 567 }, { "entropy": 0.5395868420600891, "epoch": 2.1194029850746268, "grad_norm": 0.18927782773971558, "learning_rate": 0.0002, "loss": 0.5330281853675842, "mean_token_accuracy": 0.7855703681707382, "num_tokens": 9282481.0, "step": 568 }, { "entropy": 0.4938410297036171, "epoch": 2.123134328358209, "grad_norm": 0.19526073336601257, "learning_rate": 0.0002, "loss": 0.49382245540618896, "mean_token_accuracy": 0.7996838092803955, "num_tokens": 9298815.0, "step": 569 }, { "entropy": 0.5009667873382568, "epoch": 2.126865671641791, "grad_norm": 0.16595199704170227, "learning_rate": 0.0002, "loss": 0.5045086741447449, "mean_token_accuracy": 0.7978608906269073, "num_tokens": 9315340.0, "step": 570 }, { "entropy": 0.5141628980636597, "epoch": 2.1305970149253732, "grad_norm": 0.21891801059246063, "learning_rate": 0.0002, "loss": 0.5266185998916626, "mean_token_accuracy": 0.787352979183197, "num_tokens": 9331498.0, "step": 571 }, { "entropy": 0.5307284891605377, "epoch": 2.1343283582089554, "grad_norm": 0.1866699457168579, "learning_rate": 0.0002, "loss": 0.5273443460464478, "mean_token_accuracy": 0.7860653698444366, "num_tokens": 9347831.0, "step": 572 }, { "entropy": 0.5239406228065491, "epoch": 2.138059701492537, "grad_norm": 0.16141167283058167, "learning_rate": 0.0002, "loss": 0.5189298391342163, "mean_token_accuracy": 0.7913686484098434, "num_tokens": 9364053.0, "step": 573 }, { "entropy": 0.5423860549926758, "epoch": 2.1417910447761193, "grad_norm": 0.21419642865657806, "learning_rate": 0.0002, "loss": 0.5438653826713562, "mean_token_accuracy": 0.7800484448671341, "num_tokens": 9380482.0, "step": 574 }, { "entropy": 0.5319498926401138, "epoch": 2.1455223880597014, "grad_norm": 0.15394842624664307, "learning_rate": 0.0002, "loss": 0.5297288298606873, "mean_token_accuracy": 0.7861971110105515, "num_tokens": 9396762.0, "step": 575 }, { "entropy": 0.5272255092859268, "epoch": 2.1492537313432836, "grad_norm": 0.17917747795581818, "learning_rate": 0.0002, "loss": 0.5221657156944275, "mean_token_accuracy": 0.78948013484478, "num_tokens": 9412981.0, "step": 576 }, { "entropy": 0.5195171386003494, "epoch": 2.1529850746268657, "grad_norm": 0.16095657646656036, "learning_rate": 0.0002, "loss": 0.5160609483718872, "mean_token_accuracy": 0.7911281585693359, "num_tokens": 9429393.0, "step": 577 }, { "entropy": 0.5020652115345001, "epoch": 2.156716417910448, "grad_norm": 0.1592203974723816, "learning_rate": 0.0002, "loss": 0.5017430782318115, "mean_token_accuracy": 0.7959037572145462, "num_tokens": 9445763.0, "step": 578 }, { "entropy": 0.5353998094797134, "epoch": 2.16044776119403, "grad_norm": 0.18405838310718536, "learning_rate": 0.0002, "loss": 0.5360097885131836, "mean_token_accuracy": 0.7805107831954956, "num_tokens": 9462245.0, "step": 579 }, { "entropy": 0.5231145322322845, "epoch": 2.1641791044776117, "grad_norm": 0.16262777149677277, "learning_rate": 0.0002, "loss": 0.5238299369812012, "mean_token_accuracy": 0.7883976399898529, "num_tokens": 9478792.0, "step": 580 }, { "entropy": 0.5025703385472298, "epoch": 2.167910447761194, "grad_norm": 0.16886277496814728, "learning_rate": 0.0002, "loss": 0.5095133185386658, "mean_token_accuracy": 0.7930570840835571, "num_tokens": 9495042.0, "step": 581 }, { "entropy": 0.5041064321994781, "epoch": 2.171641791044776, "grad_norm": 0.1545090675354004, "learning_rate": 0.0002, "loss": 0.5001657605171204, "mean_token_accuracy": 0.7950020581483841, "num_tokens": 9511399.0, "step": 582 }, { "entropy": 0.533274233341217, "epoch": 2.175373134328358, "grad_norm": 0.15395475924015045, "learning_rate": 0.0002, "loss": 0.5321199893951416, "mean_token_accuracy": 0.7817400395870209, "num_tokens": 9527796.0, "step": 583 }, { "entropy": 0.5225674957036972, "epoch": 2.1791044776119404, "grad_norm": 0.1874343305826187, "learning_rate": 0.0002, "loss": 0.5301029682159424, "mean_token_accuracy": 0.7839690893888474, "num_tokens": 9544098.0, "step": 584 }, { "entropy": 0.5206504017114639, "epoch": 2.1828358208955225, "grad_norm": 0.18132635951042175, "learning_rate": 0.0002, "loss": 0.5191587209701538, "mean_token_accuracy": 0.7905547767877579, "num_tokens": 9560486.0, "step": 585 }, { "entropy": 0.5231298729777336, "epoch": 2.1865671641791047, "grad_norm": 0.19394823908805847, "learning_rate": 0.0002, "loss": 0.5234656929969788, "mean_token_accuracy": 0.7889635264873505, "num_tokens": 9576893.0, "step": 586 }, { "entropy": 0.4975113570690155, "epoch": 2.1902985074626864, "grad_norm": 0.1897096484899521, "learning_rate": 0.0002, "loss": 0.5067098736763, "mean_token_accuracy": 0.7950832843780518, "num_tokens": 9593176.0, "step": 587 }, { "entropy": 0.5182362198829651, "epoch": 2.1940298507462686, "grad_norm": 0.21101859211921692, "learning_rate": 0.0002, "loss": 0.5240258574485779, "mean_token_accuracy": 0.7852578610181808, "num_tokens": 9609529.0, "step": 588 }, { "entropy": 0.5308810174465179, "epoch": 2.1977611940298507, "grad_norm": 0.15612205862998962, "learning_rate": 0.0002, "loss": 0.5230595469474792, "mean_token_accuracy": 0.7886761873960495, "num_tokens": 9626018.0, "step": 589 }, { "entropy": 0.5405040681362152, "epoch": 2.201492537313433, "grad_norm": 0.16354262828826904, "learning_rate": 0.0002, "loss": 0.5339536666870117, "mean_token_accuracy": 0.7827159017324448, "num_tokens": 9642340.0, "step": 590 }, { "entropy": 0.5320803225040436, "epoch": 2.205223880597015, "grad_norm": 0.1848597228527069, "learning_rate": 0.0002, "loss": 0.5349913835525513, "mean_token_accuracy": 0.7858193665742874, "num_tokens": 9658780.0, "step": 591 }, { "entropy": 0.5458312928676605, "epoch": 2.208955223880597, "grad_norm": 0.16995884478092194, "learning_rate": 0.0002, "loss": 0.5466773509979248, "mean_token_accuracy": 0.7766650468111038, "num_tokens": 9675184.0, "step": 592 }, { "entropy": 0.520288422703743, "epoch": 2.2126865671641793, "grad_norm": 0.17533989250659943, "learning_rate": 0.0002, "loss": 0.5276610851287842, "mean_token_accuracy": 0.7833162993192673, "num_tokens": 9691587.0, "step": 593 }, { "entropy": 0.5230257883667946, "epoch": 2.216417910447761, "grad_norm": 0.1576543152332306, "learning_rate": 0.0002, "loss": 0.5214830040931702, "mean_token_accuracy": 0.7887468189001083, "num_tokens": 9707639.0, "step": 594 }, { "entropy": 0.5276977717876434, "epoch": 2.220149253731343, "grad_norm": 0.16972552239894867, "learning_rate": 0.0002, "loss": 0.5270232558250427, "mean_token_accuracy": 0.7899148017168045, "num_tokens": 9723826.0, "step": 595 }, { "entropy": 0.5177433490753174, "epoch": 2.2238805970149254, "grad_norm": 0.17887970805168152, "learning_rate": 0.0002, "loss": 0.5160896182060242, "mean_token_accuracy": 0.7925579845905304, "num_tokens": 9740088.0, "step": 596 }, { "entropy": 0.525688573718071, "epoch": 2.2276119402985075, "grad_norm": 0.1659506857395172, "learning_rate": 0.0002, "loss": 0.5277712345123291, "mean_token_accuracy": 0.7854456752538681, "num_tokens": 9756214.0, "step": 597 }, { "entropy": 0.5137215405702591, "epoch": 2.2313432835820897, "grad_norm": 0.18150706589221954, "learning_rate": 0.0002, "loss": 0.5194687247276306, "mean_token_accuracy": 0.7904618233442307, "num_tokens": 9772511.0, "step": 598 }, { "entropy": 0.529701828956604, "epoch": 2.235074626865672, "grad_norm": 0.17603962123394012, "learning_rate": 0.0002, "loss": 0.5309550166130066, "mean_token_accuracy": 0.7836979478597641, "num_tokens": 9788956.0, "step": 599 }, { "entropy": 0.5346364378929138, "epoch": 2.2388059701492535, "grad_norm": 0.17556419968605042, "learning_rate": 0.0002, "loss": 0.5340572595596313, "mean_token_accuracy": 0.7827766090631485, "num_tokens": 9805350.0, "step": 600 }, { "entropy": 0.5358438938856125, "epoch": 2.2425373134328357, "grad_norm": 0.19660161435604095, "learning_rate": 0.0002, "loss": 0.5320678353309631, "mean_token_accuracy": 0.7855796813964844, "num_tokens": 9821744.0, "step": 601 }, { "entropy": 0.5096235424280167, "epoch": 2.246268656716418, "grad_norm": 0.15900631248950958, "learning_rate": 0.0002, "loss": 0.5056334137916565, "mean_token_accuracy": 0.7966822683811188, "num_tokens": 9837824.0, "step": 602 }, { "entropy": 0.5357042700052261, "epoch": 2.25, "grad_norm": 0.1657211184501648, "learning_rate": 0.0002, "loss": 0.5354617238044739, "mean_token_accuracy": 0.7830197513103485, "num_tokens": 9854305.0, "step": 603 }, { "entropy": 0.5109390839934349, "epoch": 2.253731343283582, "grad_norm": 0.1763714998960495, "learning_rate": 0.0002, "loss": 0.5157687664031982, "mean_token_accuracy": 0.7923711538314819, "num_tokens": 9870793.0, "step": 604 }, { "entropy": 0.5191235095262527, "epoch": 2.2574626865671643, "grad_norm": 0.20325957238674164, "learning_rate": 0.0002, "loss": 0.5273858308792114, "mean_token_accuracy": 0.7857847660779953, "num_tokens": 9887144.0, "step": 605 }, { "entropy": 0.5128894448280334, "epoch": 2.2611940298507465, "grad_norm": 0.18303951621055603, "learning_rate": 0.0002, "loss": 0.5150971412658691, "mean_token_accuracy": 0.7911935448646545, "num_tokens": 9903362.0, "step": 606 }, { "entropy": 0.518405131995678, "epoch": 2.264925373134328, "grad_norm": 0.16138286888599396, "learning_rate": 0.0002, "loss": 0.5196152925491333, "mean_token_accuracy": 0.7916755676269531, "num_tokens": 9919665.0, "step": 607 }, { "entropy": 0.5238161385059357, "epoch": 2.2686567164179103, "grad_norm": 0.15336841344833374, "learning_rate": 0.0002, "loss": 0.5234584808349609, "mean_token_accuracy": 0.7885531485080719, "num_tokens": 9936204.0, "step": 608 }, { "entropy": 0.5139288082718849, "epoch": 2.2723880597014925, "grad_norm": 0.15460564196109772, "learning_rate": 0.0002, "loss": 0.516942024230957, "mean_token_accuracy": 0.7878196388483047, "num_tokens": 9952444.0, "step": 609 }, { "entropy": 0.5144378393888474, "epoch": 2.2761194029850746, "grad_norm": 0.16456560790538788, "learning_rate": 0.0002, "loss": 0.5143165588378906, "mean_token_accuracy": 0.7900296002626419, "num_tokens": 9968772.0, "step": 610 }, { "entropy": 0.5115328878164291, "epoch": 2.279850746268657, "grad_norm": 0.17883925139904022, "learning_rate": 0.0002, "loss": 0.5190625190734863, "mean_token_accuracy": 0.7872501909732819, "num_tokens": 9985174.0, "step": 611 }, { "entropy": 0.535979226231575, "epoch": 2.283582089552239, "grad_norm": 0.1744793951511383, "learning_rate": 0.0002, "loss": 0.5318659543991089, "mean_token_accuracy": 0.7878114283084869, "num_tokens": 10001610.0, "step": 612 }, { "entropy": 0.5348420441150665, "epoch": 2.2873134328358207, "grad_norm": 0.17023774981498718, "learning_rate": 0.0002, "loss": 0.5370223522186279, "mean_token_accuracy": 0.783968135714531, "num_tokens": 10017829.0, "step": 613 }, { "entropy": 0.5138903260231018, "epoch": 2.291044776119403, "grad_norm": 0.17115749418735504, "learning_rate": 0.0002, "loss": 0.5157005190849304, "mean_token_accuracy": 0.7915801256895065, "num_tokens": 10034135.0, "step": 614 }, { "entropy": 0.514953039586544, "epoch": 2.294776119402985, "grad_norm": 0.1999882459640503, "learning_rate": 0.0002, "loss": 0.5170516967773438, "mean_token_accuracy": 0.7916076630353928, "num_tokens": 10050500.0, "step": 615 }, { "entropy": 0.5247506201267242, "epoch": 2.298507462686567, "grad_norm": 0.16434574127197266, "learning_rate": 0.0002, "loss": 0.5179375410079956, "mean_token_accuracy": 0.7906480133533478, "num_tokens": 10066822.0, "step": 616 }, { "entropy": 0.5195427983999252, "epoch": 2.3022388059701493, "grad_norm": 0.16079425811767578, "learning_rate": 0.0002, "loss": 0.5192772746086121, "mean_token_accuracy": 0.788419172167778, "num_tokens": 10083211.0, "step": 617 }, { "entropy": 0.5161983221769333, "epoch": 2.3059701492537314, "grad_norm": 0.15893937647342682, "learning_rate": 0.0002, "loss": 0.5151652097702026, "mean_token_accuracy": 0.7913366705179214, "num_tokens": 10099502.0, "step": 618 }, { "entropy": 0.5129862576723099, "epoch": 2.3097014925373136, "grad_norm": 0.1990455985069275, "learning_rate": 0.0002, "loss": 0.5226958394050598, "mean_token_accuracy": 0.7890161275863647, "num_tokens": 10115875.0, "step": 619 }, { "entropy": 0.5259782820940018, "epoch": 2.3134328358208958, "grad_norm": 0.17600762844085693, "learning_rate": 0.0002, "loss": 0.5303045511245728, "mean_token_accuracy": 0.784588485956192, "num_tokens": 10132329.0, "step": 620 }, { "entropy": 0.5374605804681778, "epoch": 2.3171641791044775, "grad_norm": 0.15160205960273743, "learning_rate": 0.0002, "loss": 0.5319960117340088, "mean_token_accuracy": 0.7856357097625732, "num_tokens": 10148660.0, "step": 621 }, { "entropy": 0.5202681869268417, "epoch": 2.3208955223880596, "grad_norm": 0.17217791080474854, "learning_rate": 0.0002, "loss": 0.513685405254364, "mean_token_accuracy": 0.7912963330745697, "num_tokens": 10164847.0, "step": 622 }, { "entropy": 0.5351561158895493, "epoch": 2.324626865671642, "grad_norm": 0.16189849376678467, "learning_rate": 0.0002, "loss": 0.5341706275939941, "mean_token_accuracy": 0.7827345281839371, "num_tokens": 10181330.0, "step": 623 }, { "entropy": 0.5096163898706436, "epoch": 2.328358208955224, "grad_norm": 0.17251546680927277, "learning_rate": 0.0002, "loss": 0.5183389186859131, "mean_token_accuracy": 0.7891778647899628, "num_tokens": 10197593.0, "step": 624 }, { "entropy": 0.5043528005480766, "epoch": 2.332089552238806, "grad_norm": 0.19364336133003235, "learning_rate": 0.0002, "loss": 0.5169776082038879, "mean_token_accuracy": 0.792061522603035, "num_tokens": 10213821.0, "step": 625 }, { "entropy": 0.5118814930319786, "epoch": 2.3358208955223883, "grad_norm": 0.21755088865756989, "learning_rate": 0.0002, "loss": 0.5260127782821655, "mean_token_accuracy": 0.7870439440011978, "num_tokens": 10229959.0, "step": 626 }, { "entropy": 0.5387731194496155, "epoch": 2.33955223880597, "grad_norm": 0.15599676966667175, "learning_rate": 0.0002, "loss": 0.5359347462654114, "mean_token_accuracy": 0.7821696400642395, "num_tokens": 10246325.0, "step": 627 }, { "entropy": 0.5259936600923538, "epoch": 2.343283582089552, "grad_norm": 0.17784081399440765, "learning_rate": 0.0002, "loss": 0.5117411613464355, "mean_token_accuracy": 0.7913538813591003, "num_tokens": 10262854.0, "step": 628 }, { "entropy": 0.5261276811361313, "epoch": 2.3470149253731343, "grad_norm": 0.15290921926498413, "learning_rate": 0.0002, "loss": 0.5141685009002686, "mean_token_accuracy": 0.7897167503833771, "num_tokens": 10279167.0, "step": 629 }, { "entropy": 0.516872301697731, "epoch": 2.3507462686567164, "grad_norm": 0.16548150777816772, "learning_rate": 0.0002, "loss": 0.518975555896759, "mean_token_accuracy": 0.7876042425632477, "num_tokens": 10295367.0, "step": 630 }, { "entropy": 0.5166520774364471, "epoch": 2.3544776119402986, "grad_norm": 0.2100355476140976, "learning_rate": 0.0002, "loss": 0.5216490030288696, "mean_token_accuracy": 0.7918855249881744, "num_tokens": 10311818.0, "step": 631 }, { "entropy": 0.5158288031816483, "epoch": 2.3582089552238807, "grad_norm": 0.19722220301628113, "learning_rate": 0.0002, "loss": 0.5301001667976379, "mean_token_accuracy": 0.785649761557579, "num_tokens": 10328226.0, "step": 632 }, { "entropy": 0.5121333077549934, "epoch": 2.361940298507463, "grad_norm": 0.18101061880588531, "learning_rate": 0.0002, "loss": 0.514575719833374, "mean_token_accuracy": 0.7912623584270477, "num_tokens": 10344492.0, "step": 633 }, { "entropy": 0.5286690294742584, "epoch": 2.3656716417910446, "grad_norm": 0.18992973864078522, "learning_rate": 0.0002, "loss": 0.5238395929336548, "mean_token_accuracy": 0.7872939556837082, "num_tokens": 10360763.0, "step": 634 }, { "entropy": 0.504866473376751, "epoch": 2.3694029850746268, "grad_norm": 0.17053747177124023, "learning_rate": 0.0002, "loss": 0.5018288493156433, "mean_token_accuracy": 0.7963565587997437, "num_tokens": 10376794.0, "step": 635 }, { "entropy": 0.5348407328128815, "epoch": 2.373134328358209, "grad_norm": 0.1969325840473175, "learning_rate": 0.0002, "loss": 0.5392089486122131, "mean_token_accuracy": 0.781823992729187, "num_tokens": 10393125.0, "step": 636 }, { "entropy": 0.5291974544525146, "epoch": 2.376865671641791, "grad_norm": 0.19346994161605835, "learning_rate": 0.0002, "loss": 0.5330736637115479, "mean_token_accuracy": 0.781773254275322, "num_tokens": 10409537.0, "step": 637 }, { "entropy": 0.5348323583602905, "epoch": 2.3805970149253732, "grad_norm": 0.18969298899173737, "learning_rate": 0.0002, "loss": 0.5274794101715088, "mean_token_accuracy": 0.787670373916626, "num_tokens": 10425973.0, "step": 638 }, { "entropy": 0.5205499678850174, "epoch": 2.3843283582089554, "grad_norm": 0.17864486575126648, "learning_rate": 0.0002, "loss": 0.5213812589645386, "mean_token_accuracy": 0.7890082150697708, "num_tokens": 10442180.0, "step": 639 }, { "entropy": 0.528412714600563, "epoch": 2.388059701492537, "grad_norm": 0.1959443986415863, "learning_rate": 0.0002, "loss": 0.534969687461853, "mean_token_accuracy": 0.7831798046827316, "num_tokens": 10458477.0, "step": 640 }, { "entropy": 0.5136244520545006, "epoch": 2.3917910447761193, "grad_norm": 0.20498400926589966, "learning_rate": 0.0002, "loss": 0.511573314666748, "mean_token_accuracy": 0.7939646393060684, "num_tokens": 10475023.0, "step": 641 }, { "entropy": 0.5202098488807678, "epoch": 2.3955223880597014, "grad_norm": 0.20506030321121216, "learning_rate": 0.0002, "loss": 0.5162352919578552, "mean_token_accuracy": 0.7906180173158646, "num_tokens": 10491313.0, "step": 642 }, { "entropy": 0.5307043790817261, "epoch": 2.3992537313432836, "grad_norm": 0.17971979081630707, "learning_rate": 0.0002, "loss": 0.5288392305374146, "mean_token_accuracy": 0.7879067957401276, "num_tokens": 10507682.0, "step": 643 }, { "entropy": 0.5393616259098053, "epoch": 2.4029850746268657, "grad_norm": 0.23341259360313416, "learning_rate": 0.0002, "loss": 0.5383281707763672, "mean_token_accuracy": 0.781504288315773, "num_tokens": 10524138.0, "step": 644 }, { "entropy": 0.5379284471273422, "epoch": 2.406716417910448, "grad_norm": 0.16890797019004822, "learning_rate": 0.0002, "loss": 0.5414294004440308, "mean_token_accuracy": 0.7795721143484116, "num_tokens": 10540308.0, "step": 645 }, { "entropy": 0.5295774638652802, "epoch": 2.41044776119403, "grad_norm": 0.2540934085845947, "learning_rate": 0.0002, "loss": 0.5318943858146667, "mean_token_accuracy": 0.7859358042478561, "num_tokens": 10556760.0, "step": 646 }, { "entropy": 0.5170229598879814, "epoch": 2.4141791044776117, "grad_norm": 0.16737528145313263, "learning_rate": 0.0002, "loss": 0.517413318157196, "mean_token_accuracy": 0.7901816219091415, "num_tokens": 10573293.0, "step": 647 }, { "entropy": 0.526155412197113, "epoch": 2.417910447761194, "grad_norm": 0.2225574254989624, "learning_rate": 0.0002, "loss": 0.529864490032196, "mean_token_accuracy": 0.7856150567531586, "num_tokens": 10589674.0, "step": 648 }, { "entropy": 0.5266731381416321, "epoch": 2.421641791044776, "grad_norm": 0.16272951662540436, "learning_rate": 0.0002, "loss": 0.5234624743461609, "mean_token_accuracy": 0.7885357886552811, "num_tokens": 10606101.0, "step": 649 }, { "entropy": 0.5251661986112595, "epoch": 2.425373134328358, "grad_norm": 0.17834821343421936, "learning_rate": 0.0002, "loss": 0.5261815190315247, "mean_token_accuracy": 0.7859483957290649, "num_tokens": 10622240.0, "step": 650 }, { "entropy": 0.5259936600923538, "epoch": 2.4291044776119404, "grad_norm": 0.16211281716823578, "learning_rate": 0.0002, "loss": 0.5267058610916138, "mean_token_accuracy": 0.7840430587530136, "num_tokens": 10638728.0, "step": 651 }, { "entropy": 0.5017556846141815, "epoch": 2.4328358208955225, "grad_norm": 0.3111971616744995, "learning_rate": 0.0002, "loss": 0.5085122585296631, "mean_token_accuracy": 0.7949473708868027, "num_tokens": 10654954.0, "step": 652 }, { "entropy": 0.53680419921875, "epoch": 2.4365671641791042, "grad_norm": 0.17920435965061188, "learning_rate": 0.0002, "loss": 0.5438150763511658, "mean_token_accuracy": 0.7806514501571655, "num_tokens": 10671142.0, "step": 653 }, { "entropy": 0.5328411310911179, "epoch": 2.4402985074626864, "grad_norm": 0.36842888593673706, "learning_rate": 0.0002, "loss": 0.5365176200866699, "mean_token_accuracy": 0.7864848077297211, "num_tokens": 10687527.0, "step": 654 }, { "entropy": 0.5214048027992249, "epoch": 2.4440298507462686, "grad_norm": 0.15488730370998383, "learning_rate": 0.0002, "loss": 0.5212221145629883, "mean_token_accuracy": 0.7904541194438934, "num_tokens": 10703637.0, "step": 655 }, { "entropy": 0.5198699831962585, "epoch": 2.4477611940298507, "grad_norm": 0.17918945848941803, "learning_rate": 0.0002, "loss": 0.5142287015914917, "mean_token_accuracy": 0.7930866479873657, "num_tokens": 10719755.0, "step": 656 }, { "entropy": 0.5371468216180801, "epoch": 2.451492537313433, "grad_norm": 0.17966963350772858, "learning_rate": 0.0002, "loss": 0.5387783050537109, "mean_token_accuracy": 0.7836030423641205, "num_tokens": 10736159.0, "step": 657 }, { "entropy": 0.523772120475769, "epoch": 2.455223880597015, "grad_norm": 0.17708872258663177, "learning_rate": 0.0002, "loss": 0.5304325819015503, "mean_token_accuracy": 0.7857228368520737, "num_tokens": 10752300.0, "step": 658 }, { "entropy": 0.5180701240897179, "epoch": 2.458955223880597, "grad_norm": 0.18428592383861542, "learning_rate": 0.0002, "loss": 0.5193667411804199, "mean_token_accuracy": 0.7911625355482101, "num_tokens": 10768483.0, "step": 659 }, { "entropy": 0.528245247900486, "epoch": 2.4626865671641793, "grad_norm": 0.1747596561908722, "learning_rate": 0.0002, "loss": 0.5231127142906189, "mean_token_accuracy": 0.7906267046928406, "num_tokens": 10784872.0, "step": 660 }, { "entropy": 0.5145193934440613, "epoch": 2.466417910447761, "grad_norm": 0.16311223804950714, "learning_rate": 0.0002, "loss": 0.5083698630332947, "mean_token_accuracy": 0.7954908460378647, "num_tokens": 10801264.0, "step": 661 }, { "entropy": 0.5249892026185989, "epoch": 2.470149253731343, "grad_norm": 0.15471886098384857, "learning_rate": 0.0002, "loss": 0.5246090292930603, "mean_token_accuracy": 0.7875058203935623, "num_tokens": 10817509.0, "step": 662 }, { "entropy": 0.5209084749221802, "epoch": 2.4738805970149254, "grad_norm": 0.17972545325756073, "learning_rate": 0.0002, "loss": 0.5200228095054626, "mean_token_accuracy": 0.7910773009061813, "num_tokens": 10833875.0, "step": 663 }, { "entropy": 0.5148312151432037, "epoch": 2.4776119402985075, "grad_norm": 0.20573753118515015, "learning_rate": 0.0002, "loss": 0.5257189273834229, "mean_token_accuracy": 0.7857212275266647, "num_tokens": 10849915.0, "step": 664 }, { "entropy": 0.5218161419034004, "epoch": 2.4813432835820897, "grad_norm": 0.18017825484275818, "learning_rate": 0.0002, "loss": 0.5281471014022827, "mean_token_accuracy": 0.7845035791397095, "num_tokens": 10866228.0, "step": 665 }, { "entropy": 0.5220426917076111, "epoch": 2.485074626865672, "grad_norm": 0.16190138459205627, "learning_rate": 0.0002, "loss": 0.521308183670044, "mean_token_accuracy": 0.7905032187700272, "num_tokens": 10882941.0, "step": 666 }, { "entropy": 0.5130190551280975, "epoch": 2.4888059701492535, "grad_norm": 0.17984949052333832, "learning_rate": 0.0002, "loss": 0.5067973732948303, "mean_token_accuracy": 0.7954512685537338, "num_tokens": 10899165.0, "step": 667 }, { "entropy": 0.5297238677740097, "epoch": 2.4925373134328357, "grad_norm": 0.15996725857257843, "learning_rate": 0.0002, "loss": 0.5296366810798645, "mean_token_accuracy": 0.785218670964241, "num_tokens": 10915443.0, "step": 668 }, { "entropy": 0.4974808022379875, "epoch": 2.496268656716418, "grad_norm": 0.1793019324541092, "learning_rate": 0.0002, "loss": 0.4990445077419281, "mean_token_accuracy": 0.7966191321611404, "num_tokens": 10931711.0, "step": 669 }, { "entropy": 0.5239012390375137, "epoch": 2.5, "grad_norm": 0.19087010622024536, "learning_rate": 0.0002, "loss": 0.5348339676856995, "mean_token_accuracy": 0.7859302014112473, "num_tokens": 10948023.0, "step": 670 }, { "entropy": 0.502729706466198, "epoch": 2.503731343283582, "grad_norm": 0.17360597848892212, "learning_rate": 0.0002, "loss": 0.5077179074287415, "mean_token_accuracy": 0.7953527718782425, "num_tokens": 10964233.0, "step": 671 }, { "entropy": 0.5206915363669395, "epoch": 2.5074626865671643, "grad_norm": 0.19746483862400055, "learning_rate": 0.0002, "loss": 0.5238724946975708, "mean_token_accuracy": 0.7870853841304779, "num_tokens": 10980379.0, "step": 672 }, { "entropy": 0.5450692474842072, "epoch": 2.5111940298507465, "grad_norm": 0.20202518999576569, "learning_rate": 0.0002, "loss": 0.5349087119102478, "mean_token_accuracy": 0.7814089059829712, "num_tokens": 10996761.0, "step": 673 }, { "entropy": 0.5313533395528793, "epoch": 2.5149253731343286, "grad_norm": 0.16622328758239746, "learning_rate": 0.0002, "loss": 0.5273463726043701, "mean_token_accuracy": 0.7876841723918915, "num_tokens": 11013002.0, "step": 674 }, { "entropy": 0.5233149528503418, "epoch": 2.5186567164179103, "grad_norm": 0.1762213557958603, "learning_rate": 0.0002, "loss": 0.5284275412559509, "mean_token_accuracy": 0.7885796874761581, "num_tokens": 11029461.0, "step": 675 }, { "entropy": 0.5161427110433578, "epoch": 2.5223880597014925, "grad_norm": 0.1734134405851364, "learning_rate": 0.0002, "loss": 0.5218281149864197, "mean_token_accuracy": 0.7900317013263702, "num_tokens": 11045513.0, "step": 676 }, { "entropy": 0.527386263012886, "epoch": 2.5261194029850746, "grad_norm": 0.18649046123027802, "learning_rate": 0.0002, "loss": 0.5264036655426025, "mean_token_accuracy": 0.7881919145584106, "num_tokens": 11061764.0, "step": 677 }, { "entropy": 0.5335260331630707, "epoch": 2.529850746268657, "grad_norm": 0.16608470678329468, "learning_rate": 0.0002, "loss": 0.5327720046043396, "mean_token_accuracy": 0.7845087051391602, "num_tokens": 11077973.0, "step": 678 }, { "entropy": 0.5215242803096771, "epoch": 2.533582089552239, "grad_norm": 0.16991843283176422, "learning_rate": 0.0002, "loss": 0.5201636552810669, "mean_token_accuracy": 0.7907481640577316, "num_tokens": 11094025.0, "step": 679 }, { "entropy": 0.5226395204663277, "epoch": 2.5373134328358207, "grad_norm": 0.16204343736171722, "learning_rate": 0.0002, "loss": 0.5192615389823914, "mean_token_accuracy": 0.7913714349269867, "num_tokens": 11110340.0, "step": 680 }, { "entropy": 0.5280646532773972, "epoch": 2.541044776119403, "grad_norm": 0.17025548219680786, "learning_rate": 0.0002, "loss": 0.5243014097213745, "mean_token_accuracy": 0.7887150794267654, "num_tokens": 11126766.0, "step": 681 }, { "entropy": 0.5295235440135002, "epoch": 2.544776119402985, "grad_norm": 0.17332811653614044, "learning_rate": 0.0002, "loss": 0.5264289975166321, "mean_token_accuracy": 0.7893000990152359, "num_tokens": 11143383.0, "step": 682 }, { "entropy": 0.5350908041000366, "epoch": 2.548507462686567, "grad_norm": 0.16494929790496826, "learning_rate": 0.0002, "loss": 0.5385511517524719, "mean_token_accuracy": 0.7832952737808228, "num_tokens": 11159798.0, "step": 683 }, { "entropy": 0.5189319550991058, "epoch": 2.5522388059701493, "grad_norm": 0.1749635636806488, "learning_rate": 0.0002, "loss": 0.5244334936141968, "mean_token_accuracy": 0.7889615148305893, "num_tokens": 11176116.0, "step": 684 }, { "entropy": 0.5297338515520096, "epoch": 2.5559701492537314, "grad_norm": 0.16473545134067535, "learning_rate": 0.0002, "loss": 0.5357664227485657, "mean_token_accuracy": 0.7839798331260681, "num_tokens": 11192242.0, "step": 685 }, { "entropy": 0.5161855816841125, "epoch": 2.5597014925373136, "grad_norm": 0.19246211647987366, "learning_rate": 0.0002, "loss": 0.5211361050605774, "mean_token_accuracy": 0.790752574801445, "num_tokens": 11208617.0, "step": 686 }, { "entropy": 0.539324015378952, "epoch": 2.5634328358208958, "grad_norm": 0.16890385746955872, "learning_rate": 0.0002, "loss": 0.5382983684539795, "mean_token_accuracy": 0.7826134711503983, "num_tokens": 11225201.0, "step": 687 }, { "entropy": 0.5158891677856445, "epoch": 2.5671641791044775, "grad_norm": 0.16682742536067963, "learning_rate": 0.0002, "loss": 0.5142616629600525, "mean_token_accuracy": 0.7918410003185272, "num_tokens": 11241695.0, "step": 688 }, { "entropy": 0.5267701372504234, "epoch": 2.5708955223880596, "grad_norm": 0.1687549650669098, "learning_rate": 0.0002, "loss": 0.5238382816314697, "mean_token_accuracy": 0.7866890728473663, "num_tokens": 11258089.0, "step": 689 }, { "entropy": 0.5255937725305557, "epoch": 2.574626865671642, "grad_norm": 0.1738496869802475, "learning_rate": 0.0002, "loss": 0.5248072147369385, "mean_token_accuracy": 0.7852340638637543, "num_tokens": 11274450.0, "step": 690 }, { "entropy": 0.5198262184858322, "epoch": 2.578358208955224, "grad_norm": 0.1690807044506073, "learning_rate": 0.0002, "loss": 0.5270042419433594, "mean_token_accuracy": 0.7855731099843979, "num_tokens": 11290865.0, "step": 691 }, { "entropy": 0.5405410379171371, "epoch": 2.582089552238806, "grad_norm": 0.18134285509586334, "learning_rate": 0.0002, "loss": 0.5444961786270142, "mean_token_accuracy": 0.7780175656080246, "num_tokens": 11307409.0, "step": 692 }, { "entropy": 0.5347141325473785, "epoch": 2.585820895522388, "grad_norm": 0.1676827371120453, "learning_rate": 0.0002, "loss": 0.5311787724494934, "mean_token_accuracy": 0.784485325217247, "num_tokens": 11323946.0, "step": 693 }, { "entropy": 0.503664955496788, "epoch": 2.58955223880597, "grad_norm": 0.17767618596553802, "learning_rate": 0.0002, "loss": 0.5004390478134155, "mean_token_accuracy": 0.7965147197246552, "num_tokens": 11340062.0, "step": 694 }, { "entropy": 0.5400541573762894, "epoch": 2.593283582089552, "grad_norm": 0.17085346579551697, "learning_rate": 0.0002, "loss": 0.5395094156265259, "mean_token_accuracy": 0.781545028090477, "num_tokens": 11356660.0, "step": 695 }, { "entropy": 0.5177017226815224, "epoch": 2.5970149253731343, "grad_norm": 0.169759601354599, "learning_rate": 0.0002, "loss": 0.515388011932373, "mean_token_accuracy": 0.7907217293977737, "num_tokens": 11372963.0, "step": 696 }, { "entropy": 0.5352813154459, "epoch": 2.6007462686567164, "grad_norm": 0.17281876504421234, "learning_rate": 0.0002, "loss": 0.5351260900497437, "mean_token_accuracy": 0.7841326892375946, "num_tokens": 11389640.0, "step": 697 }, { "entropy": 0.5045363381505013, "epoch": 2.6044776119402986, "grad_norm": 0.18615856766700745, "learning_rate": 0.0002, "loss": 0.5119503736495972, "mean_token_accuracy": 0.7933619618415833, "num_tokens": 11405795.0, "step": 698 }, { "entropy": 0.521905705332756, "epoch": 2.6082089552238807, "grad_norm": 0.18743987381458282, "learning_rate": 0.0002, "loss": 0.5299134850502014, "mean_token_accuracy": 0.7850409299135208, "num_tokens": 11422045.0, "step": 699 }, { "entropy": 0.5174702405929565, "epoch": 2.611940298507463, "grad_norm": 0.17414018511772156, "learning_rate": 0.0002, "loss": 0.5177151560783386, "mean_token_accuracy": 0.7891951948404312, "num_tokens": 11438392.0, "step": 700 }, { "entropy": 0.5343185365200043, "epoch": 2.6156716417910446, "grad_norm": 0.17761462926864624, "learning_rate": 0.0002, "loss": 0.5284934043884277, "mean_token_accuracy": 0.7868274599313736, "num_tokens": 11455009.0, "step": 701 }, { "entropy": 0.53134885430336, "epoch": 2.6194029850746268, "grad_norm": 0.16672612726688385, "learning_rate": 0.0002, "loss": 0.5203122496604919, "mean_token_accuracy": 0.7913379818201065, "num_tokens": 11471341.0, "step": 702 }, { "entropy": 0.523793414235115, "epoch": 2.623134328358209, "grad_norm": 0.15720658004283905, "learning_rate": 0.0002, "loss": 0.5188941359519958, "mean_token_accuracy": 0.7898289412260056, "num_tokens": 11487565.0, "step": 703 }, { "entropy": 0.5335910320281982, "epoch": 2.626865671641791, "grad_norm": 0.18207021057605743, "learning_rate": 0.0002, "loss": 0.5383012294769287, "mean_token_accuracy": 0.7841922044754028, "num_tokens": 11503932.0, "step": 704 }, { "entropy": 0.5070014595985413, "epoch": 2.6305970149253732, "grad_norm": 0.18818838894367218, "learning_rate": 0.0002, "loss": 0.521304726600647, "mean_token_accuracy": 0.7882455736398697, "num_tokens": 11519876.0, "step": 705 }, { "entropy": 0.5179764032363892, "epoch": 2.6343283582089554, "grad_norm": 0.16391263902187347, "learning_rate": 0.0002, "loss": 0.5277372598648071, "mean_token_accuracy": 0.7888714224100113, "num_tokens": 11536317.0, "step": 706 }, { "entropy": 0.5383756011724472, "epoch": 2.638059701492537, "grad_norm": 0.20110981166362762, "learning_rate": 0.0002, "loss": 0.5405253171920776, "mean_token_accuracy": 0.7808063477277756, "num_tokens": 11552655.0, "step": 707 }, { "entropy": 0.5268357321619987, "epoch": 2.6417910447761193, "grad_norm": 0.17326846718788147, "learning_rate": 0.0002, "loss": 0.5239301919937134, "mean_token_accuracy": 0.7901074439287186, "num_tokens": 11568724.0, "step": 708 }, { "entropy": 0.5407274663448334, "epoch": 2.6455223880597014, "grad_norm": 0.16851350665092468, "learning_rate": 0.0002, "loss": 0.5350074172019958, "mean_token_accuracy": 0.7861216068267822, "num_tokens": 11585225.0, "step": 709 }, { "entropy": 0.5268073230981827, "epoch": 2.6492537313432836, "grad_norm": 0.19633817672729492, "learning_rate": 0.0002, "loss": 0.5214436054229736, "mean_token_accuracy": 0.7898468226194382, "num_tokens": 11601498.0, "step": 710 }, { "entropy": 0.535712480545044, "epoch": 2.6529850746268657, "grad_norm": 0.15659253299236298, "learning_rate": 0.0002, "loss": 0.5353400707244873, "mean_token_accuracy": 0.7835351228713989, "num_tokens": 11617811.0, "step": 711 }, { "entropy": 0.539536863565445, "epoch": 2.656716417910448, "grad_norm": 0.19012975692749023, "learning_rate": 0.0002, "loss": 0.5403158068656921, "mean_token_accuracy": 0.780579537153244, "num_tokens": 11634295.0, "step": 712 }, { "entropy": 0.5134764388203621, "epoch": 2.66044776119403, "grad_norm": 0.16630828380584717, "learning_rate": 0.0002, "loss": 0.5213350653648376, "mean_token_accuracy": 0.7890530824661255, "num_tokens": 11650834.0, "step": 713 }, { "entropy": 0.4917012006044388, "epoch": 2.664179104477612, "grad_norm": 0.1683693677186966, "learning_rate": 0.0002, "loss": 0.49927788972854614, "mean_token_accuracy": 0.797902062535286, "num_tokens": 11667060.0, "step": 714 }, { "entropy": 0.5247212499380112, "epoch": 2.667910447761194, "grad_norm": 0.17371122539043427, "learning_rate": 0.0002, "loss": 0.5344932079315186, "mean_token_accuracy": 0.783098891377449, "num_tokens": 11683574.0, "step": 715 }, { "entropy": 0.5191128477454185, "epoch": 2.671641791044776, "grad_norm": 0.16527095437049866, "learning_rate": 0.0002, "loss": 0.5183148384094238, "mean_token_accuracy": 0.790424644947052, "num_tokens": 11699720.0, "step": 716 }, { "entropy": 0.5185272991657257, "epoch": 2.675373134328358, "grad_norm": 0.16154323518276215, "learning_rate": 0.0002, "loss": 0.5092360973358154, "mean_token_accuracy": 0.7955475896596909, "num_tokens": 11716469.0, "step": 717 }, { "entropy": 0.5372938513755798, "epoch": 2.6791044776119404, "grad_norm": 0.15932703018188477, "learning_rate": 0.0002, "loss": 0.5302359461784363, "mean_token_accuracy": 0.786151722073555, "num_tokens": 11732748.0, "step": 718 }, { "entropy": 0.5596635788679123, "epoch": 2.6828358208955225, "grad_norm": 0.18202805519104004, "learning_rate": 0.0002, "loss": 0.5571697950363159, "mean_token_accuracy": 0.7754980325698853, "num_tokens": 11749150.0, "step": 719 }, { "entropy": 0.5210409909486771, "epoch": 2.6865671641791042, "grad_norm": 0.1875341236591339, "learning_rate": 0.0002, "loss": 0.5226970314979553, "mean_token_accuracy": 0.7895162850618362, "num_tokens": 11765442.0, "step": 720 }, { "entropy": 0.528057724237442, "epoch": 2.6902985074626864, "grad_norm": 0.16192083060741425, "learning_rate": 0.0002, "loss": 0.5281423330307007, "mean_token_accuracy": 0.788543164730072, "num_tokens": 11781875.0, "step": 721 }, { "entropy": 0.5093352198600769, "epoch": 2.6940298507462686, "grad_norm": 0.15824586153030396, "learning_rate": 0.0002, "loss": 0.5047670602798462, "mean_token_accuracy": 0.7923571020364761, "num_tokens": 11798168.0, "step": 722 }, { "entropy": 0.5319179147481918, "epoch": 2.6977611940298507, "grad_norm": 0.1545802354812622, "learning_rate": 0.0002, "loss": 0.5334397554397583, "mean_token_accuracy": 0.7845843136310577, "num_tokens": 11814632.0, "step": 723 }, { "entropy": 0.5133816972374916, "epoch": 2.701492537313433, "grad_norm": 0.16241911053657532, "learning_rate": 0.0002, "loss": 0.51878821849823, "mean_token_accuracy": 0.7933190315961838, "num_tokens": 11831088.0, "step": 724 }, { "entropy": 0.5164139419794083, "epoch": 2.705223880597015, "grad_norm": 0.14982916414737701, "learning_rate": 0.0002, "loss": 0.5140745639801025, "mean_token_accuracy": 0.7934172451496124, "num_tokens": 11847470.0, "step": 725 }, { "entropy": 0.521071195602417, "epoch": 2.708955223880597, "grad_norm": 0.17015258967876434, "learning_rate": 0.0002, "loss": 0.5232289433479309, "mean_token_accuracy": 0.7887244522571564, "num_tokens": 11863757.0, "step": 726 }, { "entropy": 0.5184628516435623, "epoch": 2.7126865671641793, "grad_norm": 0.1840510219335556, "learning_rate": 0.0002, "loss": 0.5194827318191528, "mean_token_accuracy": 0.7879429012537003, "num_tokens": 11880261.0, "step": 727 }, { "entropy": 0.5139294788241386, "epoch": 2.716417910447761, "grad_norm": 0.19588088989257812, "learning_rate": 0.0002, "loss": 0.5200832486152649, "mean_token_accuracy": 0.7899386137723923, "num_tokens": 11896585.0, "step": 728 }, { "entropy": 0.5239543169736862, "epoch": 2.720149253731343, "grad_norm": 0.20819295942783356, "learning_rate": 0.0002, "loss": 0.5261701345443726, "mean_token_accuracy": 0.7911202013492584, "num_tokens": 11912923.0, "step": 729 }, { "entropy": 0.5407283902168274, "epoch": 2.7238805970149254, "grad_norm": 0.17276515066623688, "learning_rate": 0.0002, "loss": 0.5370129942893982, "mean_token_accuracy": 0.7848152667284012, "num_tokens": 11929303.0, "step": 730 }, { "entropy": 0.542425274848938, "epoch": 2.7276119402985075, "grad_norm": 0.25132983922958374, "learning_rate": 0.0002, "loss": 0.5359519720077515, "mean_token_accuracy": 0.7846331894397736, "num_tokens": 11945440.0, "step": 731 }, { "entropy": 0.5357621908187866, "epoch": 2.7313432835820897, "grad_norm": 0.222070574760437, "learning_rate": 0.0002, "loss": 0.5348407626152039, "mean_token_accuracy": 0.7818550020456314, "num_tokens": 11961949.0, "step": 732 }, { "entropy": 0.5185696631669998, "epoch": 2.7350746268656714, "grad_norm": 0.19711528718471527, "learning_rate": 0.0002, "loss": 0.5264403223991394, "mean_token_accuracy": 0.7884511202573776, "num_tokens": 11978063.0, "step": 733 }, { "entropy": 0.516778826713562, "epoch": 2.7388059701492535, "grad_norm": 0.24369676411151886, "learning_rate": 0.0002, "loss": 0.5253380537033081, "mean_token_accuracy": 0.7903653234243393, "num_tokens": 11994278.0, "step": 734 }, { "entropy": 0.5164884254336357, "epoch": 2.7425373134328357, "grad_norm": 0.18417784571647644, "learning_rate": 0.0002, "loss": 0.5214477181434631, "mean_token_accuracy": 0.789106622338295, "num_tokens": 12010558.0, "step": 735 }, { "entropy": 0.5068091601133347, "epoch": 2.746268656716418, "grad_norm": 0.21942751109600067, "learning_rate": 0.0002, "loss": 0.513481855392456, "mean_token_accuracy": 0.7899149656295776, "num_tokens": 12026889.0, "step": 736 }, { "entropy": 0.5316798090934753, "epoch": 2.75, "grad_norm": 0.1581851989030838, "learning_rate": 0.0002, "loss": 0.5230653285980225, "mean_token_accuracy": 0.7884569317102432, "num_tokens": 12043341.0, "step": 737 }, { "entropy": 0.539380818605423, "epoch": 2.753731343283582, "grad_norm": 0.1578167974948883, "learning_rate": 0.0002, "loss": 0.5292148590087891, "mean_token_accuracy": 0.7852563858032227, "num_tokens": 12059848.0, "step": 738 }, { "entropy": 0.5343874096870422, "epoch": 2.7574626865671643, "grad_norm": 0.19632823765277863, "learning_rate": 0.0002, "loss": 0.5295359492301941, "mean_token_accuracy": 0.783517986536026, "num_tokens": 12076134.0, "step": 739 }, { "entropy": 0.5188475027680397, "epoch": 2.7611940298507465, "grad_norm": 0.16950450837612152, "learning_rate": 0.0002, "loss": 0.521928071975708, "mean_token_accuracy": 0.7883510291576385, "num_tokens": 12092406.0, "step": 740 }, { "entropy": 0.5121756568551064, "epoch": 2.7649253731343286, "grad_norm": 0.20061862468719482, "learning_rate": 0.0002, "loss": 0.5192751884460449, "mean_token_accuracy": 0.7898274064064026, "num_tokens": 12108773.0, "step": 741 }, { "entropy": 0.5244594514369965, "epoch": 2.7686567164179103, "grad_norm": 0.16218306124210358, "learning_rate": 0.0002, "loss": 0.5296685695648193, "mean_token_accuracy": 0.7826414853334427, "num_tokens": 12125082.0, "step": 742 }, { "entropy": 0.5244700759649277, "epoch": 2.7723880597014925, "grad_norm": 0.19114060699939728, "learning_rate": 0.0002, "loss": 0.5232917070388794, "mean_token_accuracy": 0.7893050163984299, "num_tokens": 12141570.0, "step": 743 }, { "entropy": 0.5299672707915306, "epoch": 2.7761194029850746, "grad_norm": 0.15443415939807892, "learning_rate": 0.0002, "loss": 0.5207250714302063, "mean_token_accuracy": 0.7905602306127548, "num_tokens": 12157874.0, "step": 744 }, { "entropy": 0.5345348864793777, "epoch": 2.779850746268657, "grad_norm": 0.1817025989294052, "learning_rate": 0.0002, "loss": 0.5311155319213867, "mean_token_accuracy": 0.785017192363739, "num_tokens": 12174053.0, "step": 745 }, { "entropy": 0.5195724815130234, "epoch": 2.783582089552239, "grad_norm": 0.157354474067688, "learning_rate": 0.0002, "loss": 0.5159887075424194, "mean_token_accuracy": 0.790684700012207, "num_tokens": 12190613.0, "step": 746 }, { "entropy": 0.5138278231024742, "epoch": 2.7873134328358207, "grad_norm": 0.16088353097438812, "learning_rate": 0.0002, "loss": 0.5184983611106873, "mean_token_accuracy": 0.7899224907159805, "num_tokens": 12206928.0, "step": 747 }, { "entropy": 0.5161465555429459, "epoch": 2.791044776119403, "grad_norm": 0.2099459022283554, "learning_rate": 0.0002, "loss": 0.5232690572738647, "mean_token_accuracy": 0.7870688289403915, "num_tokens": 12223267.0, "step": 748 }, { "entropy": 0.5158911049365997, "epoch": 2.794776119402985, "grad_norm": 0.15817788243293762, "learning_rate": 0.0002, "loss": 0.5168994665145874, "mean_token_accuracy": 0.7899310439825058, "num_tokens": 12239601.0, "step": 749 }, { "entropy": 0.5070392489433289, "epoch": 2.798507462686567, "grad_norm": 0.2228090614080429, "learning_rate": 0.0002, "loss": 0.5200591087341309, "mean_token_accuracy": 0.7891372889280319, "num_tokens": 12256032.0, "step": 750 }, { "entropy": 0.5438189208507538, "epoch": 2.8022388059701493, "grad_norm": 0.1719558835029602, "learning_rate": 0.0002, "loss": 0.5426724553108215, "mean_token_accuracy": 0.7774887979030609, "num_tokens": 12272514.0, "step": 751 }, { "entropy": 0.519834965467453, "epoch": 2.8059701492537314, "grad_norm": 0.18933889269828796, "learning_rate": 0.0002, "loss": 0.523102343082428, "mean_token_accuracy": 0.7904316037893295, "num_tokens": 12288877.0, "step": 752 }, { "entropy": 0.512350045144558, "epoch": 2.8097014925373136, "grad_norm": 0.1864548623561859, "learning_rate": 0.0002, "loss": 0.5090078115463257, "mean_token_accuracy": 0.7945949882268906, "num_tokens": 12305044.0, "step": 753 }, { "entropy": 0.5358164459466934, "epoch": 2.8134328358208958, "grad_norm": 0.17895784974098206, "learning_rate": 0.0002, "loss": 0.5349195003509521, "mean_token_accuracy": 0.7841221541166306, "num_tokens": 12321579.0, "step": 754 }, { "entropy": 0.5124004110693932, "epoch": 2.8171641791044775, "grad_norm": 0.17669007182121277, "learning_rate": 0.0002, "loss": 0.5126450657844543, "mean_token_accuracy": 0.7929520756006241, "num_tokens": 12338186.0, "step": 755 }, { "entropy": 0.5246561616659164, "epoch": 2.8208955223880596, "grad_norm": 0.19795700907707214, "learning_rate": 0.0002, "loss": 0.5288596153259277, "mean_token_accuracy": 0.7869751006364822, "num_tokens": 12354327.0, "step": 756 }, { "entropy": 0.5311583876609802, "epoch": 2.824626865671642, "grad_norm": 0.18146470189094543, "learning_rate": 0.0002, "loss": 0.5294592976570129, "mean_token_accuracy": 0.7862387895584106, "num_tokens": 12370923.0, "step": 757 }, { "entropy": 0.5319194048643112, "epoch": 2.828358208955224, "grad_norm": 0.19238857924938202, "learning_rate": 0.0002, "loss": 0.5317291617393494, "mean_token_accuracy": 0.7854786366224289, "num_tokens": 12387257.0, "step": 758 }, { "entropy": 0.526064857840538, "epoch": 2.832089552238806, "grad_norm": 0.1526212990283966, "learning_rate": 0.0002, "loss": 0.5222187042236328, "mean_token_accuracy": 0.7932349592447281, "num_tokens": 12403635.0, "step": 759 }, { "entropy": 0.5247229933738708, "epoch": 2.835820895522388, "grad_norm": 0.2871471047401428, "learning_rate": 0.0002, "loss": 0.5314409136772156, "mean_token_accuracy": 0.7845473885536194, "num_tokens": 12420097.0, "step": 760 }, { "entropy": 0.5259681046009064, "epoch": 2.83955223880597, "grad_norm": 0.1705760359764099, "learning_rate": 0.0002, "loss": 0.5313333868980408, "mean_token_accuracy": 0.787728413939476, "num_tokens": 12436382.0, "step": 761 }, { "entropy": 0.5318069308996201, "epoch": 2.843283582089552, "grad_norm": 0.20162752270698547, "learning_rate": 0.0002, "loss": 0.5359828472137451, "mean_token_accuracy": 0.7834303081035614, "num_tokens": 12452497.0, "step": 762 }, { "entropy": 0.5508353263139725, "epoch": 2.8470149253731343, "grad_norm": 0.161021888256073, "learning_rate": 0.0002, "loss": 0.5432707667350769, "mean_token_accuracy": 0.7808051556348801, "num_tokens": 12468969.0, "step": 763 }, { "entropy": 0.5287757962942123, "epoch": 2.8507462686567164, "grad_norm": 0.2050207257270813, "learning_rate": 0.0002, "loss": 0.5284628868103027, "mean_token_accuracy": 0.7843392193317413, "num_tokens": 12485354.0, "step": 764 }, { "entropy": 0.5344215333461761, "epoch": 2.8544776119402986, "grad_norm": 0.1695808321237564, "learning_rate": 0.0002, "loss": 0.535874605178833, "mean_token_accuracy": 0.782726377248764, "num_tokens": 12501936.0, "step": 765 }, { "entropy": 0.522572860121727, "epoch": 2.8582089552238807, "grad_norm": 0.19520296156406403, "learning_rate": 0.0002, "loss": 0.5247471332550049, "mean_token_accuracy": 0.7886104881763458, "num_tokens": 12518330.0, "step": 766 }, { "entropy": 0.5314962714910507, "epoch": 2.861940298507463, "grad_norm": 0.17423976957798004, "learning_rate": 0.0002, "loss": 0.5297841429710388, "mean_token_accuracy": 0.7862118780612946, "num_tokens": 12534665.0, "step": 767 }, { "entropy": 0.5281147062778473, "epoch": 2.8656716417910446, "grad_norm": 0.18605203926563263, "learning_rate": 0.0002, "loss": 0.5324077606201172, "mean_token_accuracy": 0.787416860461235, "num_tokens": 12551009.0, "step": 768 }, { "entropy": 0.5187551081180573, "epoch": 2.8694029850746268, "grad_norm": 0.1616411954164505, "learning_rate": 0.0002, "loss": 0.512826144695282, "mean_token_accuracy": 0.7936854958534241, "num_tokens": 12567387.0, "step": 769 }, { "entropy": 0.5136809647083282, "epoch": 2.873134328358209, "grad_norm": 0.17406195402145386, "learning_rate": 0.0002, "loss": 0.5155330300331116, "mean_token_accuracy": 0.7908283174037933, "num_tokens": 12583985.0, "step": 770 }, { "entropy": 0.5185445547103882, "epoch": 2.876865671641791, "grad_norm": 0.1833800971508026, "learning_rate": 0.0002, "loss": 0.5192956328392029, "mean_token_accuracy": 0.7888920605182648, "num_tokens": 12600193.0, "step": 771 }, { "entropy": 0.5310780256986618, "epoch": 2.8805970149253732, "grad_norm": 0.2313033938407898, "learning_rate": 0.0002, "loss": 0.5360459685325623, "mean_token_accuracy": 0.7845909744501114, "num_tokens": 12616559.0, "step": 772 }, { "entropy": 0.5207322463393211, "epoch": 2.8843283582089554, "grad_norm": 0.15698477625846863, "learning_rate": 0.0002, "loss": 0.514286994934082, "mean_token_accuracy": 0.789328083395958, "num_tokens": 12633075.0, "step": 773 }, { "entropy": 0.5343746095895767, "epoch": 2.888059701492537, "grad_norm": 0.21191926300525665, "learning_rate": 0.0002, "loss": 0.5408198833465576, "mean_token_accuracy": 0.7812719643115997, "num_tokens": 12649414.0, "step": 774 }, { "entropy": 0.5095183849334717, "epoch": 2.8917910447761193, "grad_norm": 0.1665944755077362, "learning_rate": 0.0002, "loss": 0.5066861510276794, "mean_token_accuracy": 0.7972470223903656, "num_tokens": 12665839.0, "step": 775 }, { "entropy": 0.5341623723506927, "epoch": 2.8955223880597014, "grad_norm": 0.19015316665172577, "learning_rate": 0.0002, "loss": 0.5310372114181519, "mean_token_accuracy": 0.7861314713954926, "num_tokens": 12682165.0, "step": 776 }, { "entropy": 0.5464024096727371, "epoch": 2.8992537313432836, "grad_norm": 0.19810722768306732, "learning_rate": 0.0002, "loss": 0.5392264723777771, "mean_token_accuracy": 0.7843339294195175, "num_tokens": 12698576.0, "step": 777 }, { "entropy": 0.5175042897462845, "epoch": 2.9029850746268657, "grad_norm": 0.16263291239738464, "learning_rate": 0.0002, "loss": 0.5172262191772461, "mean_token_accuracy": 0.792850524187088, "num_tokens": 12714766.0, "step": 778 }, { "entropy": 0.5199488997459412, "epoch": 2.906716417910448, "grad_norm": 0.2083202749490738, "learning_rate": 0.0002, "loss": 0.5252541899681091, "mean_token_accuracy": 0.7852817475795746, "num_tokens": 12731205.0, "step": 779 }, { "entropy": 0.4941527247428894, "epoch": 2.91044776119403, "grad_norm": 0.17050482332706451, "learning_rate": 0.0002, "loss": 0.4989524185657501, "mean_token_accuracy": 0.7972326874732971, "num_tokens": 12747594.0, "step": 780 }, { "entropy": 0.5078647658228874, "epoch": 2.914179104477612, "grad_norm": 0.23199598491191864, "learning_rate": 0.0002, "loss": 0.5211161971092224, "mean_token_accuracy": 0.7884382009506226, "num_tokens": 12763932.0, "step": 781 }, { "entropy": 0.5114319175481796, "epoch": 2.917910447761194, "grad_norm": 0.2023877501487732, "learning_rate": 0.0002, "loss": 0.5166995525360107, "mean_token_accuracy": 0.7941331118345261, "num_tokens": 12780023.0, "step": 782 }, { "entropy": 0.5287023633718491, "epoch": 2.921641791044776, "grad_norm": 0.21876347064971924, "learning_rate": 0.0002, "loss": 0.5263211727142334, "mean_token_accuracy": 0.7864357531070709, "num_tokens": 12796441.0, "step": 783 }, { "entropy": 0.5223046839237213, "epoch": 2.925373134328358, "grad_norm": 0.14650550484657288, "learning_rate": 0.0002, "loss": 0.5140995979309082, "mean_token_accuracy": 0.7916091233491898, "num_tokens": 12812793.0, "step": 784 }, { "entropy": 0.5247595310211182, "epoch": 2.9291044776119404, "grad_norm": 0.25079336762428284, "learning_rate": 0.0002, "loss": 0.5263584852218628, "mean_token_accuracy": 0.786608412861824, "num_tokens": 12829172.0, "step": 785 }, { "entropy": 0.5266484171152115, "epoch": 2.9328358208955225, "grad_norm": 0.16101892292499542, "learning_rate": 0.0002, "loss": 0.5220364928245544, "mean_token_accuracy": 0.7872611582279205, "num_tokens": 12845573.0, "step": 786 }, { "entropy": 0.5139588639140129, "epoch": 2.9365671641791042, "grad_norm": 0.21128332614898682, "learning_rate": 0.0002, "loss": 0.5196605920791626, "mean_token_accuracy": 0.7880596816539764, "num_tokens": 12861897.0, "step": 787 }, { "entropy": 0.5052976161241531, "epoch": 2.9402985074626864, "grad_norm": 0.1861787587404251, "learning_rate": 0.0002, "loss": 0.5119534134864807, "mean_token_accuracy": 0.7939311414957047, "num_tokens": 12878193.0, "step": 788 }, { "entropy": 0.5310614109039307, "epoch": 2.9440298507462686, "grad_norm": 0.1857159435749054, "learning_rate": 0.0002, "loss": 0.5301690101623535, "mean_token_accuracy": 0.786168098449707, "num_tokens": 12894935.0, "step": 789 }, { "entropy": 0.5310661867260933, "epoch": 2.9477611940298507, "grad_norm": 0.18339301645755768, "learning_rate": 0.0002, "loss": 0.5257419347763062, "mean_token_accuracy": 0.788611650466919, "num_tokens": 12911289.0, "step": 790 }, { "entropy": 0.5245337337255478, "epoch": 2.951492537313433, "grad_norm": 0.17652840912342072, "learning_rate": 0.0002, "loss": 0.5265839099884033, "mean_token_accuracy": 0.7901091575622559, "num_tokens": 12927670.0, "step": 791 }, { "entropy": 0.5245234072208405, "epoch": 2.955223880597015, "grad_norm": 0.17611214518547058, "learning_rate": 0.0002, "loss": 0.5243083834648132, "mean_token_accuracy": 0.7856577485799789, "num_tokens": 12944015.0, "step": 792 }, { "entropy": 0.5191880911588669, "epoch": 2.958955223880597, "grad_norm": 0.18345631659030914, "learning_rate": 0.0002, "loss": 0.5257253050804138, "mean_token_accuracy": 0.7881710231304169, "num_tokens": 12960131.0, "step": 793 }, { "entropy": 0.5140431523323059, "epoch": 2.9626865671641793, "grad_norm": 0.2098158448934555, "learning_rate": 0.0002, "loss": 0.5169271230697632, "mean_token_accuracy": 0.786968320608139, "num_tokens": 12976187.0, "step": 794 }, { "entropy": 0.5335211008787155, "epoch": 2.966417910447761, "grad_norm": 0.15838965773582458, "learning_rate": 0.0002, "loss": 0.5324181318283081, "mean_token_accuracy": 0.7819865345954895, "num_tokens": 12992461.0, "step": 795 }, { "entropy": 0.5252291113138199, "epoch": 2.970149253731343, "grad_norm": 0.19166119396686554, "learning_rate": 0.0002, "loss": 0.5205749869346619, "mean_token_accuracy": 0.7911773473024368, "num_tokens": 13008737.0, "step": 796 }, { "entropy": 0.5154759585857391, "epoch": 2.9738805970149254, "grad_norm": 0.16444922983646393, "learning_rate": 0.0002, "loss": 0.5141779184341431, "mean_token_accuracy": 0.7922156006097794, "num_tokens": 13025092.0, "step": 797 }, { "entropy": 0.5257436707615852, "epoch": 2.9776119402985075, "grad_norm": 0.19890975952148438, "learning_rate": 0.0002, "loss": 0.5353443622589111, "mean_token_accuracy": 0.7844508290290833, "num_tokens": 13041631.0, "step": 798 }, { "entropy": 0.5554878115653992, "epoch": 2.9813432835820897, "grad_norm": 0.19347697496414185, "learning_rate": 0.0002, "loss": 0.5568645596504211, "mean_token_accuracy": 0.7741395682096481, "num_tokens": 13058045.0, "step": 799 }, { "entropy": 0.5262391567230225, "epoch": 2.9850746268656714, "grad_norm": 0.17874093353748322, "learning_rate": 0.0002, "loss": 0.5202043056488037, "mean_token_accuracy": 0.7870875149965286, "num_tokens": 13074443.0, "step": 800 }, { "entropy": 0.5318054854869843, "epoch": 2.9888059701492535, "grad_norm": 0.182646706700325, "learning_rate": 0.0002, "loss": 0.5253685712814331, "mean_token_accuracy": 0.786090537905693, "num_tokens": 13090582.0, "step": 801 }, { "entropy": 0.5484406352043152, "epoch": 2.9925373134328357, "grad_norm": 0.15745747089385986, "learning_rate": 0.0002, "loss": 0.5452413558959961, "mean_token_accuracy": 0.7798783183097839, "num_tokens": 13106832.0, "step": 802 }, { "entropy": 0.527185246348381, "epoch": 2.996268656716418, "grad_norm": 0.1789730340242386, "learning_rate": 0.0002, "loss": 0.5218254923820496, "mean_token_accuracy": 0.7895842045545578, "num_tokens": 13123002.0, "step": 803 }, { "entropy": 0.5108470022678375, "epoch": 3.0, "grad_norm": 0.1871774047613144, "learning_rate": 0.0002, "loss": 0.5190352201461792, "mean_token_accuracy": 0.7890540361404419, "num_tokens": 13139156.0, "step": 804 } ], "logging_steps": 1, "max_steps": 804, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2242940510926275e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }