diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8014 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 798, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 1.1405175626277924, + "epoch": 0.0037593984962406013, + "grad_norm": 0.40029582381248474, + "learning_rate": 0.0002, + "loss": 2.4748640060424805, + "mean_token_accuracy": 0.5338118821382523, + "num_tokens": 16246.0, + "step": 1 + }, + { + "entropy": 1.2275302708148956, + "epoch": 0.007518796992481203, + "grad_norm": 0.36828649044036865, + "learning_rate": 0.0002, + "loss": 2.125943660736084, + "mean_token_accuracy": 0.5713680684566498, + "num_tokens": 32716.0, + "step": 2 + }, + { + "entropy": 1.4195487797260284, + "epoch": 0.011278195488721804, + "grad_norm": 0.29105839133262634, + "learning_rate": 0.0002, + "loss": 1.735130786895752, + "mean_token_accuracy": 0.5909573882818222, + "num_tokens": 48967.0, + "step": 3 + }, + { + "entropy": 1.3783348500728607, + "epoch": 0.015037593984962405, + "grad_norm": 0.2323397547006607, + "learning_rate": 0.0002, + "loss": 1.4040782451629639, + "mean_token_accuracy": 0.6318088620901108, + "num_tokens": 65467.0, + "step": 4 + }, + { + "entropy": 1.3656240701675415, + "epoch": 0.018796992481203006, + "grad_norm": 0.2868480384349823, + "learning_rate": 0.0002, + "loss": 1.3035261631011963, + "mean_token_accuracy": 0.6341304779052734, + "num_tokens": 81665.0, + "step": 5 + }, + { + "entropy": 1.264964371919632, + "epoch": 0.022556390977443608, + "grad_norm": 0.14605936408042908, + "learning_rate": 0.0002, + "loss": 1.1722630262374878, + "mean_token_accuracy": 0.6646067351102829, + "num_tokens": 97913.0, + "step": 6 + }, + { + "entropy": 1.1983447670936584, + "epoch": 0.02631578947368421, + "grad_norm": 0.10632229596376419, + "learning_rate": 0.0002, + "loss": 1.1054309606552124, + "mean_token_accuracy": 0.6686217486858368, + "num_tokens": 113953.0, + "step": 7 + }, + { + "entropy": 1.1218359470367432, + "epoch": 0.03007518796992481, + "grad_norm": 0.09761745482683182, + "learning_rate": 0.0002, + "loss": 1.0230426788330078, + "mean_token_accuracy": 0.676657035946846, + "num_tokens": 130177.0, + "step": 8 + }, + { + "entropy": 1.0549319684505463, + "epoch": 0.03383458646616541, + "grad_norm": 0.1231616735458374, + "learning_rate": 0.0002, + "loss": 0.9938599467277527, + "mean_token_accuracy": 0.6875758469104767, + "num_tokens": 146621.0, + "step": 9 + }, + { + "entropy": 0.987179160118103, + "epoch": 0.03759398496240601, + "grad_norm": 0.11966806650161743, + "learning_rate": 0.0002, + "loss": 0.9243900775909424, + "mean_token_accuracy": 0.6994709670543671, + "num_tokens": 162843.0, + "step": 10 + }, + { + "entropy": 0.935651957988739, + "epoch": 0.041353383458646614, + "grad_norm": 0.10380394756793976, + "learning_rate": 0.0002, + "loss": 0.866508960723877, + "mean_token_accuracy": 0.7096800655126572, + "num_tokens": 179313.0, + "step": 11 + }, + { + "entropy": 0.9110619872808456, + "epoch": 0.045112781954887216, + "grad_norm": 0.10094986110925674, + "learning_rate": 0.0002, + "loss": 0.832156240940094, + "mean_token_accuracy": 0.7104088068008423, + "num_tokens": 195785.0, + "step": 12 + }, + { + "entropy": 0.855834111571312, + "epoch": 0.04887218045112782, + "grad_norm": 0.37487563490867615, + "learning_rate": 0.0002, + "loss": 0.8014079332351685, + "mean_token_accuracy": 0.7197864800691605, + "num_tokens": 212026.0, + "step": 13 + }, + { + "entropy": 0.7773148268461227, + "epoch": 0.05263157894736842, + "grad_norm": 0.09044307470321655, + "learning_rate": 0.0002, + "loss": 0.7479192614555359, + "mean_token_accuracy": 0.7304967045783997, + "num_tokens": 228294.0, + "step": 14 + }, + { + "entropy": 0.7414887696504593, + "epoch": 0.05639097744360902, + "grad_norm": 0.11246141791343689, + "learning_rate": 0.0002, + "loss": 0.7355879545211792, + "mean_token_accuracy": 0.7314187586307526, + "num_tokens": 244681.0, + "step": 15 + }, + { + "entropy": 0.7010335773229599, + "epoch": 0.06015037593984962, + "grad_norm": 0.11098679155111313, + "learning_rate": 0.0002, + "loss": 0.6920604109764099, + "mean_token_accuracy": 0.7372281551361084, + "num_tokens": 261053.0, + "step": 16 + }, + { + "entropy": 0.6938799321651459, + "epoch": 0.06390977443609022, + "grad_norm": 0.08114200830459595, + "learning_rate": 0.0002, + "loss": 0.6897510886192322, + "mean_token_accuracy": 0.7408226281404495, + "num_tokens": 277338.0, + "step": 17 + }, + { + "entropy": 0.6835978478193283, + "epoch": 0.06766917293233082, + "grad_norm": 0.08077364414930344, + "learning_rate": 0.0002, + "loss": 0.6768285632133484, + "mean_token_accuracy": 0.740087628364563, + "num_tokens": 293709.0, + "step": 18 + }, + { + "entropy": 0.6589517742395401, + "epoch": 0.07142857142857142, + "grad_norm": 0.0879955068230629, + "learning_rate": 0.0002, + "loss": 0.65667724609375, + "mean_token_accuracy": 0.7443644404411316, + "num_tokens": 310128.0, + "step": 19 + }, + { + "entropy": 0.6506444960832596, + "epoch": 0.07518796992481203, + "grad_norm": 0.080411896109581, + "learning_rate": 0.0002, + "loss": 0.641387403011322, + "mean_token_accuracy": 0.7495939880609512, + "num_tokens": 326607.0, + "step": 20 + }, + { + "entropy": 0.6619953960180283, + "epoch": 0.07894736842105263, + "grad_norm": 0.0845642164349556, + "learning_rate": 0.0002, + "loss": 0.6475294232368469, + "mean_token_accuracy": 0.7457321733236313, + "num_tokens": 342774.0, + "step": 21 + }, + { + "entropy": 0.6577392071485519, + "epoch": 0.08270676691729323, + "grad_norm": 0.07965292036533356, + "learning_rate": 0.0002, + "loss": 0.6407521367073059, + "mean_token_accuracy": 0.7490587830543518, + "num_tokens": 359099.0, + "step": 22 + }, + { + "entropy": 0.6155381500720978, + "epoch": 0.08646616541353383, + "grad_norm": 0.07591664046049118, + "learning_rate": 0.0002, + "loss": 0.6092519760131836, + "mean_token_accuracy": 0.7603109776973724, + "num_tokens": 375179.0, + "step": 23 + }, + { + "entropy": 0.5885609835386276, + "epoch": 0.09022556390977443, + "grad_norm": 0.06627360731363297, + "learning_rate": 0.0002, + "loss": 0.5951059460639954, + "mean_token_accuracy": 0.7678095996379852, + "num_tokens": 391354.0, + "step": 24 + }, + { + "entropy": 0.5992416590452194, + "epoch": 0.09398496240601503, + "grad_norm": 0.08137614279985428, + "learning_rate": 0.0002, + "loss": 0.6067847013473511, + "mean_token_accuracy": 0.7620100975036621, + "num_tokens": 407719.0, + "step": 25 + }, + { + "entropy": 0.6116904020309448, + "epoch": 0.09774436090225563, + "grad_norm": 0.06891811639070511, + "learning_rate": 0.0002, + "loss": 0.6175057888031006, + "mean_token_accuracy": 0.7556122690439224, + "num_tokens": 424041.0, + "step": 26 + }, + { + "entropy": 0.6106788814067841, + "epoch": 0.10150375939849623, + "grad_norm": 0.059570278972387314, + "learning_rate": 0.0002, + "loss": 0.5937588214874268, + "mean_token_accuracy": 0.7666491121053696, + "num_tokens": 440295.0, + "step": 27 + }, + { + "entropy": 0.6181164085865021, + "epoch": 0.10526315789473684, + "grad_norm": 0.07394946366548538, + "learning_rate": 0.0002, + "loss": 0.6043965220451355, + "mean_token_accuracy": 0.7635089755058289, + "num_tokens": 456614.0, + "step": 28 + }, + { + "entropy": 0.6283685266971588, + "epoch": 0.10902255639097744, + "grad_norm": 0.07618279755115509, + "learning_rate": 0.0002, + "loss": 0.6195181608200073, + "mean_token_accuracy": 0.752281054854393, + "num_tokens": 472965.0, + "step": 29 + }, + { + "entropy": 0.5851932466030121, + "epoch": 0.11278195488721804, + "grad_norm": 0.05518079921603203, + "learning_rate": 0.0002, + "loss": 0.5881266593933105, + "mean_token_accuracy": 0.7650770843029022, + "num_tokens": 489391.0, + "step": 30 + }, + { + "entropy": 0.5895522385835648, + "epoch": 0.11654135338345864, + "grad_norm": 0.06688102334737778, + "learning_rate": 0.0002, + "loss": 0.6028741002082825, + "mean_token_accuracy": 0.7601553350687027, + "num_tokens": 505837.0, + "step": 31 + }, + { + "entropy": 0.5878616869449615, + "epoch": 0.12030075187969924, + "grad_norm": 0.059780046343803406, + "learning_rate": 0.0002, + "loss": 0.6033408045768738, + "mean_token_accuracy": 0.7582006454467773, + "num_tokens": 522243.0, + "step": 32 + }, + { + "entropy": 0.5838498622179031, + "epoch": 0.12406015037593984, + "grad_norm": 0.04929976165294647, + "learning_rate": 0.0002, + "loss": 0.5896713137626648, + "mean_token_accuracy": 0.761729434132576, + "num_tokens": 538731.0, + "step": 33 + }, + { + "entropy": 0.5691559016704559, + "epoch": 0.12781954887218044, + "grad_norm": 0.06266291439533234, + "learning_rate": 0.0002, + "loss": 0.5734342932701111, + "mean_token_accuracy": 0.7672057747840881, + "num_tokens": 554848.0, + "step": 34 + }, + { + "entropy": 0.5915598571300507, + "epoch": 0.13157894736842105, + "grad_norm": 0.06152564287185669, + "learning_rate": 0.0002, + "loss": 0.5912453532218933, + "mean_token_accuracy": 0.7633904218673706, + "num_tokens": 571057.0, + "step": 35 + }, + { + "entropy": 0.597556471824646, + "epoch": 0.13533834586466165, + "grad_norm": 0.04998990520834923, + "learning_rate": 0.0002, + "loss": 0.5882090330123901, + "mean_token_accuracy": 0.7643049657344818, + "num_tokens": 587326.0, + "step": 36 + }, + { + "entropy": 0.5905885845422745, + "epoch": 0.13909774436090225, + "grad_norm": 0.049017250537872314, + "learning_rate": 0.0002, + "loss": 0.5855776071548462, + "mean_token_accuracy": 0.7655442655086517, + "num_tokens": 603538.0, + "step": 37 + }, + { + "entropy": 0.586976170539856, + "epoch": 0.14285714285714285, + "grad_norm": 0.046413078904151917, + "learning_rate": 0.0002, + "loss": 0.5790608525276184, + "mean_token_accuracy": 0.767949789762497, + "num_tokens": 619734.0, + "step": 38 + }, + { + "entropy": 0.5844197869300842, + "epoch": 0.14661654135338345, + "grad_norm": 0.04495161026716232, + "learning_rate": 0.0002, + "loss": 0.5842206478118896, + "mean_token_accuracy": 0.7648505717515945, + "num_tokens": 636104.0, + "step": 39 + }, + { + "entropy": 0.5523269921541214, + "epoch": 0.15037593984962405, + "grad_norm": 0.04233352467417717, + "learning_rate": 0.0002, + "loss": 0.5523208975791931, + "mean_token_accuracy": 0.7776841074228287, + "num_tokens": 652478.0, + "step": 40 + }, + { + "entropy": 0.569878563284874, + "epoch": 0.15413533834586465, + "grad_norm": 0.04850724712014198, + "learning_rate": 0.0002, + "loss": 0.5725483298301697, + "mean_token_accuracy": 0.7687844336032867, + "num_tokens": 669008.0, + "step": 41 + }, + { + "entropy": 0.5655312091112137, + "epoch": 0.15789473684210525, + "grad_norm": 0.04192538931965828, + "learning_rate": 0.0002, + "loss": 0.5679923892021179, + "mean_token_accuracy": 0.7717834413051605, + "num_tokens": 685165.0, + "step": 42 + }, + { + "entropy": 0.5601242333650589, + "epoch": 0.16165413533834586, + "grad_norm": 0.042079195380210876, + "learning_rate": 0.0002, + "loss": 0.5594381093978882, + "mean_token_accuracy": 0.7740506827831268, + "num_tokens": 701529.0, + "step": 43 + }, + { + "entropy": 0.575413703918457, + "epoch": 0.16541353383458646, + "grad_norm": 0.04416325315833092, + "learning_rate": 0.0002, + "loss": 0.5747635364532471, + "mean_token_accuracy": 0.7721781879663467, + "num_tokens": 717922.0, + "step": 44 + }, + { + "entropy": 0.5668691843748093, + "epoch": 0.16917293233082706, + "grad_norm": 0.05360032618045807, + "learning_rate": 0.0002, + "loss": 0.5617860555648804, + "mean_token_accuracy": 0.7762805074453354, + "num_tokens": 733933.0, + "step": 45 + }, + { + "entropy": 0.5761540979146957, + "epoch": 0.17293233082706766, + "grad_norm": 0.040452998131513596, + "learning_rate": 0.0002, + "loss": 0.5704891085624695, + "mean_token_accuracy": 0.7709734439849854, + "num_tokens": 750555.0, + "step": 46 + }, + { + "entropy": 0.5610938370227814, + "epoch": 0.17669172932330826, + "grad_norm": 0.04221005737781525, + "learning_rate": 0.0002, + "loss": 0.5613417029380798, + "mean_token_accuracy": 0.7761952430009842, + "num_tokens": 766693.0, + "step": 47 + }, + { + "entropy": 0.5707991421222687, + "epoch": 0.18045112781954886, + "grad_norm": 0.03976718708872795, + "learning_rate": 0.0002, + "loss": 0.5678077936172485, + "mean_token_accuracy": 0.7737486809492111, + "num_tokens": 783330.0, + "step": 48 + }, + { + "entropy": 0.5475099235773087, + "epoch": 0.18421052631578946, + "grad_norm": 0.04141751676797867, + "learning_rate": 0.0002, + "loss": 0.5536777973175049, + "mean_token_accuracy": 0.7761508077383041, + "num_tokens": 799528.0, + "step": 49 + }, + { + "entropy": 0.5602568089962006, + "epoch": 0.18796992481203006, + "grad_norm": 0.04497222229838371, + "learning_rate": 0.0002, + "loss": 0.5695174336433411, + "mean_token_accuracy": 0.7716410309076309, + "num_tokens": 815957.0, + "step": 50 + }, + { + "entropy": 0.5643552988767624, + "epoch": 0.19172932330827067, + "grad_norm": 0.041956499218940735, + "learning_rate": 0.0002, + "loss": 0.5748574733734131, + "mean_token_accuracy": 0.7680526673793793, + "num_tokens": 832365.0, + "step": 51 + }, + { + "entropy": 0.5510173141956329, + "epoch": 0.19548872180451127, + "grad_norm": 0.04074239730834961, + "learning_rate": 0.0002, + "loss": 0.5555428266525269, + "mean_token_accuracy": 0.775487020611763, + "num_tokens": 848532.0, + "step": 52 + }, + { + "entropy": 0.5738573223352432, + "epoch": 0.19924812030075187, + "grad_norm": 0.036227982491254807, + "learning_rate": 0.0002, + "loss": 0.5651305913925171, + "mean_token_accuracy": 0.7725107222795486, + "num_tokens": 864646.0, + "step": 53 + }, + { + "entropy": 0.5808417797088623, + "epoch": 0.20300751879699247, + "grad_norm": 0.03816494345664978, + "learning_rate": 0.0002, + "loss": 0.5638910531997681, + "mean_token_accuracy": 0.7741686254739761, + "num_tokens": 881239.0, + "step": 54 + }, + { + "entropy": 0.5693863034248352, + "epoch": 0.20676691729323307, + "grad_norm": 0.035037554800510406, + "learning_rate": 0.0002, + "loss": 0.5701916813850403, + "mean_token_accuracy": 0.7687424123287201, + "num_tokens": 897601.0, + "step": 55 + }, + { + "entropy": 0.5595564395189285, + "epoch": 0.21052631578947367, + "grad_norm": 0.038008302450180054, + "learning_rate": 0.0002, + "loss": 0.5662519931793213, + "mean_token_accuracy": 0.7714412808418274, + "num_tokens": 914184.0, + "step": 56 + }, + { + "entropy": 0.5745149552822113, + "epoch": 0.21428571428571427, + "grad_norm": 0.03566848114132881, + "learning_rate": 0.0002, + "loss": 0.5779574513435364, + "mean_token_accuracy": 0.7686354070901871, + "num_tokens": 930380.0, + "step": 57 + }, + { + "entropy": 0.5675694793462753, + "epoch": 0.21804511278195488, + "grad_norm": 0.03368304297327995, + "learning_rate": 0.0002, + "loss": 0.5728892087936401, + "mean_token_accuracy": 0.7670125216245651, + "num_tokens": 946749.0, + "step": 58 + }, + { + "entropy": 0.5651668012142181, + "epoch": 0.22180451127819548, + "grad_norm": 0.035859547555446625, + "learning_rate": 0.0002, + "loss": 0.5706139802932739, + "mean_token_accuracy": 0.7697967290878296, + "num_tokens": 963053.0, + "step": 59 + }, + { + "entropy": 0.5670004636049271, + "epoch": 0.22556390977443608, + "grad_norm": 0.03998008742928505, + "learning_rate": 0.0002, + "loss": 0.5656613111495972, + "mean_token_accuracy": 0.7728914767503738, + "num_tokens": 979368.0, + "step": 60 + }, + { + "entropy": 0.5696548968553543, + "epoch": 0.22932330827067668, + "grad_norm": 0.04078423231840134, + "learning_rate": 0.0002, + "loss": 0.5716832280158997, + "mean_token_accuracy": 0.7699559330940247, + "num_tokens": 995406.0, + "step": 61 + }, + { + "entropy": 0.590179905295372, + "epoch": 0.23308270676691728, + "grad_norm": 0.0332336388528347, + "learning_rate": 0.0002, + "loss": 0.5876976847648621, + "mean_token_accuracy": 0.7626538276672363, + "num_tokens": 1011804.0, + "step": 62 + }, + { + "entropy": 0.5567612648010254, + "epoch": 0.23684210526315788, + "grad_norm": 0.033585552126169205, + "learning_rate": 0.0002, + "loss": 0.552665650844574, + "mean_token_accuracy": 0.7773807644844055, + "num_tokens": 1027984.0, + "step": 63 + }, + { + "entropy": 0.5729009807109833, + "epoch": 0.24060150375939848, + "grad_norm": 0.037177689373493195, + "learning_rate": 0.0002, + "loss": 0.5675500631332397, + "mean_token_accuracy": 0.7715246975421906, + "num_tokens": 1044274.0, + "step": 64 + }, + { + "entropy": 0.5565147399902344, + "epoch": 0.24436090225563908, + "grad_norm": 0.034301500767469406, + "learning_rate": 0.0002, + "loss": 0.5531203150749207, + "mean_token_accuracy": 0.7778400331735611, + "num_tokens": 1060650.0, + "step": 65 + }, + { + "entropy": 0.5595405846834183, + "epoch": 0.24812030075187969, + "grad_norm": 0.032111674547195435, + "learning_rate": 0.0002, + "loss": 0.5613226294517517, + "mean_token_accuracy": 0.7748188674449921, + "num_tokens": 1077082.0, + "step": 66 + }, + { + "entropy": 0.5684429109096527, + "epoch": 0.2518796992481203, + "grad_norm": 0.036634527146816254, + "learning_rate": 0.0002, + "loss": 0.5726494789123535, + "mean_token_accuracy": 0.7709641754627228, + "num_tokens": 1093328.0, + "step": 67 + }, + { + "entropy": 0.5331402271986008, + "epoch": 0.2556390977443609, + "grad_norm": 0.03533982113003731, + "learning_rate": 0.0002, + "loss": 0.5389207601547241, + "mean_token_accuracy": 0.7816744297742844, + "num_tokens": 1109550.0, + "step": 68 + }, + { + "entropy": 0.5601552575826645, + "epoch": 0.2593984962406015, + "grad_norm": 0.03249680623412132, + "learning_rate": 0.0002, + "loss": 0.5670143961906433, + "mean_token_accuracy": 0.7690982818603516, + "num_tokens": 1125670.0, + "step": 69 + }, + { + "entropy": 0.5491845458745956, + "epoch": 0.2631578947368421, + "grad_norm": 0.03275011479854584, + "learning_rate": 0.0002, + "loss": 0.5448943972587585, + "mean_token_accuracy": 0.7807547152042389, + "num_tokens": 1141797.0, + "step": 70 + }, + { + "entropy": 0.5585113912820816, + "epoch": 0.2669172932330827, + "grad_norm": 0.03664859011769295, + "learning_rate": 0.0002, + "loss": 0.560217022895813, + "mean_token_accuracy": 0.7755073606967926, + "num_tokens": 1158252.0, + "step": 71 + }, + { + "entropy": 0.5534943342208862, + "epoch": 0.2706766917293233, + "grad_norm": 0.03374176472425461, + "learning_rate": 0.0002, + "loss": 0.5520960688591003, + "mean_token_accuracy": 0.7764160335063934, + "num_tokens": 1174369.0, + "step": 72 + }, + { + "entropy": 0.5600117444992065, + "epoch": 0.2744360902255639, + "grad_norm": 0.033763986080884933, + "learning_rate": 0.0002, + "loss": 0.5588683485984802, + "mean_token_accuracy": 0.7761770337820053, + "num_tokens": 1190928.0, + "step": 73 + }, + { + "entropy": 0.5625056624412537, + "epoch": 0.2781954887218045, + "grad_norm": 0.034332193434238434, + "learning_rate": 0.0002, + "loss": 0.5600336790084839, + "mean_token_accuracy": 0.7748808860778809, + "num_tokens": 1207372.0, + "step": 74 + }, + { + "entropy": 0.5520483404397964, + "epoch": 0.2819548872180451, + "grad_norm": 0.03450694680213928, + "learning_rate": 0.0002, + "loss": 0.5558054447174072, + "mean_token_accuracy": 0.7750240415334702, + "num_tokens": 1223643.0, + "step": 75 + }, + { + "entropy": 0.5441252887248993, + "epoch": 0.2857142857142857, + "grad_norm": 0.03436208888888359, + "learning_rate": 0.0002, + "loss": 0.5533716678619385, + "mean_token_accuracy": 0.7759858965873718, + "num_tokens": 1239688.0, + "step": 76 + }, + { + "entropy": 0.5603705495595932, + "epoch": 0.2894736842105263, + "grad_norm": 0.03493620082736015, + "learning_rate": 0.0002, + "loss": 0.5694956183433533, + "mean_token_accuracy": 0.7717721164226532, + "num_tokens": 1255884.0, + "step": 77 + }, + { + "entropy": 0.5612094402313232, + "epoch": 0.2932330827067669, + "grad_norm": 0.03372187912464142, + "learning_rate": 0.0002, + "loss": 0.5608274936676025, + "mean_token_accuracy": 0.7747389078140259, + "num_tokens": 1271939.0, + "step": 78 + }, + { + "entropy": 0.5706307291984558, + "epoch": 0.29699248120300753, + "grad_norm": 0.0331907719373703, + "learning_rate": 0.0002, + "loss": 0.5624843239784241, + "mean_token_accuracy": 0.7734071314334869, + "num_tokens": 1288328.0, + "step": 79 + }, + { + "entropy": 0.5670299082994461, + "epoch": 0.3007518796992481, + "grad_norm": 0.033556245267391205, + "learning_rate": 0.0002, + "loss": 0.560691237449646, + "mean_token_accuracy": 0.7734449654817581, + "num_tokens": 1304760.0, + "step": 80 + }, + { + "entropy": 0.5619105398654938, + "epoch": 0.30451127819548873, + "grad_norm": 0.034520749002695084, + "learning_rate": 0.0002, + "loss": 0.5578286647796631, + "mean_token_accuracy": 0.774708941578865, + "num_tokens": 1321100.0, + "step": 81 + }, + { + "entropy": 0.5670763552188873, + "epoch": 0.3082706766917293, + "grad_norm": 0.04056672751903534, + "learning_rate": 0.0002, + "loss": 0.5737652778625488, + "mean_token_accuracy": 0.76849165558815, + "num_tokens": 1337796.0, + "step": 82 + }, + { + "entropy": 0.5314440876245499, + "epoch": 0.31203007518796994, + "grad_norm": 0.03262212499976158, + "learning_rate": 0.0002, + "loss": 0.535086989402771, + "mean_token_accuracy": 0.7845727354288101, + "num_tokens": 1354331.0, + "step": 83 + }, + { + "entropy": 0.5603013932704926, + "epoch": 0.3157894736842105, + "grad_norm": 0.036167021840810776, + "learning_rate": 0.0002, + "loss": 0.5675747394561768, + "mean_token_accuracy": 0.771581381559372, + "num_tokens": 1370543.0, + "step": 84 + }, + { + "entropy": 0.5526834577322006, + "epoch": 0.31954887218045114, + "grad_norm": 0.03807472810149193, + "learning_rate": 0.0002, + "loss": 0.5507928729057312, + "mean_token_accuracy": 0.7803521752357483, + "num_tokens": 1386874.0, + "step": 85 + }, + { + "entropy": 0.5730793476104736, + "epoch": 0.3233082706766917, + "grad_norm": 0.03474927321076393, + "learning_rate": 0.0002, + "loss": 0.5660271644592285, + "mean_token_accuracy": 0.7727594673633575, + "num_tokens": 1403110.0, + "step": 86 + }, + { + "entropy": 0.563334196805954, + "epoch": 0.32706766917293234, + "grad_norm": 0.03167711943387985, + "learning_rate": 0.0002, + "loss": 0.56499844789505, + "mean_token_accuracy": 0.7736751586198807, + "num_tokens": 1419614.0, + "step": 87 + }, + { + "entropy": 0.5451017022132874, + "epoch": 0.3308270676691729, + "grad_norm": 0.03233160078525543, + "learning_rate": 0.0002, + "loss": 0.5535646677017212, + "mean_token_accuracy": 0.7740109711885452, + "num_tokens": 1436028.0, + "step": 88 + }, + { + "entropy": 0.5493156313896179, + "epoch": 0.33458646616541354, + "grad_norm": 0.039253026247024536, + "learning_rate": 0.0002, + "loss": 0.5615313649177551, + "mean_token_accuracy": 0.7725273966789246, + "num_tokens": 1452644.0, + "step": 89 + }, + { + "entropy": 0.5737167149782181, + "epoch": 0.3383458646616541, + "grad_norm": 0.032968465238809586, + "learning_rate": 0.0002, + "loss": 0.5743820667266846, + "mean_token_accuracy": 0.7698662877082825, + "num_tokens": 1469108.0, + "step": 90 + }, + { + "entropy": 0.5741334408521652, + "epoch": 0.34210526315789475, + "grad_norm": 0.040047451853752136, + "learning_rate": 0.0002, + "loss": 0.5673686265945435, + "mean_token_accuracy": 0.7704142928123474, + "num_tokens": 1485445.0, + "step": 91 + }, + { + "entropy": 0.5617086589336395, + "epoch": 0.3458646616541353, + "grad_norm": 0.03181539848446846, + "learning_rate": 0.0002, + "loss": 0.5534920692443848, + "mean_token_accuracy": 0.7758883982896805, + "num_tokens": 1501801.0, + "step": 92 + }, + { + "entropy": 0.5597693920135498, + "epoch": 0.34962406015037595, + "grad_norm": 0.03365252912044525, + "learning_rate": 0.0002, + "loss": 0.5625807046890259, + "mean_token_accuracy": 0.7725406587123871, + "num_tokens": 1518047.0, + "step": 93 + }, + { + "entropy": 0.5496240109205246, + "epoch": 0.3533834586466165, + "grad_norm": 0.0320061519742012, + "learning_rate": 0.0002, + "loss": 0.5572867393493652, + "mean_token_accuracy": 0.7759815156459808, + "num_tokens": 1534447.0, + "step": 94 + }, + { + "entropy": 0.5630564987659454, + "epoch": 0.35714285714285715, + "grad_norm": 0.03503059223294258, + "learning_rate": 0.0002, + "loss": 0.5757870674133301, + "mean_token_accuracy": 0.766523465514183, + "num_tokens": 1550660.0, + "step": 95 + }, + { + "entropy": 0.5605316013097763, + "epoch": 0.3609022556390977, + "grad_norm": 0.032678134739398956, + "learning_rate": 0.0002, + "loss": 0.5634536743164062, + "mean_token_accuracy": 0.7716304063796997, + "num_tokens": 1566883.0, + "step": 96 + }, + { + "entropy": 0.5838266015052795, + "epoch": 0.36466165413533835, + "grad_norm": 0.030517758801579475, + "learning_rate": 0.0002, + "loss": 0.5759112238883972, + "mean_token_accuracy": 0.7689571380615234, + "num_tokens": 1583221.0, + "step": 97 + }, + { + "entropy": 0.575135201215744, + "epoch": 0.3684210526315789, + "grad_norm": 0.03620682284235954, + "learning_rate": 0.0002, + "loss": 0.5637581944465637, + "mean_token_accuracy": 0.7740969359874725, + "num_tokens": 1599392.0, + "step": 98 + }, + { + "entropy": 0.5724876075983047, + "epoch": 0.37218045112781956, + "grad_norm": 0.029337450861930847, + "learning_rate": 0.0002, + "loss": 0.5643174052238464, + "mean_token_accuracy": 0.77228944003582, + "num_tokens": 1615899.0, + "step": 99 + }, + { + "entropy": 0.5502088665962219, + "epoch": 0.37593984962406013, + "grad_norm": 0.03381618484854698, + "learning_rate": 0.0002, + "loss": 0.5598064661026001, + "mean_token_accuracy": 0.7747711390256882, + "num_tokens": 1632274.0, + "step": 100 + }, + { + "entropy": 0.5598712712526321, + "epoch": 0.37969924812030076, + "grad_norm": 0.03598952665925026, + "learning_rate": 0.0002, + "loss": 0.5719908475875854, + "mean_token_accuracy": 0.7700261324644089, + "num_tokens": 1648688.0, + "step": 101 + }, + { + "entropy": 0.5630699545145035, + "epoch": 0.38345864661654133, + "grad_norm": 0.031423430889844894, + "learning_rate": 0.0002, + "loss": 0.565830409526825, + "mean_token_accuracy": 0.7715611904859543, + "num_tokens": 1665258.0, + "step": 102 + }, + { + "entropy": 0.5845702290534973, + "epoch": 0.38721804511278196, + "grad_norm": 0.02941996045410633, + "learning_rate": 0.0002, + "loss": 0.5816816687583923, + "mean_token_accuracy": 0.7648696899414062, + "num_tokens": 1681639.0, + "step": 103 + }, + { + "entropy": 0.57722607254982, + "epoch": 0.39097744360902253, + "grad_norm": 0.034051019698381424, + "learning_rate": 0.0002, + "loss": 0.5756963491439819, + "mean_token_accuracy": 0.7672083526849747, + "num_tokens": 1698010.0, + "step": 104 + }, + { + "entropy": 0.5672426074743271, + "epoch": 0.39473684210526316, + "grad_norm": 0.03516025468707085, + "learning_rate": 0.0002, + "loss": 0.5597167015075684, + "mean_token_accuracy": 0.7757037431001663, + "num_tokens": 1714351.0, + "step": 105 + }, + { + "entropy": 0.5414413064718246, + "epoch": 0.39849624060150374, + "grad_norm": 0.03341100364923477, + "learning_rate": 0.0002, + "loss": 0.5480563640594482, + "mean_token_accuracy": 0.7781668901443481, + "num_tokens": 1730536.0, + "step": 106 + }, + { + "entropy": 0.5462717562913895, + "epoch": 0.40225563909774437, + "grad_norm": 0.03385477513074875, + "learning_rate": 0.0002, + "loss": 0.5512043833732605, + "mean_token_accuracy": 0.7787721008062363, + "num_tokens": 1746896.0, + "step": 107 + }, + { + "entropy": 0.5501613169908524, + "epoch": 0.40601503759398494, + "grad_norm": 0.035874005407094955, + "learning_rate": 0.0002, + "loss": 0.561366081237793, + "mean_token_accuracy": 0.7721621990203857, + "num_tokens": 1763235.0, + "step": 108 + }, + { + "entropy": 0.5445860922336578, + "epoch": 0.40977443609022557, + "grad_norm": 0.030480582267045975, + "learning_rate": 0.0002, + "loss": 0.5476114153862, + "mean_token_accuracy": 0.7789607793092728, + "num_tokens": 1779550.0, + "step": 109 + }, + { + "entropy": 0.5542454719543457, + "epoch": 0.41353383458646614, + "grad_norm": 0.0321124792098999, + "learning_rate": 0.0002, + "loss": 0.5565616488456726, + "mean_token_accuracy": 0.7755739092826843, + "num_tokens": 1795761.0, + "step": 110 + }, + { + "entropy": 0.5581567585468292, + "epoch": 0.41729323308270677, + "grad_norm": 0.0360286608338356, + "learning_rate": 0.0002, + "loss": 0.5496086478233337, + "mean_token_accuracy": 0.775969922542572, + "num_tokens": 1811759.0, + "step": 111 + }, + { + "entropy": 0.549008384346962, + "epoch": 0.42105263157894735, + "grad_norm": 0.029972167685627937, + "learning_rate": 0.0002, + "loss": 0.5420917272567749, + "mean_token_accuracy": 0.7787465006113052, + "num_tokens": 1827840.0, + "step": 112 + }, + { + "entropy": 0.5631350576877594, + "epoch": 0.424812030075188, + "grad_norm": 0.028662627562880516, + "learning_rate": 0.0002, + "loss": 0.5532713532447815, + "mean_token_accuracy": 0.7749679088592529, + "num_tokens": 1844167.0, + "step": 113 + }, + { + "entropy": 0.5277586579322815, + "epoch": 0.42857142857142855, + "grad_norm": 0.03287903964519501, + "learning_rate": 0.0002, + "loss": 0.5350267887115479, + "mean_token_accuracy": 0.7830938249826431, + "num_tokens": 1860530.0, + "step": 114 + }, + { + "entropy": 0.5497393310070038, + "epoch": 0.4323308270676692, + "grad_norm": 0.03770268335938454, + "learning_rate": 0.0002, + "loss": 0.5615973472595215, + "mean_token_accuracy": 0.7720151543617249, + "num_tokens": 1876970.0, + "step": 115 + }, + { + "entropy": 0.5729877650737762, + "epoch": 0.43609022556390975, + "grad_norm": 0.033978965133428574, + "learning_rate": 0.0002, + "loss": 0.5777981877326965, + "mean_token_accuracy": 0.7680597454309464, + "num_tokens": 1893575.0, + "step": 116 + }, + { + "entropy": 0.5504349619150162, + "epoch": 0.4398496240601504, + "grad_norm": 0.03185052052140236, + "learning_rate": 0.0002, + "loss": 0.5459255576133728, + "mean_token_accuracy": 0.7792946100234985, + "num_tokens": 1909809.0, + "step": 117 + }, + { + "entropy": 0.5565227419137955, + "epoch": 0.44360902255639095, + "grad_norm": 0.028807369992136955, + "learning_rate": 0.0002, + "loss": 0.551781177520752, + "mean_token_accuracy": 0.7776060104370117, + "num_tokens": 1925981.0, + "step": 118 + }, + { + "entropy": 0.5547512769699097, + "epoch": 0.4473684210526316, + "grad_norm": 0.0315021388232708, + "learning_rate": 0.0002, + "loss": 0.5484083890914917, + "mean_token_accuracy": 0.7798104882240295, + "num_tokens": 1942636.0, + "step": 119 + }, + { + "entropy": 0.5606597065925598, + "epoch": 0.45112781954887216, + "grad_norm": 0.02974752150475979, + "learning_rate": 0.0002, + "loss": 0.5633252263069153, + "mean_token_accuracy": 0.7710647433996201, + "num_tokens": 1959143.0, + "step": 120 + }, + { + "entropy": 0.5621981024742126, + "epoch": 0.4548872180451128, + "grad_norm": 0.03396495804190636, + "learning_rate": 0.0002, + "loss": 0.5700369477272034, + "mean_token_accuracy": 0.7708666622638702, + "num_tokens": 1975709.0, + "step": 121 + }, + { + "entropy": 0.5484206080436707, + "epoch": 0.45864661654135336, + "grad_norm": 0.03273981064558029, + "learning_rate": 0.0002, + "loss": 0.5635251998901367, + "mean_token_accuracy": 0.7709483653306961, + "num_tokens": 1992105.0, + "step": 122 + }, + { + "entropy": 0.5378261581063271, + "epoch": 0.462406015037594, + "grad_norm": 0.03221985325217247, + "learning_rate": 0.0002, + "loss": 0.5449070334434509, + "mean_token_accuracy": 0.7815380096435547, + "num_tokens": 2008467.0, + "step": 123 + }, + { + "entropy": 0.5606098920106888, + "epoch": 0.46616541353383456, + "grad_norm": 0.03314457833766937, + "learning_rate": 0.0002, + "loss": 0.563465416431427, + "mean_token_accuracy": 0.7709829658269882, + "num_tokens": 2024710.0, + "step": 124 + }, + { + "entropy": 0.5656619518995285, + "epoch": 0.4699248120300752, + "grad_norm": 0.03133262321352959, + "learning_rate": 0.0002, + "loss": 0.5610048174858093, + "mean_token_accuracy": 0.7718383222818375, + "num_tokens": 2040853.0, + "step": 125 + }, + { + "entropy": 0.5635328441858292, + "epoch": 0.47368421052631576, + "grad_norm": 0.030308736488223076, + "learning_rate": 0.0002, + "loss": 0.5604254007339478, + "mean_token_accuracy": 0.7731337696313858, + "num_tokens": 2057006.0, + "step": 126 + }, + { + "entropy": 0.57016222178936, + "epoch": 0.4774436090225564, + "grad_norm": 0.03194103017449379, + "learning_rate": 0.0002, + "loss": 0.5620253086090088, + "mean_token_accuracy": 0.7717723101377487, + "num_tokens": 2073332.0, + "step": 127 + }, + { + "entropy": 0.5490193665027618, + "epoch": 0.48120300751879697, + "grad_norm": 0.02910369262099266, + "learning_rate": 0.0002, + "loss": 0.5538103580474854, + "mean_token_accuracy": 0.7780880033969879, + "num_tokens": 2089495.0, + "step": 128 + }, + { + "entropy": 0.5662434548139572, + "epoch": 0.4849624060150376, + "grad_norm": 0.029468489810824394, + "learning_rate": 0.0002, + "loss": 0.5681107044219971, + "mean_token_accuracy": 0.7689958661794662, + "num_tokens": 2106114.0, + "step": 129 + }, + { + "entropy": 0.5431465953588486, + "epoch": 0.48872180451127817, + "grad_norm": 0.03223656490445137, + "learning_rate": 0.0002, + "loss": 0.5507116317749023, + "mean_token_accuracy": 0.7764191329479218, + "num_tokens": 2122567.0, + "step": 130 + }, + { + "entropy": 0.5563855171203613, + "epoch": 0.4924812030075188, + "grad_norm": 0.028281886130571365, + "learning_rate": 0.0002, + "loss": 0.5583161115646362, + "mean_token_accuracy": 0.7736326307058334, + "num_tokens": 2139083.0, + "step": 131 + }, + { + "entropy": 0.5674906224012375, + "epoch": 0.49624060150375937, + "grad_norm": 0.02878589555621147, + "learning_rate": 0.0002, + "loss": 0.564136803150177, + "mean_token_accuracy": 0.7724441289901733, + "num_tokens": 2155542.0, + "step": 132 + }, + { + "entropy": 0.5472439229488373, + "epoch": 0.5, + "grad_norm": 0.029321735724806786, + "learning_rate": 0.0002, + "loss": 0.5442805290222168, + "mean_token_accuracy": 0.7798047512769699, + "num_tokens": 2171801.0, + "step": 133 + }, + { + "entropy": 0.565643772482872, + "epoch": 0.5037593984962406, + "grad_norm": 0.028855223208665848, + "learning_rate": 0.0002, + "loss": 0.5595606565475464, + "mean_token_accuracy": 0.774070993065834, + "num_tokens": 2188167.0, + "step": 134 + }, + { + "entropy": 0.5532195568084717, + "epoch": 0.5075187969924813, + "grad_norm": 0.03198866546154022, + "learning_rate": 0.0002, + "loss": 0.5570374131202698, + "mean_token_accuracy": 0.7740880846977234, + "num_tokens": 2204470.0, + "step": 135 + }, + { + "entropy": 0.5408245772123337, + "epoch": 0.5112781954887218, + "grad_norm": 0.030379725620150566, + "learning_rate": 0.0002, + "loss": 0.5514412522315979, + "mean_token_accuracy": 0.7769049108028412, + "num_tokens": 2220739.0, + "step": 136 + }, + { + "entropy": 0.5346933305263519, + "epoch": 0.5150375939849624, + "grad_norm": 0.03085665963590145, + "learning_rate": 0.0002, + "loss": 0.5364114046096802, + "mean_token_accuracy": 0.7843690663576126, + "num_tokens": 2237147.0, + "step": 137 + }, + { + "entropy": 0.5493077784776688, + "epoch": 0.518796992481203, + "grad_norm": 0.02923487313091755, + "learning_rate": 0.0002, + "loss": 0.5560771822929382, + "mean_token_accuracy": 0.7737279832363129, + "num_tokens": 2253415.0, + "step": 138 + }, + { + "entropy": 0.5472232103347778, + "epoch": 0.5225563909774437, + "grad_norm": 0.031521063297986984, + "learning_rate": 0.0002, + "loss": 0.5497399568557739, + "mean_token_accuracy": 0.777409166097641, + "num_tokens": 2269589.0, + "step": 139 + }, + { + "entropy": 0.5515349954366684, + "epoch": 0.5263157894736842, + "grad_norm": 0.02956547960639, + "learning_rate": 0.0002, + "loss": 0.5464341640472412, + "mean_token_accuracy": 0.7794498354196548, + "num_tokens": 2285953.0, + "step": 140 + }, + { + "entropy": 0.5558236241340637, + "epoch": 0.5300751879699248, + "grad_norm": 0.02974775619804859, + "learning_rate": 0.0002, + "loss": 0.5577874779701233, + "mean_token_accuracy": 0.7712955176830292, + "num_tokens": 2302120.0, + "step": 141 + }, + { + "entropy": 0.5856722742319107, + "epoch": 0.5338345864661654, + "grad_norm": 0.03199459984898567, + "learning_rate": 0.0002, + "loss": 0.5856820940971375, + "mean_token_accuracy": 0.7616758495569229, + "num_tokens": 2318555.0, + "step": 142 + }, + { + "entropy": 0.5560419261455536, + "epoch": 0.5375939849624061, + "grad_norm": 0.03210260346531868, + "learning_rate": 0.0002, + "loss": 0.5606544613838196, + "mean_token_accuracy": 0.7734680622816086, + "num_tokens": 2334764.0, + "step": 143 + }, + { + "entropy": 0.5652720183134079, + "epoch": 0.5413533834586466, + "grad_norm": 0.025965852662920952, + "learning_rate": 0.0002, + "loss": 0.562166690826416, + "mean_token_accuracy": 0.77190200984478, + "num_tokens": 2351198.0, + "step": 144 + }, + { + "entropy": 0.531855046749115, + "epoch": 0.5451127819548872, + "grad_norm": 0.029480863362550735, + "learning_rate": 0.0002, + "loss": 0.5261865854263306, + "mean_token_accuracy": 0.7886765003204346, + "num_tokens": 2367340.0, + "step": 145 + }, + { + "entropy": 0.5517164468765259, + "epoch": 0.5488721804511278, + "grad_norm": 0.03105936385691166, + "learning_rate": 0.0002, + "loss": 0.5542973875999451, + "mean_token_accuracy": 0.7738576829433441, + "num_tokens": 2383605.0, + "step": 146 + }, + { + "entropy": 0.5376151502132416, + "epoch": 0.5526315789473685, + "grad_norm": 0.03337828442454338, + "learning_rate": 0.0002, + "loss": 0.5453506708145142, + "mean_token_accuracy": 0.7788939327001572, + "num_tokens": 2399719.0, + "step": 147 + }, + { + "entropy": 0.5623980462551117, + "epoch": 0.556390977443609, + "grad_norm": 0.028280731290578842, + "learning_rate": 0.0002, + "loss": 0.560990035533905, + "mean_token_accuracy": 0.7726676762104034, + "num_tokens": 2416182.0, + "step": 148 + }, + { + "entropy": 0.5573243647813797, + "epoch": 0.5601503759398496, + "grad_norm": 0.032505616545677185, + "learning_rate": 0.0002, + "loss": 0.5568500757217407, + "mean_token_accuracy": 0.7742682248353958, + "num_tokens": 2432558.0, + "step": 149 + }, + { + "entropy": 0.5573329925537109, + "epoch": 0.5639097744360902, + "grad_norm": 0.03238248452544212, + "learning_rate": 0.0002, + "loss": 0.5538819432258606, + "mean_token_accuracy": 0.777379959821701, + "num_tokens": 2448908.0, + "step": 150 + }, + { + "entropy": 0.5407138615846634, + "epoch": 0.5676691729323309, + "grad_norm": 0.02900576777756214, + "learning_rate": 0.0002, + "loss": 0.5466345548629761, + "mean_token_accuracy": 0.7775551229715347, + "num_tokens": 2465270.0, + "step": 151 + }, + { + "entropy": 0.554168626666069, + "epoch": 0.5714285714285714, + "grad_norm": 0.0312657356262207, + "learning_rate": 0.0002, + "loss": 0.5629188418388367, + "mean_token_accuracy": 0.7751999050378799, + "num_tokens": 2481577.0, + "step": 152 + }, + { + "entropy": 0.5447106957435608, + "epoch": 0.575187969924812, + "grad_norm": 0.02679499238729477, + "learning_rate": 0.0002, + "loss": 0.5434100031852722, + "mean_token_accuracy": 0.7805473357439041, + "num_tokens": 2498025.0, + "step": 153 + }, + { + "entropy": 0.5469905585050583, + "epoch": 0.5789473684210527, + "grad_norm": 0.03267526254057884, + "learning_rate": 0.0002, + "loss": 0.5438751578330994, + "mean_token_accuracy": 0.7798020392656326, + "num_tokens": 2514245.0, + "step": 154 + }, + { + "entropy": 0.5860631912946701, + "epoch": 0.5827067669172933, + "grad_norm": 0.03039904497563839, + "learning_rate": 0.0002, + "loss": 0.5810500383377075, + "mean_token_accuracy": 0.7673344761133194, + "num_tokens": 2530676.0, + "step": 155 + }, + { + "entropy": 0.5545631796121597, + "epoch": 0.5864661654135338, + "grad_norm": 0.028710732236504555, + "learning_rate": 0.0002, + "loss": 0.5573135614395142, + "mean_token_accuracy": 0.7758313864469528, + "num_tokens": 2547029.0, + "step": 156 + }, + { + "entropy": 0.5309299975633621, + "epoch": 0.5902255639097744, + "grad_norm": 0.037456102669239044, + "learning_rate": 0.0002, + "loss": 0.5443962812423706, + "mean_token_accuracy": 0.7781406342983246, + "num_tokens": 2563337.0, + "step": 157 + }, + { + "entropy": 0.5590629875659943, + "epoch": 0.5939849624060151, + "grad_norm": 0.03138922527432442, + "learning_rate": 0.0002, + "loss": 0.570573627948761, + "mean_token_accuracy": 0.7692520618438721, + "num_tokens": 2579699.0, + "step": 158 + }, + { + "entropy": 0.5507991015911102, + "epoch": 0.5977443609022557, + "grad_norm": 0.031148385256528854, + "learning_rate": 0.0002, + "loss": 0.549103856086731, + "mean_token_accuracy": 0.7769458442926407, + "num_tokens": 2596012.0, + "step": 159 + }, + { + "entropy": 0.5691386461257935, + "epoch": 0.6015037593984962, + "grad_norm": 0.03321440890431404, + "learning_rate": 0.0002, + "loss": 0.5682097673416138, + "mean_token_accuracy": 0.7695286124944687, + "num_tokens": 2612192.0, + "step": 160 + }, + { + "entropy": 0.5378303825855255, + "epoch": 0.6052631578947368, + "grad_norm": 0.029134051874279976, + "learning_rate": 0.0002, + "loss": 0.5314258337020874, + "mean_token_accuracy": 0.7879271060228348, + "num_tokens": 2628354.0, + "step": 161 + }, + { + "entropy": 0.5507005900144577, + "epoch": 0.6090225563909775, + "grad_norm": 0.028996866196393967, + "learning_rate": 0.0002, + "loss": 0.5531865358352661, + "mean_token_accuracy": 0.7761473655700684, + "num_tokens": 2644501.0, + "step": 162 + }, + { + "entropy": 0.5587231516838074, + "epoch": 0.6127819548872181, + "grad_norm": 0.03128351643681526, + "learning_rate": 0.0002, + "loss": 0.5601255297660828, + "mean_token_accuracy": 0.7728810757398605, + "num_tokens": 2660638.0, + "step": 163 + }, + { + "entropy": 0.5519489645957947, + "epoch": 0.6165413533834586, + "grad_norm": 0.03436357155442238, + "learning_rate": 0.0002, + "loss": 0.5580562949180603, + "mean_token_accuracy": 0.7739841938018799, + "num_tokens": 2676953.0, + "step": 164 + }, + { + "entropy": 0.5486033111810684, + "epoch": 0.6203007518796992, + "grad_norm": 0.030973074957728386, + "learning_rate": 0.0002, + "loss": 0.5505262613296509, + "mean_token_accuracy": 0.7756275236606598, + "num_tokens": 2693031.0, + "step": 165 + }, + { + "entropy": 0.5522639453411102, + "epoch": 0.6240601503759399, + "grad_norm": 0.03254729509353638, + "learning_rate": 0.0002, + "loss": 0.5508989095687866, + "mean_token_accuracy": 0.7748342007398605, + "num_tokens": 2709299.0, + "step": 166 + }, + { + "entropy": 0.5678143799304962, + "epoch": 0.6278195488721805, + "grad_norm": 0.027512261644005775, + "learning_rate": 0.0002, + "loss": 0.5593494772911072, + "mean_token_accuracy": 0.7736407816410065, + "num_tokens": 2725613.0, + "step": 167 + }, + { + "entropy": 0.5474298894405365, + "epoch": 0.631578947368421, + "grad_norm": 0.02777693048119545, + "learning_rate": 0.0002, + "loss": 0.5416566729545593, + "mean_token_accuracy": 0.7782540619373322, + "num_tokens": 2741762.0, + "step": 168 + }, + { + "entropy": 0.5676318109035492, + "epoch": 0.6353383458646616, + "grad_norm": 0.029206767678260803, + "learning_rate": 0.0002, + "loss": 0.5748559832572937, + "mean_token_accuracy": 0.7664623707532883, + "num_tokens": 2757964.0, + "step": 169 + }, + { + "entropy": 0.5471738129854202, + "epoch": 0.6390977443609023, + "grad_norm": 0.03809071704745293, + "learning_rate": 0.0002, + "loss": 0.5600809454917908, + "mean_token_accuracy": 0.7715400904417038, + "num_tokens": 2774260.0, + "step": 170 + }, + { + "entropy": 0.5543518960475922, + "epoch": 0.6428571428571429, + "grad_norm": 0.029330087825655937, + "learning_rate": 0.0002, + "loss": 0.5620079040527344, + "mean_token_accuracy": 0.7744479775428772, + "num_tokens": 2790354.0, + "step": 171 + }, + { + "entropy": 0.5556869655847549, + "epoch": 0.6466165413533834, + "grad_norm": 0.03219934552907944, + "learning_rate": 0.0002, + "loss": 0.5567511916160583, + "mean_token_accuracy": 0.7723055630922318, + "num_tokens": 2806411.0, + "step": 172 + }, + { + "entropy": 0.5598954260349274, + "epoch": 0.650375939849624, + "grad_norm": 0.03049585595726967, + "learning_rate": 0.0002, + "loss": 0.5581772923469543, + "mean_token_accuracy": 0.7723381072282791, + "num_tokens": 2822457.0, + "step": 173 + }, + { + "entropy": 0.5619530379772186, + "epoch": 0.6541353383458647, + "grad_norm": 0.029140042141079903, + "learning_rate": 0.0002, + "loss": 0.5565066337585449, + "mean_token_accuracy": 0.7765934616327286, + "num_tokens": 2838821.0, + "step": 174 + }, + { + "entropy": 0.5609161257743835, + "epoch": 0.6578947368421053, + "grad_norm": 0.03307173773646355, + "learning_rate": 0.0002, + "loss": 0.5584904551506042, + "mean_token_accuracy": 0.7731504142284393, + "num_tokens": 2854976.0, + "step": 175 + }, + { + "entropy": 0.5472587794065475, + "epoch": 0.6616541353383458, + "grad_norm": 0.027935896068811417, + "learning_rate": 0.0002, + "loss": 0.5532994270324707, + "mean_token_accuracy": 0.7745202481746674, + "num_tokens": 2871053.0, + "step": 176 + }, + { + "entropy": 0.5559375882148743, + "epoch": 0.6654135338345865, + "grad_norm": 0.028821157291531563, + "learning_rate": 0.0002, + "loss": 0.5584789514541626, + "mean_token_accuracy": 0.7747485786676407, + "num_tokens": 2887600.0, + "step": 177 + }, + { + "entropy": 0.5338730216026306, + "epoch": 0.6691729323308271, + "grad_norm": 0.026577429845929146, + "learning_rate": 0.0002, + "loss": 0.5381085276603699, + "mean_token_accuracy": 0.7791920751333237, + "num_tokens": 2903970.0, + "step": 178 + }, + { + "entropy": 0.556627482175827, + "epoch": 0.6729323308270677, + "grad_norm": 0.028157442808151245, + "learning_rate": 0.0002, + "loss": 0.5612574219703674, + "mean_token_accuracy": 0.7728701531887054, + "num_tokens": 2920095.0, + "step": 179 + }, + { + "entropy": 0.5468809902667999, + "epoch": 0.6766917293233082, + "grad_norm": 0.026617249473929405, + "learning_rate": 0.0002, + "loss": 0.5438866019248962, + "mean_token_accuracy": 0.776974618434906, + "num_tokens": 2936400.0, + "step": 180 + }, + { + "entropy": 0.5707015246152878, + "epoch": 0.6804511278195489, + "grad_norm": 0.03165828064084053, + "learning_rate": 0.0002, + "loss": 0.5632250905036926, + "mean_token_accuracy": 0.7731919437646866, + "num_tokens": 2952758.0, + "step": 181 + }, + { + "entropy": 0.5669363737106323, + "epoch": 0.6842105263157895, + "grad_norm": 0.03147813677787781, + "learning_rate": 0.0002, + "loss": 0.5652462840080261, + "mean_token_accuracy": 0.7679423987865448, + "num_tokens": 2969082.0, + "step": 182 + }, + { + "entropy": 0.5380169749259949, + "epoch": 0.6879699248120301, + "grad_norm": 0.027151955291628838, + "learning_rate": 0.0002, + "loss": 0.5455905795097351, + "mean_token_accuracy": 0.7796274274587631, + "num_tokens": 2985183.0, + "step": 183 + }, + { + "entropy": 0.5574334859848022, + "epoch": 0.6917293233082706, + "grad_norm": 0.03327858820557594, + "learning_rate": 0.0002, + "loss": 0.5695413947105408, + "mean_token_accuracy": 0.7701131999492645, + "num_tokens": 3001508.0, + "step": 184 + }, + { + "entropy": 0.5463923811912537, + "epoch": 0.6954887218045113, + "grad_norm": 0.07987584918737411, + "learning_rate": 0.0002, + "loss": 0.5507839918136597, + "mean_token_accuracy": 0.7769906222820282, + "num_tokens": 3017824.0, + "step": 185 + }, + { + "entropy": 0.5602079033851624, + "epoch": 0.6992481203007519, + "grad_norm": 0.032177284359931946, + "learning_rate": 0.0002, + "loss": 0.5561395883560181, + "mean_token_accuracy": 0.7731778472661972, + "num_tokens": 3034234.0, + "step": 186 + }, + { + "entropy": 0.5552242249250412, + "epoch": 0.7030075187969925, + "grad_norm": 0.17276985943317413, + "learning_rate": 0.0002, + "loss": 0.5665730237960815, + "mean_token_accuracy": 0.7776633650064468, + "num_tokens": 3050476.0, + "step": 187 + }, + { + "entropy": 0.5759404450654984, + "epoch": 0.706766917293233, + "grad_norm": 0.03187716379761696, + "learning_rate": 0.0002, + "loss": 0.5729998350143433, + "mean_token_accuracy": 0.7687390595674515, + "num_tokens": 3066888.0, + "step": 188 + }, + { + "entropy": 0.5559865832328796, + "epoch": 0.7105263157894737, + "grad_norm": 0.03442467749118805, + "learning_rate": 0.0002, + "loss": 0.5568963885307312, + "mean_token_accuracy": 0.7721963822841644, + "num_tokens": 3083234.0, + "step": 189 + }, + { + "entropy": 0.5560625046491623, + "epoch": 0.7142857142857143, + "grad_norm": 0.033102214336395264, + "learning_rate": 0.0002, + "loss": 0.5556387305259705, + "mean_token_accuracy": 0.7737521678209305, + "num_tokens": 3099426.0, + "step": 190 + }, + { + "entropy": 0.5532419383525848, + "epoch": 0.7180451127819549, + "grad_norm": 0.03335823863744736, + "learning_rate": 0.0002, + "loss": 0.5556282997131348, + "mean_token_accuracy": 0.7746775895357132, + "num_tokens": 3115788.0, + "step": 191 + }, + { + "entropy": 0.5511862933635712, + "epoch": 0.7218045112781954, + "grad_norm": 0.04099865257740021, + "learning_rate": 0.0002, + "loss": 0.564994752407074, + "mean_token_accuracy": 0.7689872086048126, + "num_tokens": 3132132.0, + "step": 192 + }, + { + "entropy": 0.5518632382154465, + "epoch": 0.7255639097744361, + "grad_norm": 0.03417513892054558, + "learning_rate": 0.0002, + "loss": 0.5622019171714783, + "mean_token_accuracy": 0.7704385071992874, + "num_tokens": 3148387.0, + "step": 193 + }, + { + "entropy": 0.5632559806108475, + "epoch": 0.7293233082706767, + "grad_norm": 0.030820859596133232, + "learning_rate": 0.0002, + "loss": 0.5607547163963318, + "mean_token_accuracy": 0.7714632153511047, + "num_tokens": 3164505.0, + "step": 194 + }, + { + "entropy": 0.589142233133316, + "epoch": 0.7330827067669173, + "grad_norm": 0.029547762125730515, + "learning_rate": 0.0002, + "loss": 0.5773433446884155, + "mean_token_accuracy": 0.7666076868772507, + "num_tokens": 3180879.0, + "step": 195 + }, + { + "entropy": 0.5543933212757111, + "epoch": 0.7368421052631579, + "grad_norm": 0.03714846074581146, + "learning_rate": 0.0002, + "loss": 0.5530077219009399, + "mean_token_accuracy": 0.7751282453536987, + "num_tokens": 3196997.0, + "step": 196 + }, + { + "entropy": 0.5504618287086487, + "epoch": 0.7406015037593985, + "grad_norm": 0.03167671337723732, + "learning_rate": 0.0002, + "loss": 0.5446099042892456, + "mean_token_accuracy": 0.7800730615854263, + "num_tokens": 3213232.0, + "step": 197 + }, + { + "entropy": 0.5440194606781006, + "epoch": 0.7443609022556391, + "grad_norm": 0.028702866286039352, + "learning_rate": 0.0002, + "loss": 0.5420858860015869, + "mean_token_accuracy": 0.780303880572319, + "num_tokens": 3229429.0, + "step": 198 + }, + { + "entropy": 0.5432772487401962, + "epoch": 0.7481203007518797, + "grad_norm": 0.04096582531929016, + "learning_rate": 0.0002, + "loss": 0.5523824095726013, + "mean_token_accuracy": 0.7756204158067703, + "num_tokens": 3245679.0, + "step": 199 + }, + { + "entropy": 0.5610463172197342, + "epoch": 0.7518796992481203, + "grad_norm": 0.036679867655038834, + "learning_rate": 0.0002, + "loss": 0.5655776262283325, + "mean_token_accuracy": 0.7715456783771515, + "num_tokens": 3262189.0, + "step": 200 + }, + { + "entropy": 0.5549308806657791, + "epoch": 0.7556390977443609, + "grad_norm": 0.02466488443315029, + "learning_rate": 0.0002, + "loss": 0.5475676655769348, + "mean_token_accuracy": 0.7779862135648727, + "num_tokens": 3278554.0, + "step": 201 + }, + { + "entropy": 0.5799617767333984, + "epoch": 0.7593984962406015, + "grad_norm": 0.028492242097854614, + "learning_rate": 0.0002, + "loss": 0.5770009160041809, + "mean_token_accuracy": 0.768639862537384, + "num_tokens": 3295063.0, + "step": 202 + }, + { + "entropy": 0.5529991090297699, + "epoch": 0.7631578947368421, + "grad_norm": 0.034728050231933594, + "learning_rate": 0.0002, + "loss": 0.5533767938613892, + "mean_token_accuracy": 0.7767061442136765, + "num_tokens": 3311348.0, + "step": 203 + }, + { + "entropy": 0.5689148902893066, + "epoch": 0.7669172932330827, + "grad_norm": 0.026985110715031624, + "learning_rate": 0.0002, + "loss": 0.5640019774436951, + "mean_token_accuracy": 0.7733623534440994, + "num_tokens": 3327811.0, + "step": 204 + }, + { + "entropy": 0.5497773736715317, + "epoch": 0.7706766917293233, + "grad_norm": 0.026469919830560684, + "learning_rate": 0.0002, + "loss": 0.5544072389602661, + "mean_token_accuracy": 0.7730964869260788, + "num_tokens": 3344190.0, + "step": 205 + }, + { + "entropy": 0.5487343817949295, + "epoch": 0.7744360902255639, + "grad_norm": 0.03394508361816406, + "learning_rate": 0.0002, + "loss": 0.5584373474121094, + "mean_token_accuracy": 0.7742648869752884, + "num_tokens": 3360318.0, + "step": 206 + }, + { + "entropy": 0.5593785345554352, + "epoch": 0.7781954887218046, + "grad_norm": 0.032090939581394196, + "learning_rate": 0.0002, + "loss": 0.5755316019058228, + "mean_token_accuracy": 0.7676598578691483, + "num_tokens": 3376652.0, + "step": 207 + }, + { + "entropy": 0.5540517121553421, + "epoch": 0.7819548872180451, + "grad_norm": 0.029152996838092804, + "learning_rate": 0.0002, + "loss": 0.553016185760498, + "mean_token_accuracy": 0.7774887681007385, + "num_tokens": 3392915.0, + "step": 208 + }, + { + "entropy": 0.5617629438638687, + "epoch": 0.7857142857142857, + "grad_norm": 0.029667040333151817, + "learning_rate": 0.0002, + "loss": 0.5602532625198364, + "mean_token_accuracy": 0.7753290235996246, + "num_tokens": 3409209.0, + "step": 209 + }, + { + "entropy": 0.5676616579294205, + "epoch": 0.7894736842105263, + "grad_norm": 0.03213479742407799, + "learning_rate": 0.0002, + "loss": 0.5651354789733887, + "mean_token_accuracy": 0.7729621976613998, + "num_tokens": 3425474.0, + "step": 210 + }, + { + "entropy": 0.5594458729028702, + "epoch": 0.793233082706767, + "grad_norm": 0.029152261093258858, + "learning_rate": 0.0002, + "loss": 0.5545633435249329, + "mean_token_accuracy": 0.7748460322618484, + "num_tokens": 3441810.0, + "step": 211 + }, + { + "entropy": 0.5657470673322678, + "epoch": 0.7969924812030075, + "grad_norm": 0.030394772067666054, + "learning_rate": 0.0002, + "loss": 0.5634792447090149, + "mean_token_accuracy": 0.7723300457000732, + "num_tokens": 3458017.0, + "step": 212 + }, + { + "entropy": 0.5386789590120316, + "epoch": 0.8007518796992481, + "grad_norm": 0.030803421512246132, + "learning_rate": 0.0002, + "loss": 0.543491780757904, + "mean_token_accuracy": 0.7788570076227188, + "num_tokens": 3474394.0, + "step": 213 + }, + { + "entropy": 0.5462117493152618, + "epoch": 0.8045112781954887, + "grad_norm": 0.032262928783893585, + "learning_rate": 0.0002, + "loss": 0.5550574064254761, + "mean_token_accuracy": 0.7757156640291214, + "num_tokens": 3490659.0, + "step": 214 + }, + { + "entropy": 0.5618492513895035, + "epoch": 0.8082706766917294, + "grad_norm": 0.030515553429722786, + "learning_rate": 0.0002, + "loss": 0.5604183673858643, + "mean_token_accuracy": 0.7713865786790848, + "num_tokens": 3507047.0, + "step": 215 + }, + { + "entropy": 0.5674788951873779, + "epoch": 0.8120300751879699, + "grad_norm": 0.03319476544857025, + "learning_rate": 0.0002, + "loss": 0.5704171657562256, + "mean_token_accuracy": 0.7660792618989944, + "num_tokens": 3523740.0, + "step": 216 + }, + { + "entropy": 0.5655016303062439, + "epoch": 0.8157894736842105, + "grad_norm": 0.025443432852625847, + "learning_rate": 0.0002, + "loss": 0.5628257989883423, + "mean_token_accuracy": 0.7704775929450989, + "num_tokens": 3540342.0, + "step": 217 + }, + { + "entropy": 0.5403912216424942, + "epoch": 0.8195488721804511, + "grad_norm": 0.03260233253240585, + "learning_rate": 0.0002, + "loss": 0.542536735534668, + "mean_token_accuracy": 0.7788421809673309, + "num_tokens": 3556623.0, + "step": 218 + }, + { + "entropy": 0.5680458843708038, + "epoch": 0.8233082706766918, + "grad_norm": 0.034483131021261215, + "learning_rate": 0.0002, + "loss": 0.5691131353378296, + "mean_token_accuracy": 0.76755091547966, + "num_tokens": 3573182.0, + "step": 219 + }, + { + "entropy": 0.5689092427492142, + "epoch": 0.8270676691729323, + "grad_norm": 0.027871334925293922, + "learning_rate": 0.0002, + "loss": 0.5706035494804382, + "mean_token_accuracy": 0.768176794052124, + "num_tokens": 3589235.0, + "step": 220 + }, + { + "entropy": 0.563735768198967, + "epoch": 0.8308270676691729, + "grad_norm": 0.02944294363260269, + "learning_rate": 0.0002, + "loss": 0.5672820806503296, + "mean_token_accuracy": 0.7710028737783432, + "num_tokens": 3605593.0, + "step": 221 + }, + { + "entropy": 0.5397096872329712, + "epoch": 0.8345864661654135, + "grad_norm": 0.030527444556355476, + "learning_rate": 0.0002, + "loss": 0.5446432828903198, + "mean_token_accuracy": 0.7779533118009567, + "num_tokens": 3621959.0, + "step": 222 + }, + { + "entropy": 0.5514500439167023, + "epoch": 0.8383458646616542, + "grad_norm": 0.029658010229468346, + "learning_rate": 0.0002, + "loss": 0.5571471452713013, + "mean_token_accuracy": 0.7720492035150528, + "num_tokens": 3638089.0, + "step": 223 + }, + { + "entropy": 0.5721202939748764, + "epoch": 0.8421052631578947, + "grad_norm": 0.026809731498360634, + "learning_rate": 0.0002, + "loss": 0.5748306512832642, + "mean_token_accuracy": 0.7655669301748276, + "num_tokens": 3654508.0, + "step": 224 + }, + { + "entropy": 0.5657171607017517, + "epoch": 0.8458646616541353, + "grad_norm": 0.02784072421491146, + "learning_rate": 0.0002, + "loss": 0.5645638704299927, + "mean_token_accuracy": 0.7713258415460587, + "num_tokens": 3670883.0, + "step": 225 + }, + { + "entropy": 0.5707942843437195, + "epoch": 0.849624060150376, + "grad_norm": 0.027495261281728745, + "learning_rate": 0.0002, + "loss": 0.5690877437591553, + "mean_token_accuracy": 0.7672522664070129, + "num_tokens": 3687138.0, + "step": 226 + }, + { + "entropy": 0.5599692463874817, + "epoch": 0.8533834586466166, + "grad_norm": 0.02714758738875389, + "learning_rate": 0.0002, + "loss": 0.558695912361145, + "mean_token_accuracy": 0.7728016823530197, + "num_tokens": 3703748.0, + "step": 227 + }, + { + "entropy": 0.5557542443275452, + "epoch": 0.8571428571428571, + "grad_norm": 0.027014488354325294, + "learning_rate": 0.0002, + "loss": 0.5528618097305298, + "mean_token_accuracy": 0.7744259238243103, + "num_tokens": 3720292.0, + "step": 228 + }, + { + "entropy": 0.5545012503862381, + "epoch": 0.8609022556390977, + "grad_norm": 0.030803967267274857, + "learning_rate": 0.0002, + "loss": 0.5548436045646667, + "mean_token_accuracy": 0.772901862859726, + "num_tokens": 3736719.0, + "step": 229 + }, + { + "entropy": 0.5630923807621002, + "epoch": 0.8646616541353384, + "grad_norm": 0.025556016713380814, + "learning_rate": 0.0002, + "loss": 0.5638667941093445, + "mean_token_accuracy": 0.7724170237779617, + "num_tokens": 3753111.0, + "step": 230 + }, + { + "entropy": 0.5482154339551926, + "epoch": 0.868421052631579, + "grad_norm": 0.026636675000190735, + "learning_rate": 0.0002, + "loss": 0.5516517758369446, + "mean_token_accuracy": 0.7738501876592636, + "num_tokens": 3769379.0, + "step": 231 + }, + { + "entropy": 0.5542188733816147, + "epoch": 0.8721804511278195, + "grad_norm": 0.030669352039694786, + "learning_rate": 0.0002, + "loss": 0.562447190284729, + "mean_token_accuracy": 0.7716392129659653, + "num_tokens": 3785882.0, + "step": 232 + }, + { + "entropy": 0.5528077483177185, + "epoch": 0.8759398496240601, + "grad_norm": 0.02840394526720047, + "learning_rate": 0.0002, + "loss": 0.5538339614868164, + "mean_token_accuracy": 0.7760019749403, + "num_tokens": 3802159.0, + "step": 233 + }, + { + "entropy": 0.5367541313171387, + "epoch": 0.8796992481203008, + "grad_norm": 0.027923524379730225, + "learning_rate": 0.0002, + "loss": 0.5381957292556763, + "mean_token_accuracy": 0.7805743962526321, + "num_tokens": 3818361.0, + "step": 234 + }, + { + "entropy": 0.5520175248384476, + "epoch": 0.8834586466165414, + "grad_norm": 0.03241734206676483, + "learning_rate": 0.0002, + "loss": 0.5536331534385681, + "mean_token_accuracy": 0.773536428809166, + "num_tokens": 3834731.0, + "step": 235 + }, + { + "entropy": 0.5460867285728455, + "epoch": 0.8872180451127819, + "grad_norm": 0.027079345658421516, + "learning_rate": 0.0002, + "loss": 0.5475375056266785, + "mean_token_accuracy": 0.7766189575195312, + "num_tokens": 3850982.0, + "step": 236 + }, + { + "entropy": 0.5568866729736328, + "epoch": 0.8909774436090225, + "grad_norm": 0.02961307018995285, + "learning_rate": 0.0002, + "loss": 0.5572586059570312, + "mean_token_accuracy": 0.7737904638051987, + "num_tokens": 3867054.0, + "step": 237 + }, + { + "entropy": 0.5462281703948975, + "epoch": 0.8947368421052632, + "grad_norm": 0.02547132968902588, + "learning_rate": 0.0002, + "loss": 0.5462326407432556, + "mean_token_accuracy": 0.779721811413765, + "num_tokens": 3883377.0, + "step": 238 + }, + { + "entropy": 0.5601012706756592, + "epoch": 0.8984962406015038, + "grad_norm": 0.027931643649935722, + "learning_rate": 0.0002, + "loss": 0.5673293471336365, + "mean_token_accuracy": 0.7699201852083206, + "num_tokens": 3899760.0, + "step": 239 + }, + { + "entropy": 0.558964416384697, + "epoch": 0.9022556390977443, + "grad_norm": 0.027888454496860504, + "learning_rate": 0.0002, + "loss": 0.5613861083984375, + "mean_token_accuracy": 0.7711526602506638, + "num_tokens": 3916259.0, + "step": 240 + }, + { + "entropy": 0.5591289699077606, + "epoch": 0.9060150375939849, + "grad_norm": 0.027367601171135902, + "learning_rate": 0.0002, + "loss": 0.5553447008132935, + "mean_token_accuracy": 0.7748121023178101, + "num_tokens": 3932764.0, + "step": 241 + }, + { + "entropy": 0.5419012606143951, + "epoch": 0.9097744360902256, + "grad_norm": 0.02720046602189541, + "learning_rate": 0.0002, + "loss": 0.5389461517333984, + "mean_token_accuracy": 0.7815262824296951, + "num_tokens": 3948767.0, + "step": 242 + }, + { + "entropy": 0.5506538301706314, + "epoch": 0.9135338345864662, + "grad_norm": 0.04870102182030678, + "learning_rate": 0.0002, + "loss": 0.5555541515350342, + "mean_token_accuracy": 0.7749286592006683, + "num_tokens": 3964899.0, + "step": 243 + }, + { + "entropy": 0.5377955883741379, + "epoch": 0.9172932330827067, + "grad_norm": 0.030033506453037262, + "learning_rate": 0.0002, + "loss": 0.5442740321159363, + "mean_token_accuracy": 0.7790930420160294, + "num_tokens": 3981257.0, + "step": 244 + }, + { + "entropy": 0.5506607741117477, + "epoch": 0.9210526315789473, + "grad_norm": 0.03199909254908562, + "learning_rate": 0.0002, + "loss": 0.5553537607192993, + "mean_token_accuracy": 0.7754099667072296, + "num_tokens": 3997442.0, + "step": 245 + }, + { + "entropy": 0.5611073523759842, + "epoch": 0.924812030075188, + "grad_norm": 0.027019886299967766, + "learning_rate": 0.0002, + "loss": 0.5553584098815918, + "mean_token_accuracy": 0.7750442922115326, + "num_tokens": 4013644.0, + "step": 246 + }, + { + "entropy": 0.5641084164381027, + "epoch": 0.9285714285714286, + "grad_norm": 0.028763286769390106, + "learning_rate": 0.0002, + "loss": 0.5639767050743103, + "mean_token_accuracy": 0.7705299705266953, + "num_tokens": 4029960.0, + "step": 247 + }, + { + "entropy": 0.5596693158149719, + "epoch": 0.9323308270676691, + "grad_norm": 0.029457937926054, + "learning_rate": 0.0002, + "loss": 0.5553030371665955, + "mean_token_accuracy": 0.7704959660768509, + "num_tokens": 4046137.0, + "step": 248 + }, + { + "entropy": 0.5426951497793198, + "epoch": 0.9360902255639098, + "grad_norm": 0.030174724757671356, + "learning_rate": 0.0002, + "loss": 0.5424360036849976, + "mean_token_accuracy": 0.7784756273031235, + "num_tokens": 4062488.0, + "step": 249 + }, + { + "entropy": 0.5482533425092697, + "epoch": 0.9398496240601504, + "grad_norm": 0.029116198420524597, + "learning_rate": 0.0002, + "loss": 0.548699676990509, + "mean_token_accuracy": 0.7772116810083389, + "num_tokens": 4079035.0, + "step": 250 + }, + { + "entropy": 0.5659994781017303, + "epoch": 0.943609022556391, + "grad_norm": 0.028919357806444168, + "learning_rate": 0.0002, + "loss": 0.5734626054763794, + "mean_token_accuracy": 0.7644091695547104, + "num_tokens": 4095496.0, + "step": 251 + }, + { + "entropy": 0.5390999913215637, + "epoch": 0.9473684210526315, + "grad_norm": 0.029156571254134178, + "learning_rate": 0.0002, + "loss": 0.542834460735321, + "mean_token_accuracy": 0.778347447514534, + "num_tokens": 4111786.0, + "step": 252 + }, + { + "entropy": 0.5335533022880554, + "epoch": 0.9511278195488722, + "grad_norm": 0.03090072236955166, + "learning_rate": 0.0002, + "loss": 0.5460265874862671, + "mean_token_accuracy": 0.777598574757576, + "num_tokens": 4127806.0, + "step": 253 + }, + { + "entropy": 0.5576867163181305, + "epoch": 0.9548872180451128, + "grad_norm": 0.0250933188945055, + "learning_rate": 0.0002, + "loss": 0.5579800605773926, + "mean_token_accuracy": 0.772262915968895, + "num_tokens": 4144255.0, + "step": 254 + }, + { + "entropy": 0.5680612325668335, + "epoch": 0.9586466165413534, + "grad_norm": 0.02682660147547722, + "learning_rate": 0.0002, + "loss": 0.5625680685043335, + "mean_token_accuracy": 0.7703745514154434, + "num_tokens": 4160554.0, + "step": 255 + }, + { + "entropy": 0.5646774917840958, + "epoch": 0.9624060150375939, + "grad_norm": 0.02460050955414772, + "learning_rate": 0.0002, + "loss": 0.5615121126174927, + "mean_token_accuracy": 0.7717017978429794, + "num_tokens": 4177058.0, + "step": 256 + }, + { + "entropy": 0.565275639295578, + "epoch": 0.9661654135338346, + "grad_norm": 0.028230059891939163, + "learning_rate": 0.0002, + "loss": 0.5602483153343201, + "mean_token_accuracy": 0.7725579738616943, + "num_tokens": 4193529.0, + "step": 257 + }, + { + "entropy": 0.5464546531438828, + "epoch": 0.9699248120300752, + "grad_norm": 0.028305059298872948, + "learning_rate": 0.0002, + "loss": 0.5506906509399414, + "mean_token_accuracy": 0.7744488716125488, + "num_tokens": 4209843.0, + "step": 258 + }, + { + "entropy": 0.5543451011180878, + "epoch": 0.9736842105263158, + "grad_norm": 0.026113279163837433, + "learning_rate": 0.0002, + "loss": 0.5566228628158569, + "mean_token_accuracy": 0.7761884778738022, + "num_tokens": 4226371.0, + "step": 259 + }, + { + "entropy": 0.5395558923482895, + "epoch": 0.9774436090225563, + "grad_norm": 0.027898062020540237, + "learning_rate": 0.0002, + "loss": 0.551036536693573, + "mean_token_accuracy": 0.7777495980262756, + "num_tokens": 4242588.0, + "step": 260 + }, + { + "entropy": 0.5481285452842712, + "epoch": 0.981203007518797, + "grad_norm": 0.027225090190768242, + "learning_rate": 0.0002, + "loss": 0.55158931016922, + "mean_token_accuracy": 0.7746086716651917, + "num_tokens": 4258895.0, + "step": 261 + }, + { + "entropy": 0.5476398766040802, + "epoch": 0.9849624060150376, + "grad_norm": 0.025991205126047134, + "learning_rate": 0.0002, + "loss": 0.550503671169281, + "mean_token_accuracy": 0.778662696480751, + "num_tokens": 4275233.0, + "step": 262 + }, + { + "entropy": 0.5611831694841385, + "epoch": 0.9887218045112782, + "grad_norm": 0.026602452620863914, + "learning_rate": 0.0002, + "loss": 0.5595046877861023, + "mean_token_accuracy": 0.7710649222135544, + "num_tokens": 4291628.0, + "step": 263 + }, + { + "entropy": 0.5607927143573761, + "epoch": 0.9924812030075187, + "grad_norm": 0.029126716777682304, + "learning_rate": 0.0002, + "loss": 0.55509352684021, + "mean_token_accuracy": 0.773261696100235, + "num_tokens": 4308266.0, + "step": 264 + }, + { + "entropy": 0.5344236195087433, + "epoch": 0.9962406015037594, + "grad_norm": 0.024904625490307808, + "learning_rate": 0.0002, + "loss": 0.5374810099601746, + "mean_token_accuracy": 0.7795998752117157, + "num_tokens": 4324647.0, + "step": 265 + }, + { + "entropy": 0.5802602022886276, + "epoch": 1.0, + "grad_norm": 0.02991756983101368, + "learning_rate": 0.0002, + "loss": 0.5802874565124512, + "mean_token_accuracy": 0.7651515454053879, + "num_tokens": 4341020.0, + "step": 266 + }, + { + "entropy": 0.5359837561845779, + "epoch": 1.0037593984962405, + "grad_norm": 0.028310680761933327, + "learning_rate": 0.0002, + "loss": 0.5382672548294067, + "mean_token_accuracy": 0.7797826081514359, + "num_tokens": 4356946.0, + "step": 267 + }, + { + "entropy": 0.547169104218483, + "epoch": 1.0075187969924813, + "grad_norm": 0.026942851021885872, + "learning_rate": 0.0002, + "loss": 0.5483385324478149, + "mean_token_accuracy": 0.7762030512094498, + "num_tokens": 4373376.0, + "step": 268 + }, + { + "entropy": 0.5396238714456558, + "epoch": 1.0112781954887218, + "grad_norm": 0.026464859023690224, + "learning_rate": 0.0002, + "loss": 0.5366930961608887, + "mean_token_accuracy": 0.7836534827947617, + "num_tokens": 4389434.0, + "step": 269 + }, + { + "entropy": 0.5377503633499146, + "epoch": 1.0150375939849625, + "grad_norm": 0.028936585411429405, + "learning_rate": 0.0002, + "loss": 0.5381658673286438, + "mean_token_accuracy": 0.7795982360839844, + "num_tokens": 4405773.0, + "step": 270 + }, + { + "entropy": 0.5378166139125824, + "epoch": 1.018796992481203, + "grad_norm": 0.026616571471095085, + "learning_rate": 0.0002, + "loss": 0.5366747975349426, + "mean_token_accuracy": 0.7815251797437668, + "num_tokens": 4422223.0, + "step": 271 + }, + { + "entropy": 0.5556348860263824, + "epoch": 1.0225563909774436, + "grad_norm": 0.03760155290365219, + "learning_rate": 0.0002, + "loss": 0.5643568634986877, + "mean_token_accuracy": 0.7716861069202423, + "num_tokens": 4438566.0, + "step": 272 + }, + { + "entropy": 0.5393058955669403, + "epoch": 1.0263157894736843, + "grad_norm": 0.028112079948186874, + "learning_rate": 0.0002, + "loss": 0.536059558391571, + "mean_token_accuracy": 0.7806826084852219, + "num_tokens": 4454882.0, + "step": 273 + }, + { + "entropy": 0.5509982258081436, + "epoch": 1.0300751879699248, + "grad_norm": 0.031216077506542206, + "learning_rate": 0.0002, + "loss": 0.545498251914978, + "mean_token_accuracy": 0.7785268127918243, + "num_tokens": 4471138.0, + "step": 274 + }, + { + "entropy": 0.562383309006691, + "epoch": 1.0338345864661653, + "grad_norm": 0.029023578390479088, + "learning_rate": 0.0002, + "loss": 0.5549452900886536, + "mean_token_accuracy": 0.7746210545301437, + "num_tokens": 4487599.0, + "step": 275 + }, + { + "entropy": 0.533460721373558, + "epoch": 1.037593984962406, + "grad_norm": 0.02839999832212925, + "learning_rate": 0.0002, + "loss": 0.5428166389465332, + "mean_token_accuracy": 0.7788663357496262, + "num_tokens": 4503718.0, + "step": 276 + }, + { + "entropy": 0.534645140171051, + "epoch": 1.0413533834586466, + "grad_norm": 0.03183748945593834, + "learning_rate": 0.0002, + "loss": 0.5435906052589417, + "mean_token_accuracy": 0.780232772231102, + "num_tokens": 4519836.0, + "step": 277 + }, + { + "entropy": 0.5403695106506348, + "epoch": 1.045112781954887, + "grad_norm": 0.03128998726606369, + "learning_rate": 0.0002, + "loss": 0.546108603477478, + "mean_token_accuracy": 0.7786454111337662, + "num_tokens": 4535945.0, + "step": 278 + }, + { + "entropy": 0.5610467493534088, + "epoch": 1.0488721804511278, + "grad_norm": 0.027818012982606888, + "learning_rate": 0.0002, + "loss": 0.560647189617157, + "mean_token_accuracy": 0.7709101587533951, + "num_tokens": 4552374.0, + "step": 279 + }, + { + "entropy": 0.5373391807079315, + "epoch": 1.0526315789473684, + "grad_norm": 0.03428777679800987, + "learning_rate": 0.0002, + "loss": 0.5469943284988403, + "mean_token_accuracy": 0.7768525630235672, + "num_tokens": 4568711.0, + "step": 280 + }, + { + "entropy": 0.5424034297466278, + "epoch": 1.056390977443609, + "grad_norm": 0.03859133645892143, + "learning_rate": 0.0002, + "loss": 0.5439317226409912, + "mean_token_accuracy": 0.7811300605535507, + "num_tokens": 4585017.0, + "step": 281 + }, + { + "entropy": 0.5506146401166916, + "epoch": 1.0601503759398496, + "grad_norm": 0.03055771067738533, + "learning_rate": 0.0002, + "loss": 0.546417236328125, + "mean_token_accuracy": 0.7766596227884293, + "num_tokens": 4601432.0, + "step": 282 + }, + { + "entropy": 0.5494361072778702, + "epoch": 1.0639097744360901, + "grad_norm": 0.0343659445643425, + "learning_rate": 0.0002, + "loss": 0.5465281009674072, + "mean_token_accuracy": 0.7783948630094528, + "num_tokens": 4617733.0, + "step": 283 + }, + { + "entropy": 0.5440582782030106, + "epoch": 1.0676691729323309, + "grad_norm": 0.026508856564760208, + "learning_rate": 0.0002, + "loss": 0.5454896092414856, + "mean_token_accuracy": 0.7768892496824265, + "num_tokens": 4634160.0, + "step": 284 + }, + { + "entropy": 0.5566096007823944, + "epoch": 1.0714285714285714, + "grad_norm": 0.03006400726735592, + "learning_rate": 0.0002, + "loss": 0.5534993410110474, + "mean_token_accuracy": 0.7748663425445557, + "num_tokens": 4650625.0, + "step": 285 + }, + { + "entropy": 0.5545021891593933, + "epoch": 1.0751879699248121, + "grad_norm": 0.03096926584839821, + "learning_rate": 0.0002, + "loss": 0.5561465620994568, + "mean_token_accuracy": 0.7750347554683685, + "num_tokens": 4667029.0, + "step": 286 + }, + { + "entropy": 0.5399864912033081, + "epoch": 1.0789473684210527, + "grad_norm": 0.030643943697214127, + "learning_rate": 0.0002, + "loss": 0.5460204482078552, + "mean_token_accuracy": 0.7770880162715912, + "num_tokens": 4683375.0, + "step": 287 + }, + { + "entropy": 0.5572090744972229, + "epoch": 1.0827067669172932, + "grad_norm": 0.026186607778072357, + "learning_rate": 0.0002, + "loss": 0.5585043430328369, + "mean_token_accuracy": 0.7719515711069107, + "num_tokens": 4699882.0, + "step": 288 + }, + { + "entropy": 0.5484725385904312, + "epoch": 1.086466165413534, + "grad_norm": 0.027757612988352776, + "learning_rate": 0.0002, + "loss": 0.5432863235473633, + "mean_token_accuracy": 0.7777998596429825, + "num_tokens": 4716268.0, + "step": 289 + }, + { + "entropy": 0.5435892194509506, + "epoch": 1.0902255639097744, + "grad_norm": 0.02975296974182129, + "learning_rate": 0.0002, + "loss": 0.5351642966270447, + "mean_token_accuracy": 0.7828023135662079, + "num_tokens": 4732434.0, + "step": 290 + }, + { + "entropy": 0.5531795173883438, + "epoch": 1.093984962406015, + "grad_norm": 0.028304405510425568, + "learning_rate": 0.0002, + "loss": 0.5516840815544128, + "mean_token_accuracy": 0.7772639095783234, + "num_tokens": 4748580.0, + "step": 291 + }, + { + "entropy": 0.5184081122279167, + "epoch": 1.0977443609022557, + "grad_norm": 0.03446349874138832, + "learning_rate": 0.0002, + "loss": 0.5299493670463562, + "mean_token_accuracy": 0.7840149402618408, + "num_tokens": 4764598.0, + "step": 292 + }, + { + "entropy": 0.5289477556943893, + "epoch": 1.1015037593984962, + "grad_norm": 0.036261677742004395, + "learning_rate": 0.0002, + "loss": 0.5453619956970215, + "mean_token_accuracy": 0.7767883092164993, + "num_tokens": 4780809.0, + "step": 293 + }, + { + "entropy": 0.5418924987316132, + "epoch": 1.1052631578947367, + "grad_norm": 0.029477933421730995, + "learning_rate": 0.0002, + "loss": 0.5471935272216797, + "mean_token_accuracy": 0.7789769917726517, + "num_tokens": 4797348.0, + "step": 294 + }, + { + "entropy": 0.5463252663612366, + "epoch": 1.1090225563909775, + "grad_norm": 0.031204085797071457, + "learning_rate": 0.0002, + "loss": 0.5424449443817139, + "mean_token_accuracy": 0.7788571715354919, + "num_tokens": 4813415.0, + "step": 295 + }, + { + "entropy": 0.5470333397388458, + "epoch": 1.112781954887218, + "grad_norm": 0.03411991521716118, + "learning_rate": 0.0002, + "loss": 0.5338444709777832, + "mean_token_accuracy": 0.7839784771203995, + "num_tokens": 4829572.0, + "step": 296 + }, + { + "entropy": 0.5626541525125504, + "epoch": 1.1165413533834587, + "grad_norm": 0.03397219255566597, + "learning_rate": 0.0002, + "loss": 0.5499536991119385, + "mean_token_accuracy": 0.7788331657648087, + "num_tokens": 4845785.0, + "step": 297 + }, + { + "entropy": 0.5299470722675323, + "epoch": 1.1203007518796992, + "grad_norm": 0.03497639298439026, + "learning_rate": 0.0002, + "loss": 0.5392253994941711, + "mean_token_accuracy": 0.7810451984405518, + "num_tokens": 4862012.0, + "step": 298 + }, + { + "entropy": 0.5335487574338913, + "epoch": 1.1240601503759398, + "grad_norm": 0.034831658005714417, + "learning_rate": 0.0002, + "loss": 0.5457339286804199, + "mean_token_accuracy": 0.779063493013382, + "num_tokens": 4878251.0, + "step": 299 + }, + { + "entropy": 0.528610497713089, + "epoch": 1.1278195488721805, + "grad_norm": 0.033591266721487045, + "learning_rate": 0.0002, + "loss": 0.542759358882904, + "mean_token_accuracy": 0.7827056795358658, + "num_tokens": 4894510.0, + "step": 300 + }, + { + "entropy": 0.5455980747938156, + "epoch": 1.131578947368421, + "grad_norm": 0.029848981648683548, + "learning_rate": 0.0002, + "loss": 0.5544407963752747, + "mean_token_accuracy": 0.7761986404657364, + "num_tokens": 4910941.0, + "step": 301 + }, + { + "entropy": 0.5403441041707993, + "epoch": 1.1353383458646618, + "grad_norm": 0.028331086039543152, + "learning_rate": 0.0002, + "loss": 0.5373193025588989, + "mean_token_accuracy": 0.7810037434101105, + "num_tokens": 4927224.0, + "step": 302 + }, + { + "entropy": 0.579601064324379, + "epoch": 1.1390977443609023, + "grad_norm": 0.034219082444906235, + "learning_rate": 0.0002, + "loss": 0.5681281685829163, + "mean_token_accuracy": 0.7684440910816193, + "num_tokens": 4943447.0, + "step": 303 + }, + { + "entropy": 0.5505090206861496, + "epoch": 1.1428571428571428, + "grad_norm": 0.0307406485080719, + "learning_rate": 0.0002, + "loss": 0.5461090803146362, + "mean_token_accuracy": 0.778554230928421, + "num_tokens": 4959489.0, + "step": 304 + }, + { + "entropy": 0.5576640069484711, + "epoch": 1.1466165413533835, + "grad_norm": 0.030323676764965057, + "learning_rate": 0.0002, + "loss": 0.5553523302078247, + "mean_token_accuracy": 0.773658037185669, + "num_tokens": 4975936.0, + "step": 305 + }, + { + "entropy": 0.5266588181257248, + "epoch": 1.150375939849624, + "grad_norm": 0.035491373389959335, + "learning_rate": 0.0002, + "loss": 0.5350923538208008, + "mean_token_accuracy": 0.7815313786268234, + "num_tokens": 4992537.0, + "step": 306 + }, + { + "entropy": 0.5482136011123657, + "epoch": 1.1541353383458646, + "grad_norm": 0.03442855179309845, + "learning_rate": 0.0002, + "loss": 0.5545141696929932, + "mean_token_accuracy": 0.7746158391237259, + "num_tokens": 5009023.0, + "step": 307 + }, + { + "entropy": 0.5559152960777283, + "epoch": 1.1578947368421053, + "grad_norm": 0.02727232687175274, + "learning_rate": 0.0002, + "loss": 0.5569304823875427, + "mean_token_accuracy": 0.7725173830986023, + "num_tokens": 5025411.0, + "step": 308 + }, + { + "entropy": 0.5630469471216202, + "epoch": 1.1616541353383458, + "grad_norm": 0.03064255230128765, + "learning_rate": 0.0002, + "loss": 0.5543197989463806, + "mean_token_accuracy": 0.774148479104042, + "num_tokens": 5041812.0, + "step": 309 + }, + { + "entropy": 0.5571756958961487, + "epoch": 1.1654135338345863, + "grad_norm": 0.03609425947070122, + "learning_rate": 0.0002, + "loss": 0.5525773763656616, + "mean_token_accuracy": 0.7752318233251572, + "num_tokens": 5058244.0, + "step": 310 + }, + { + "entropy": 0.5431416481733322, + "epoch": 1.169172932330827, + "grad_norm": 0.027324821799993515, + "learning_rate": 0.0002, + "loss": 0.5384103059768677, + "mean_token_accuracy": 0.7805906236171722, + "num_tokens": 5074488.0, + "step": 311 + }, + { + "entropy": 0.5343848988413811, + "epoch": 1.1729323308270676, + "grad_norm": 0.03805036470293999, + "learning_rate": 0.0002, + "loss": 0.5469476580619812, + "mean_token_accuracy": 0.779438316822052, + "num_tokens": 5090911.0, + "step": 312 + }, + { + "entropy": 0.536148265004158, + "epoch": 1.1766917293233083, + "grad_norm": 0.02961050719022751, + "learning_rate": 0.0002, + "loss": 0.5435563921928406, + "mean_token_accuracy": 0.7815048396587372, + "num_tokens": 5107152.0, + "step": 313 + }, + { + "entropy": 0.5418159067630768, + "epoch": 1.1804511278195489, + "grad_norm": 0.025910982862114906, + "learning_rate": 0.0002, + "loss": 0.540198028087616, + "mean_token_accuracy": 0.7800037860870361, + "num_tokens": 5123652.0, + "step": 314 + }, + { + "entropy": 0.5343509763479233, + "epoch": 1.1842105263157894, + "grad_norm": 0.03428869694471359, + "learning_rate": 0.0002, + "loss": 0.5369153618812561, + "mean_token_accuracy": 0.7804707884788513, + "num_tokens": 5139855.0, + "step": 315 + }, + { + "entropy": 0.5401560962200165, + "epoch": 1.1879699248120301, + "grad_norm": 0.027781767770648003, + "learning_rate": 0.0002, + "loss": 0.5393479466438293, + "mean_token_accuracy": 0.7805478721857071, + "num_tokens": 5156155.0, + "step": 316 + }, + { + "entropy": 0.5566094070672989, + "epoch": 1.1917293233082706, + "grad_norm": 0.026983041316270828, + "learning_rate": 0.0002, + "loss": 0.554964005947113, + "mean_token_accuracy": 0.7756882756948471, + "num_tokens": 5172489.0, + "step": 317 + }, + { + "entropy": 0.547125369310379, + "epoch": 1.1954887218045114, + "grad_norm": 0.03205394372344017, + "learning_rate": 0.0002, + "loss": 0.5493847727775574, + "mean_token_accuracy": 0.7793397605419159, + "num_tokens": 5189044.0, + "step": 318 + }, + { + "entropy": 0.534126952290535, + "epoch": 1.199248120300752, + "grad_norm": 0.027468601241707802, + "learning_rate": 0.0002, + "loss": 0.532336413860321, + "mean_token_accuracy": 0.7843205332756042, + "num_tokens": 5205622.0, + "step": 319 + }, + { + "entropy": 0.541590228676796, + "epoch": 1.2030075187969924, + "grad_norm": 0.02954232320189476, + "learning_rate": 0.0002, + "loss": 0.5532248020172119, + "mean_token_accuracy": 0.7745756506919861, + "num_tokens": 5222003.0, + "step": 320 + }, + { + "entropy": 0.5365501791238785, + "epoch": 1.2067669172932332, + "grad_norm": 0.03286029398441315, + "learning_rate": 0.0002, + "loss": 0.5431678891181946, + "mean_token_accuracy": 0.7808897346258163, + "num_tokens": 5238368.0, + "step": 321 + }, + { + "entropy": 0.5435497313737869, + "epoch": 1.2105263157894737, + "grad_norm": 0.03365312144160271, + "learning_rate": 0.0002, + "loss": 0.542516827583313, + "mean_token_accuracy": 0.7798768132925034, + "num_tokens": 5254690.0, + "step": 322 + }, + { + "entropy": 0.5485272854566574, + "epoch": 1.2142857142857142, + "grad_norm": 0.02945873513817787, + "learning_rate": 0.0002, + "loss": 0.5457643866539001, + "mean_token_accuracy": 0.779216393828392, + "num_tokens": 5270982.0, + "step": 323 + }, + { + "entropy": 0.5480885654687881, + "epoch": 1.218045112781955, + "grad_norm": 0.03765803202986717, + "learning_rate": 0.0002, + "loss": 0.544890284538269, + "mean_token_accuracy": 0.7774617224931717, + "num_tokens": 5287222.0, + "step": 324 + }, + { + "entropy": 0.5345787778496742, + "epoch": 1.2218045112781954, + "grad_norm": 0.029292147606611252, + "learning_rate": 0.0002, + "loss": 0.5371191501617432, + "mean_token_accuracy": 0.7809965461492538, + "num_tokens": 5303631.0, + "step": 325 + }, + { + "entropy": 0.5533891320228577, + "epoch": 1.225563909774436, + "grad_norm": 0.03491590917110443, + "learning_rate": 0.0002, + "loss": 0.5632805228233337, + "mean_token_accuracy": 0.7713405042886734, + "num_tokens": 5319707.0, + "step": 326 + }, + { + "entropy": 0.5442000329494476, + "epoch": 1.2293233082706767, + "grad_norm": 0.035631779581308365, + "learning_rate": 0.0002, + "loss": 0.5511363744735718, + "mean_token_accuracy": 0.77325139939785, + "num_tokens": 5336015.0, + "step": 327 + }, + { + "entropy": 0.550067774951458, + "epoch": 1.2330827067669172, + "grad_norm": 0.03429507836699486, + "learning_rate": 0.0002, + "loss": 0.5445730686187744, + "mean_token_accuracy": 0.7788997292518616, + "num_tokens": 5352567.0, + "step": 328 + }, + { + "entropy": 0.5536926835775375, + "epoch": 1.236842105263158, + "grad_norm": 0.02860317751765251, + "learning_rate": 0.0002, + "loss": 0.5513879656791687, + "mean_token_accuracy": 0.7763962298631668, + "num_tokens": 5368974.0, + "step": 329 + }, + { + "entropy": 0.5571767240762711, + "epoch": 1.2406015037593985, + "grad_norm": 0.03053511306643486, + "learning_rate": 0.0002, + "loss": 0.5535838007926941, + "mean_token_accuracy": 0.7756504565477371, + "num_tokens": 5385405.0, + "step": 330 + }, + { + "entropy": 0.5644853711128235, + "epoch": 1.244360902255639, + "grad_norm": 0.02813347429037094, + "learning_rate": 0.0002, + "loss": 0.5661532282829285, + "mean_token_accuracy": 0.7694092392921448, + "num_tokens": 5401733.0, + "step": 331 + }, + { + "entropy": 0.554289311170578, + "epoch": 1.2481203007518797, + "grad_norm": 0.030001962557435036, + "learning_rate": 0.0002, + "loss": 0.5581742525100708, + "mean_token_accuracy": 0.7724047005176544, + "num_tokens": 5418343.0, + "step": 332 + }, + { + "entropy": 0.5443666130304337, + "epoch": 1.2518796992481203, + "grad_norm": 0.030697215348482132, + "learning_rate": 0.0002, + "loss": 0.5461480021476746, + "mean_token_accuracy": 0.7806287556886673, + "num_tokens": 5434583.0, + "step": 333 + }, + { + "entropy": 0.5332125425338745, + "epoch": 1.255639097744361, + "grad_norm": 0.031576018780469894, + "learning_rate": 0.0002, + "loss": 0.535359799861908, + "mean_token_accuracy": 0.7810158431529999, + "num_tokens": 5450746.0, + "step": 334 + }, + { + "entropy": 0.555268332362175, + "epoch": 1.2593984962406015, + "grad_norm": 0.027363646775484085, + "learning_rate": 0.0002, + "loss": 0.5560035109519958, + "mean_token_accuracy": 0.7736663818359375, + "num_tokens": 5467188.0, + "step": 335 + }, + { + "entropy": 0.5493292659521103, + "epoch": 1.263157894736842, + "grad_norm": 0.031114885583519936, + "learning_rate": 0.0002, + "loss": 0.5509231090545654, + "mean_token_accuracy": 0.7764100879430771, + "num_tokens": 5483617.0, + "step": 336 + }, + { + "entropy": 0.5554828643798828, + "epoch": 1.2669172932330828, + "grad_norm": 0.027718449011445045, + "learning_rate": 0.0002, + "loss": 0.5540401339530945, + "mean_token_accuracy": 0.7730122804641724, + "num_tokens": 5499950.0, + "step": 337 + }, + { + "entropy": 0.5383172035217285, + "epoch": 1.2706766917293233, + "grad_norm": 0.029059337452054024, + "learning_rate": 0.0002, + "loss": 0.5407942533493042, + "mean_token_accuracy": 0.7809923589229584, + "num_tokens": 5516241.0, + "step": 338 + }, + { + "entropy": 0.5302157253026962, + "epoch": 1.274436090225564, + "grad_norm": 0.030479708686470985, + "learning_rate": 0.0002, + "loss": 0.530126690864563, + "mean_token_accuracy": 0.7863384485244751, + "num_tokens": 5532841.0, + "step": 339 + }, + { + "entropy": 0.5322539657354355, + "epoch": 1.2781954887218046, + "grad_norm": 0.031503573060035706, + "learning_rate": 0.0002, + "loss": 0.5389677286148071, + "mean_token_accuracy": 0.77957783639431, + "num_tokens": 5549325.0, + "step": 340 + }, + { + "entropy": 0.5437572598457336, + "epoch": 1.281954887218045, + "grad_norm": 0.027867093682289124, + "learning_rate": 0.0002, + "loss": 0.5459513664245605, + "mean_token_accuracy": 0.7789556235074997, + "num_tokens": 5565810.0, + "step": 341 + }, + { + "entropy": 0.5430660545825958, + "epoch": 1.2857142857142856, + "grad_norm": 0.03420820087194443, + "learning_rate": 0.0002, + "loss": 0.5441212058067322, + "mean_token_accuracy": 0.7775195837020874, + "num_tokens": 5581844.0, + "step": 342 + }, + { + "entropy": 0.5310375243425369, + "epoch": 1.2894736842105263, + "grad_norm": 0.03065858967602253, + "learning_rate": 0.0002, + "loss": 0.5356528162956238, + "mean_token_accuracy": 0.7801522761583328, + "num_tokens": 5598042.0, + "step": 343 + }, + { + "entropy": 0.5220501720905304, + "epoch": 1.2932330827067668, + "grad_norm": 0.029243886470794678, + "learning_rate": 0.0002, + "loss": 0.516523540019989, + "mean_token_accuracy": 0.7906120866537094, + "num_tokens": 5614111.0, + "step": 344 + }, + { + "entropy": 0.5659748762845993, + "epoch": 1.2969924812030076, + "grad_norm": 0.03555883839726448, + "learning_rate": 0.0002, + "loss": 0.5587096214294434, + "mean_token_accuracy": 0.771675169467926, + "num_tokens": 5630635.0, + "step": 345 + }, + { + "entropy": 0.5501575618982315, + "epoch": 1.300751879699248, + "grad_norm": 0.030357254669070244, + "learning_rate": 0.0002, + "loss": 0.5473156571388245, + "mean_token_accuracy": 0.7771240919828415, + "num_tokens": 5646994.0, + "step": 346 + }, + { + "entropy": 0.5270983800292015, + "epoch": 1.3045112781954886, + "grad_norm": 0.030822839587926865, + "learning_rate": 0.0002, + "loss": 0.5363721251487732, + "mean_token_accuracy": 0.7837044894695282, + "num_tokens": 5663472.0, + "step": 347 + }, + { + "entropy": 0.5483475178480148, + "epoch": 1.3082706766917294, + "grad_norm": 0.03400631621479988, + "learning_rate": 0.0002, + "loss": 0.5550627708435059, + "mean_token_accuracy": 0.7723206877708435, + "num_tokens": 5679878.0, + "step": 348 + }, + { + "entropy": 0.5459110736846924, + "epoch": 1.3120300751879699, + "grad_norm": 0.028672240674495697, + "learning_rate": 0.0002, + "loss": 0.5484554767608643, + "mean_token_accuracy": 0.7754105031490326, + "num_tokens": 5696124.0, + "step": 349 + }, + { + "entropy": 0.5513360351324081, + "epoch": 1.3157894736842106, + "grad_norm": 0.029986541718244553, + "learning_rate": 0.0002, + "loss": 0.548675000667572, + "mean_token_accuracy": 0.7767119109630585, + "num_tokens": 5712240.0, + "step": 350 + }, + { + "entropy": 0.5394999980926514, + "epoch": 1.3195488721804511, + "grad_norm": 0.027749765664339066, + "learning_rate": 0.0002, + "loss": 0.5411927700042725, + "mean_token_accuracy": 0.7794090211391449, + "num_tokens": 5728487.0, + "step": 351 + }, + { + "entropy": 0.5632177442312241, + "epoch": 1.3233082706766917, + "grad_norm": 0.03165826201438904, + "learning_rate": 0.0002, + "loss": 0.5644969344139099, + "mean_token_accuracy": 0.7739209532737732, + "num_tokens": 5744665.0, + "step": 352 + }, + { + "entropy": 0.5484495759010315, + "epoch": 1.3270676691729324, + "grad_norm": 0.02855236455798149, + "learning_rate": 0.0002, + "loss": 0.5507109761238098, + "mean_token_accuracy": 0.7781708836555481, + "num_tokens": 5761081.0, + "step": 353 + }, + { + "entropy": 0.5463808476924896, + "epoch": 1.330827067669173, + "grad_norm": 0.033144768327474594, + "learning_rate": 0.0002, + "loss": 0.5490323901176453, + "mean_token_accuracy": 0.7771764546632767, + "num_tokens": 5777230.0, + "step": 354 + }, + { + "entropy": 0.559476301074028, + "epoch": 1.3345864661654137, + "grad_norm": 0.030584782361984253, + "learning_rate": 0.0002, + "loss": 0.5653771162033081, + "mean_token_accuracy": 0.7701748311519623, + "num_tokens": 5793509.0, + "step": 355 + }, + { + "entropy": 0.5580354928970337, + "epoch": 1.3383458646616542, + "grad_norm": 0.029205013066530228, + "learning_rate": 0.0002, + "loss": 0.5602571964263916, + "mean_token_accuracy": 0.7710904181003571, + "num_tokens": 5809901.0, + "step": 356 + }, + { + "entropy": 0.5673199146986008, + "epoch": 1.3421052631578947, + "grad_norm": 0.03065381944179535, + "learning_rate": 0.0002, + "loss": 0.5655714273452759, + "mean_token_accuracy": 0.7691835165023804, + "num_tokens": 5826128.0, + "step": 357 + }, + { + "entropy": 0.5535888224840164, + "epoch": 1.3458646616541352, + "grad_norm": 0.028708767145872116, + "learning_rate": 0.0002, + "loss": 0.5483720302581787, + "mean_token_accuracy": 0.7754883170127869, + "num_tokens": 5842416.0, + "step": 358 + }, + { + "entropy": 0.5565765500068665, + "epoch": 1.349624060150376, + "grad_norm": 0.031074965372681618, + "learning_rate": 0.0002, + "loss": 0.5588751435279846, + "mean_token_accuracy": 0.7724489718675613, + "num_tokens": 5858778.0, + "step": 359 + }, + { + "entropy": 0.5447706580162048, + "epoch": 1.3533834586466165, + "grad_norm": 0.031974222511053085, + "learning_rate": 0.0002, + "loss": 0.5503548979759216, + "mean_token_accuracy": 0.7767511457204819, + "num_tokens": 5875340.0, + "step": 360 + }, + { + "entropy": 0.5325894355773926, + "epoch": 1.3571428571428572, + "grad_norm": 0.036680273711681366, + "learning_rate": 0.0002, + "loss": 0.5425075888633728, + "mean_token_accuracy": 0.7785896062850952, + "num_tokens": 5891618.0, + "step": 361 + }, + { + "entropy": 0.5401211231946945, + "epoch": 1.3609022556390977, + "grad_norm": 0.030604355037212372, + "learning_rate": 0.0002, + "loss": 0.543202817440033, + "mean_token_accuracy": 0.7824591100215912, + "num_tokens": 5907777.0, + "step": 362 + }, + { + "entropy": 0.548919603228569, + "epoch": 1.3646616541353382, + "grad_norm": 0.02865537256002426, + "learning_rate": 0.0002, + "loss": 0.5504399538040161, + "mean_token_accuracy": 0.7752194404602051, + "num_tokens": 5924266.0, + "step": 363 + }, + { + "entropy": 0.5391300171613693, + "epoch": 1.368421052631579, + "grad_norm": 0.030051855370402336, + "learning_rate": 0.0002, + "loss": 0.5288874506950378, + "mean_token_accuracy": 0.7848425358533859, + "num_tokens": 5940334.0, + "step": 364 + }, + { + "entropy": 0.5440739095211029, + "epoch": 1.3721804511278195, + "grad_norm": 0.02727932669222355, + "learning_rate": 0.0002, + "loss": 0.5456202626228333, + "mean_token_accuracy": 0.7774905413389206, + "num_tokens": 5956646.0, + "step": 365 + }, + { + "entropy": 0.5311928540468216, + "epoch": 1.3759398496240602, + "grad_norm": 0.029294485226273537, + "learning_rate": 0.0002, + "loss": 0.5352226495742798, + "mean_token_accuracy": 0.7806590050458908, + "num_tokens": 5972841.0, + "step": 366 + }, + { + "entropy": 0.5386375188827515, + "epoch": 1.3796992481203008, + "grad_norm": 0.034396879374980927, + "learning_rate": 0.0002, + "loss": 0.5386478304862976, + "mean_token_accuracy": 0.780673161149025, + "num_tokens": 5989110.0, + "step": 367 + }, + { + "entropy": 0.5205325111746788, + "epoch": 1.3834586466165413, + "grad_norm": 0.028440408408641815, + "learning_rate": 0.0002, + "loss": 0.524253249168396, + "mean_token_accuracy": 0.7875637263059616, + "num_tokens": 6005130.0, + "step": 368 + }, + { + "entropy": 0.5718593895435333, + "epoch": 1.387218045112782, + "grad_norm": 0.03535715863108635, + "learning_rate": 0.0002, + "loss": 0.5674105882644653, + "mean_token_accuracy": 0.7696711122989655, + "num_tokens": 6021765.0, + "step": 369 + }, + { + "entropy": 0.5570171922445297, + "epoch": 1.3909774436090225, + "grad_norm": 0.02890731766819954, + "learning_rate": 0.0002, + "loss": 0.5550771951675415, + "mean_token_accuracy": 0.7735273241996765, + "num_tokens": 6038195.0, + "step": 370 + }, + { + "entropy": 0.5555340945720673, + "epoch": 1.3947368421052633, + "grad_norm": 0.03310281038284302, + "learning_rate": 0.0002, + "loss": 0.5569556951522827, + "mean_token_accuracy": 0.7722765356302261, + "num_tokens": 6054869.0, + "step": 371 + }, + { + "entropy": 0.5339787155389786, + "epoch": 1.3984962406015038, + "grad_norm": 0.0280836783349514, + "learning_rate": 0.0002, + "loss": 0.5336146354675293, + "mean_token_accuracy": 0.7833946198225021, + "num_tokens": 6071026.0, + "step": 372 + }, + { + "entropy": 0.5382460206747055, + "epoch": 1.4022556390977443, + "grad_norm": 0.028865907341241837, + "learning_rate": 0.0002, + "loss": 0.5415489077568054, + "mean_token_accuracy": 0.7795161455869675, + "num_tokens": 6087218.0, + "step": 373 + }, + { + "entropy": 0.5312956869602203, + "epoch": 1.4060150375939848, + "grad_norm": 0.029321739450097084, + "learning_rate": 0.0002, + "loss": 0.5310655832290649, + "mean_token_accuracy": 0.7824108898639679, + "num_tokens": 6103644.0, + "step": 374 + }, + { + "entropy": 0.5470356345176697, + "epoch": 1.4097744360902256, + "grad_norm": 0.035155754536390305, + "learning_rate": 0.0002, + "loss": 0.5525869131088257, + "mean_token_accuracy": 0.7761145532131195, + "num_tokens": 6120051.0, + "step": 375 + }, + { + "entropy": 0.5374057814478874, + "epoch": 1.413533834586466, + "grad_norm": 0.029863376170396805, + "learning_rate": 0.0002, + "loss": 0.542983889579773, + "mean_token_accuracy": 0.7801049947738647, + "num_tokens": 6136168.0, + "step": 376 + }, + { + "entropy": 0.5664133429527283, + "epoch": 1.4172932330827068, + "grad_norm": 0.04531969875097275, + "learning_rate": 0.0002, + "loss": 0.5716960430145264, + "mean_token_accuracy": 0.7669987082481384, + "num_tokens": 6152503.0, + "step": 377 + }, + { + "entropy": 0.5445482283830643, + "epoch": 1.4210526315789473, + "grad_norm": 0.031349968165159225, + "learning_rate": 0.0002, + "loss": 0.5467873811721802, + "mean_token_accuracy": 0.7808011472225189, + "num_tokens": 6168685.0, + "step": 378 + }, + { + "entropy": 0.5332349240779877, + "epoch": 1.4248120300751879, + "grad_norm": 0.03072705864906311, + "learning_rate": 0.0002, + "loss": 0.5336711406707764, + "mean_token_accuracy": 0.785218134522438, + "num_tokens": 6185265.0, + "step": 379 + }, + { + "entropy": 0.5406992584466934, + "epoch": 1.4285714285714286, + "grad_norm": 0.03197013586759567, + "learning_rate": 0.0002, + "loss": 0.535304605960846, + "mean_token_accuracy": 0.781609907746315, + "num_tokens": 6201359.0, + "step": 380 + }, + { + "entropy": 0.5503518134355545, + "epoch": 1.4323308270676691, + "grad_norm": 0.02861807495355606, + "learning_rate": 0.0002, + "loss": 0.5474637746810913, + "mean_token_accuracy": 0.7788266986608505, + "num_tokens": 6217636.0, + "step": 381 + }, + { + "entropy": 0.5336224138736725, + "epoch": 1.4360902255639099, + "grad_norm": 0.03593042492866516, + "learning_rate": 0.0002, + "loss": 0.5366555452346802, + "mean_token_accuracy": 0.7802215367555618, + "num_tokens": 6234047.0, + "step": 382 + }, + { + "entropy": 0.5492585748434067, + "epoch": 1.4398496240601504, + "grad_norm": 0.02969398722052574, + "learning_rate": 0.0002, + "loss": 0.5519292950630188, + "mean_token_accuracy": 0.77450992166996, + "num_tokens": 6250372.0, + "step": 383 + }, + { + "entropy": 0.5435014069080353, + "epoch": 1.443609022556391, + "grad_norm": 0.03131045401096344, + "learning_rate": 0.0002, + "loss": 0.5428797602653503, + "mean_token_accuracy": 0.7789845615625381, + "num_tokens": 6266490.0, + "step": 384 + }, + { + "entropy": 0.5582468658685684, + "epoch": 1.4473684210526316, + "grad_norm": 0.0334627628326416, + "learning_rate": 0.0002, + "loss": 0.5606057047843933, + "mean_token_accuracy": 0.7737329006195068, + "num_tokens": 6282965.0, + "step": 385 + }, + { + "entropy": 0.5667697936296463, + "epoch": 1.4511278195488722, + "grad_norm": 0.031320203095674515, + "learning_rate": 0.0002, + "loss": 0.5704291462898254, + "mean_token_accuracy": 0.7688294649124146, + "num_tokens": 6299265.0, + "step": 386 + }, + { + "entropy": 0.5566418468952179, + "epoch": 1.454887218045113, + "grad_norm": 0.04116431251168251, + "learning_rate": 0.0002, + "loss": 0.5568630695343018, + "mean_token_accuracy": 0.774201288819313, + "num_tokens": 6315434.0, + "step": 387 + }, + { + "entropy": 0.5492933839559555, + "epoch": 1.4586466165413534, + "grad_norm": 0.02759244106709957, + "learning_rate": 0.0002, + "loss": 0.5531164407730103, + "mean_token_accuracy": 0.7763701528310776, + "num_tokens": 6331760.0, + "step": 388 + }, + { + "entropy": 0.5672035366296768, + "epoch": 1.462406015037594, + "grad_norm": 0.03223001956939697, + "learning_rate": 0.0002, + "loss": 0.56959068775177, + "mean_token_accuracy": 0.768874928355217, + "num_tokens": 6348346.0, + "step": 389 + }, + { + "entropy": 0.5533206462860107, + "epoch": 1.4661654135338344, + "grad_norm": 0.03371699899435043, + "learning_rate": 0.0002, + "loss": 0.5532012581825256, + "mean_token_accuracy": 0.7752765119075775, + "num_tokens": 6364905.0, + "step": 390 + }, + { + "entropy": 0.5474317967891693, + "epoch": 1.4699248120300752, + "grad_norm": 0.033150747418403625, + "learning_rate": 0.0002, + "loss": 0.5470337867736816, + "mean_token_accuracy": 0.776570737361908, + "num_tokens": 6381253.0, + "step": 391 + }, + { + "entropy": 0.5514713823795319, + "epoch": 1.4736842105263157, + "grad_norm": 0.03456156328320503, + "learning_rate": 0.0002, + "loss": 0.5495055317878723, + "mean_token_accuracy": 0.7780424803495407, + "num_tokens": 6397488.0, + "step": 392 + }, + { + "entropy": 0.524335652589798, + "epoch": 1.4774436090225564, + "grad_norm": 0.0276760496199131, + "learning_rate": 0.0002, + "loss": 0.5228588581085205, + "mean_token_accuracy": 0.7869584411382675, + "num_tokens": 6413858.0, + "step": 393 + }, + { + "entropy": 0.5439832955598831, + "epoch": 1.481203007518797, + "grad_norm": 0.030009951442480087, + "learning_rate": 0.0002, + "loss": 0.5459988117218018, + "mean_token_accuracy": 0.7772574722766876, + "num_tokens": 6430056.0, + "step": 394 + }, + { + "entropy": 0.558243066072464, + "epoch": 1.4849624060150375, + "grad_norm": 0.03417029604315758, + "learning_rate": 0.0002, + "loss": 0.551323652267456, + "mean_token_accuracy": 0.7783164083957672, + "num_tokens": 6446633.0, + "step": 395 + }, + { + "entropy": 0.5622076392173767, + "epoch": 1.4887218045112782, + "grad_norm": 0.030520809814333916, + "learning_rate": 0.0002, + "loss": 0.5651980638504028, + "mean_token_accuracy": 0.7693700790405273, + "num_tokens": 6463061.0, + "step": 396 + }, + { + "entropy": 0.5262496769428253, + "epoch": 1.4924812030075187, + "grad_norm": 0.03385322168469429, + "learning_rate": 0.0002, + "loss": 0.5383599400520325, + "mean_token_accuracy": 0.7795081436634064, + "num_tokens": 6479394.0, + "step": 397 + }, + { + "entropy": 0.5428214818239212, + "epoch": 1.4962406015037595, + "grad_norm": 0.0344393290579319, + "learning_rate": 0.0002, + "loss": 0.5506508350372314, + "mean_token_accuracy": 0.776181235909462, + "num_tokens": 6495837.0, + "step": 398 + }, + { + "entropy": 0.5589512288570404, + "epoch": 1.5, + "grad_norm": 0.031076369807124138, + "learning_rate": 0.0002, + "loss": 0.5615136027336121, + "mean_token_accuracy": 0.7719069272279739, + "num_tokens": 6512096.0, + "step": 399 + }, + { + "entropy": 0.560438871383667, + "epoch": 1.5037593984962405, + "grad_norm": 0.03327278420329094, + "learning_rate": 0.0002, + "loss": 0.5491290092468262, + "mean_token_accuracy": 0.7760379314422607, + "num_tokens": 6528380.0, + "step": 400 + }, + { + "entropy": 0.543613851070404, + "epoch": 1.5075187969924813, + "grad_norm": 0.03218228369951248, + "learning_rate": 0.0002, + "loss": 0.5404437780380249, + "mean_token_accuracy": 0.7790963053703308, + "num_tokens": 6544607.0, + "step": 401 + }, + { + "entropy": 0.5582986176013947, + "epoch": 1.5112781954887218, + "grad_norm": 0.031328245997428894, + "learning_rate": 0.0002, + "loss": 0.5539280772209167, + "mean_token_accuracy": 0.7730978429317474, + "num_tokens": 6561161.0, + "step": 402 + }, + { + "entropy": 0.5439886897802353, + "epoch": 1.5150375939849625, + "grad_norm": 0.0315370075404644, + "learning_rate": 0.0002, + "loss": 0.5494069457054138, + "mean_token_accuracy": 0.77658711373806, + "num_tokens": 6577494.0, + "step": 403 + }, + { + "entropy": 0.5441574305295944, + "epoch": 1.518796992481203, + "grad_norm": 0.029565030708909035, + "learning_rate": 0.0002, + "loss": 0.5542066097259521, + "mean_token_accuracy": 0.7728031128644943, + "num_tokens": 6593864.0, + "step": 404 + }, + { + "entropy": 0.5381332039833069, + "epoch": 1.5225563909774436, + "grad_norm": 0.030989129096269608, + "learning_rate": 0.0002, + "loss": 0.5439568758010864, + "mean_token_accuracy": 0.7784450650215149, + "num_tokens": 6610189.0, + "step": 405 + }, + { + "entropy": 0.5451879501342773, + "epoch": 1.526315789473684, + "grad_norm": 0.030062349513173103, + "learning_rate": 0.0002, + "loss": 0.5435837507247925, + "mean_token_accuracy": 0.7782586812973022, + "num_tokens": 6626574.0, + "step": 406 + }, + { + "entropy": 0.5333066508173943, + "epoch": 1.5300751879699248, + "grad_norm": 0.02931753545999527, + "learning_rate": 0.0002, + "loss": 0.52620530128479, + "mean_token_accuracy": 0.784236952662468, + "num_tokens": 6642855.0, + "step": 407 + }, + { + "entropy": 0.5590699911117554, + "epoch": 1.5338345864661656, + "grad_norm": 0.03177345171570778, + "learning_rate": 0.0002, + "loss": 0.5554062128067017, + "mean_token_accuracy": 0.7730756998062134, + "num_tokens": 6659323.0, + "step": 408 + }, + { + "entropy": 0.5350319743156433, + "epoch": 1.537593984962406, + "grad_norm": 0.033441949635744095, + "learning_rate": 0.0002, + "loss": 0.5428333282470703, + "mean_token_accuracy": 0.7798242121934891, + "num_tokens": 6675571.0, + "step": 409 + }, + { + "entropy": 0.5449950993061066, + "epoch": 1.5413533834586466, + "grad_norm": 0.03087989240884781, + "learning_rate": 0.0002, + "loss": 0.550757646560669, + "mean_token_accuracy": 0.7777638882398605, + "num_tokens": 6692022.0, + "step": 410 + }, + { + "entropy": 0.5534456223249435, + "epoch": 1.545112781954887, + "grad_norm": 0.030627673491835594, + "learning_rate": 0.0002, + "loss": 0.5566884875297546, + "mean_token_accuracy": 0.7747643887996674, + "num_tokens": 6708348.0, + "step": 411 + }, + { + "entropy": 0.5696779191493988, + "epoch": 1.5488721804511278, + "grad_norm": 0.029869280755519867, + "learning_rate": 0.0002, + "loss": 0.5629582405090332, + "mean_token_accuracy": 0.7705719769001007, + "num_tokens": 6725016.0, + "step": 412 + }, + { + "entropy": 0.5336505770683289, + "epoch": 1.5526315789473686, + "grad_norm": 0.02911611832678318, + "learning_rate": 0.0002, + "loss": 0.5279027223587036, + "mean_token_accuracy": 0.783367246389389, + "num_tokens": 6741327.0, + "step": 413 + }, + { + "entropy": 0.5392275899648666, + "epoch": 1.556390977443609, + "grad_norm": 0.02994578517973423, + "learning_rate": 0.0002, + "loss": 0.5416238307952881, + "mean_token_accuracy": 0.7807497531175613, + "num_tokens": 6757440.0, + "step": 414 + }, + { + "entropy": 0.5460323542356491, + "epoch": 1.5601503759398496, + "grad_norm": 0.03534119576215744, + "learning_rate": 0.0002, + "loss": 0.5568557977676392, + "mean_token_accuracy": 0.7705673724412918, + "num_tokens": 6773654.0, + "step": 415 + }, + { + "entropy": 0.5286229997873306, + "epoch": 1.5639097744360901, + "grad_norm": 0.029811112210154533, + "learning_rate": 0.0002, + "loss": 0.5318726301193237, + "mean_token_accuracy": 0.7832337915897369, + "num_tokens": 6789752.0, + "step": 416 + }, + { + "entropy": 0.5552769899368286, + "epoch": 1.5676691729323309, + "grad_norm": 0.030895395204424858, + "learning_rate": 0.0002, + "loss": 0.5534340739250183, + "mean_token_accuracy": 0.7729407846927643, + "num_tokens": 6805849.0, + "step": 417 + }, + { + "entropy": 0.5429228097200394, + "epoch": 1.5714285714285714, + "grad_norm": 0.02707672491669655, + "learning_rate": 0.0002, + "loss": 0.5381065607070923, + "mean_token_accuracy": 0.7819552570581436, + "num_tokens": 6822408.0, + "step": 418 + }, + { + "entropy": 0.5434612482786179, + "epoch": 1.5751879699248121, + "grad_norm": 0.031254079192876816, + "learning_rate": 0.0002, + "loss": 0.5391129851341248, + "mean_token_accuracy": 0.7833003848791122, + "num_tokens": 6838597.0, + "step": 419 + }, + { + "entropy": 0.5366530418395996, + "epoch": 1.5789473684210527, + "grad_norm": 0.03022637590765953, + "learning_rate": 0.0002, + "loss": 0.5400729179382324, + "mean_token_accuracy": 0.7778450101613998, + "num_tokens": 6854952.0, + "step": 420 + }, + { + "entropy": 0.5444828122854233, + "epoch": 1.5827067669172932, + "grad_norm": 0.031558163464069366, + "learning_rate": 0.0002, + "loss": 0.5507203936576843, + "mean_token_accuracy": 0.7739860564470291, + "num_tokens": 6871383.0, + "step": 421 + }, + { + "entropy": 0.5397373139858246, + "epoch": 1.5864661654135337, + "grad_norm": 0.03590668365359306, + "learning_rate": 0.0002, + "loss": 0.5495097041130066, + "mean_token_accuracy": 0.7745723277330399, + "num_tokens": 6887614.0, + "step": 422 + }, + { + "entropy": 0.5547508299350739, + "epoch": 1.5902255639097744, + "grad_norm": 0.03271407634019852, + "learning_rate": 0.0002, + "loss": 0.5595258474349976, + "mean_token_accuracy": 0.7740814536809921, + "num_tokens": 6903891.0, + "step": 423 + }, + { + "entropy": 0.5452055484056473, + "epoch": 1.5939849624060152, + "grad_norm": 0.034447524696588516, + "learning_rate": 0.0002, + "loss": 0.5422000288963318, + "mean_token_accuracy": 0.7810980677604675, + "num_tokens": 6920317.0, + "step": 424 + }, + { + "entropy": 0.5475759953260422, + "epoch": 1.5977443609022557, + "grad_norm": 0.027404673397541046, + "learning_rate": 0.0002, + "loss": 0.5450745820999146, + "mean_token_accuracy": 0.7764957696199417, + "num_tokens": 6936706.0, + "step": 425 + }, + { + "entropy": 0.5484007894992828, + "epoch": 1.6015037593984962, + "grad_norm": 0.031125633046030998, + "learning_rate": 0.0002, + "loss": 0.5480135083198547, + "mean_token_accuracy": 0.7771385014057159, + "num_tokens": 6952874.0, + "step": 426 + }, + { + "entropy": 0.5364782959222794, + "epoch": 1.6052631578947367, + "grad_norm": 0.029450541362166405, + "learning_rate": 0.0002, + "loss": 0.5340723395347595, + "mean_token_accuracy": 0.7846143394708633, + "num_tokens": 6969087.0, + "step": 427 + }, + { + "entropy": 0.5632024109363556, + "epoch": 1.6090225563909775, + "grad_norm": 0.03085445798933506, + "learning_rate": 0.0002, + "loss": 0.56367427110672, + "mean_token_accuracy": 0.7722935974597931, + "num_tokens": 6985519.0, + "step": 428 + }, + { + "entropy": 0.5589936077594757, + "epoch": 1.6127819548872182, + "grad_norm": 0.03428523615002632, + "learning_rate": 0.0002, + "loss": 0.5611156225204468, + "mean_token_accuracy": 0.7728175222873688, + "num_tokens": 7001978.0, + "step": 429 + }, + { + "entropy": 0.5625983476638794, + "epoch": 1.6165413533834587, + "grad_norm": 0.03059856966137886, + "learning_rate": 0.0002, + "loss": 0.5613099932670593, + "mean_token_accuracy": 0.7710365056991577, + "num_tokens": 7018277.0, + "step": 430 + }, + { + "entropy": 0.5519939213991165, + "epoch": 1.6203007518796992, + "grad_norm": 0.030437655746936798, + "learning_rate": 0.0002, + "loss": 0.545467734336853, + "mean_token_accuracy": 0.778165876865387, + "num_tokens": 7034622.0, + "step": 431 + }, + { + "entropy": 0.5278475731611252, + "epoch": 1.6240601503759398, + "grad_norm": 0.027164338156580925, + "learning_rate": 0.0002, + "loss": 0.5260958075523376, + "mean_token_accuracy": 0.7867996096611023, + "num_tokens": 7050833.0, + "step": 432 + }, + { + "entropy": 0.5364744961261749, + "epoch": 1.6278195488721805, + "grad_norm": 0.02916925586760044, + "learning_rate": 0.0002, + "loss": 0.5371173024177551, + "mean_token_accuracy": 0.7820777744054794, + "num_tokens": 7067201.0, + "step": 433 + }, + { + "entropy": 0.5432325303554535, + "epoch": 1.631578947368421, + "grad_norm": 0.02878529019653797, + "learning_rate": 0.0002, + "loss": 0.5453219413757324, + "mean_token_accuracy": 0.7784911543130875, + "num_tokens": 7083919.0, + "step": 434 + }, + { + "entropy": 0.5461350232362747, + "epoch": 1.6353383458646618, + "grad_norm": 0.030911264941096306, + "learning_rate": 0.0002, + "loss": 0.5520428419113159, + "mean_token_accuracy": 0.7748389393091202, + "num_tokens": 7100167.0, + "step": 435 + }, + { + "entropy": 0.5301318913698196, + "epoch": 1.6390977443609023, + "grad_norm": 0.0337194949388504, + "learning_rate": 0.0002, + "loss": 0.533911406993866, + "mean_token_accuracy": 0.781144917011261, + "num_tokens": 7115963.0, + "step": 436 + }, + { + "entropy": 0.554198831319809, + "epoch": 1.6428571428571428, + "grad_norm": 0.03273259475827217, + "learning_rate": 0.0002, + "loss": 0.5581203699111938, + "mean_token_accuracy": 0.7747189700603485, + "num_tokens": 7132343.0, + "step": 437 + }, + { + "entropy": 0.5451264977455139, + "epoch": 1.6466165413533833, + "grad_norm": 0.028795765712857246, + "learning_rate": 0.0002, + "loss": 0.5419780015945435, + "mean_token_accuracy": 0.7782856971025467, + "num_tokens": 7148711.0, + "step": 438 + }, + { + "entropy": 0.5696405470371246, + "epoch": 1.650375939849624, + "grad_norm": 0.02880324050784111, + "learning_rate": 0.0002, + "loss": 0.568999171257019, + "mean_token_accuracy": 0.7674362361431122, + "num_tokens": 7165000.0, + "step": 439 + }, + { + "entropy": 0.5544975996017456, + "epoch": 1.6541353383458648, + "grad_norm": 0.0319298580288887, + "learning_rate": 0.0002, + "loss": 0.5572612881660461, + "mean_token_accuracy": 0.7738819718360901, + "num_tokens": 7181178.0, + "step": 440 + }, + { + "entropy": 0.5648850053548813, + "epoch": 1.6578947368421053, + "grad_norm": 0.033446941524744034, + "learning_rate": 0.0002, + "loss": 0.5726531147956848, + "mean_token_accuracy": 0.767191156744957, + "num_tokens": 7197682.0, + "step": 441 + }, + { + "entropy": 0.5558575242757797, + "epoch": 1.6616541353383458, + "grad_norm": 0.02976951375603676, + "learning_rate": 0.0002, + "loss": 0.5575220584869385, + "mean_token_accuracy": 0.7738383561372757, + "num_tokens": 7214036.0, + "step": 442 + }, + { + "entropy": 0.5415066331624985, + "epoch": 1.6654135338345863, + "grad_norm": 0.03178182989358902, + "learning_rate": 0.0002, + "loss": 0.5425861477851868, + "mean_token_accuracy": 0.777436301112175, + "num_tokens": 7230232.0, + "step": 443 + }, + { + "entropy": 0.5568071007728577, + "epoch": 1.669172932330827, + "grad_norm": 0.029093647375702858, + "learning_rate": 0.0002, + "loss": 0.5502623319625854, + "mean_token_accuracy": 0.7746951729059219, + "num_tokens": 7246458.0, + "step": 444 + }, + { + "entropy": 0.5455858707427979, + "epoch": 1.6729323308270678, + "grad_norm": 0.03103097900748253, + "learning_rate": 0.0002, + "loss": 0.5415849685668945, + "mean_token_accuracy": 0.7773046642541885, + "num_tokens": 7262757.0, + "step": 445 + }, + { + "entropy": 0.5557373017072678, + "epoch": 1.6766917293233083, + "grad_norm": 0.034459494054317474, + "learning_rate": 0.0002, + "loss": 0.5588368773460388, + "mean_token_accuracy": 0.7731840759515762, + "num_tokens": 7279011.0, + "step": 446 + }, + { + "entropy": 0.536065399646759, + "epoch": 1.6804511278195489, + "grad_norm": 0.030954651534557343, + "learning_rate": 0.0002, + "loss": 0.5398183465003967, + "mean_token_accuracy": 0.778962567448616, + "num_tokens": 7295450.0, + "step": 447 + }, + { + "entropy": 0.5364357531070709, + "epoch": 1.6842105263157894, + "grad_norm": 0.03524971008300781, + "learning_rate": 0.0002, + "loss": 0.5447929501533508, + "mean_token_accuracy": 0.7776346057653427, + "num_tokens": 7311638.0, + "step": 448 + }, + { + "entropy": 0.5611797869205475, + "epoch": 1.6879699248120301, + "grad_norm": 0.02808379754424095, + "learning_rate": 0.0002, + "loss": 0.5557354688644409, + "mean_token_accuracy": 0.7739097476005554, + "num_tokens": 7327872.0, + "step": 449 + }, + { + "entropy": 0.5732033550739288, + "epoch": 1.6917293233082706, + "grad_norm": 0.03260007128119469, + "learning_rate": 0.0002, + "loss": 0.5591524839401245, + "mean_token_accuracy": 0.775033637881279, + "num_tokens": 7344324.0, + "step": 450 + }, + { + "entropy": 0.5342790335416794, + "epoch": 1.6954887218045114, + "grad_norm": 0.02984827756881714, + "learning_rate": 0.0002, + "loss": 0.5380273461341858, + "mean_token_accuracy": 0.782566487789154, + "num_tokens": 7360753.0, + "step": 451 + }, + { + "entropy": 0.5318778306245804, + "epoch": 1.699248120300752, + "grad_norm": 0.03279503807425499, + "learning_rate": 0.0002, + "loss": 0.544060468673706, + "mean_token_accuracy": 0.7762828469276428, + "num_tokens": 7377154.0, + "step": 452 + }, + { + "entropy": 0.5356487184762955, + "epoch": 1.7030075187969924, + "grad_norm": 0.03332759812474251, + "learning_rate": 0.0002, + "loss": 0.548007607460022, + "mean_token_accuracy": 0.7769170254468918, + "num_tokens": 7393621.0, + "step": 453 + }, + { + "entropy": 0.5513975322246552, + "epoch": 1.706766917293233, + "grad_norm": 0.03238146752119064, + "learning_rate": 0.0002, + "loss": 0.5592359900474548, + "mean_token_accuracy": 0.7740825116634369, + "num_tokens": 7409899.0, + "step": 454 + }, + { + "entropy": 0.5548000931739807, + "epoch": 1.7105263157894737, + "grad_norm": 0.02822866663336754, + "learning_rate": 0.0002, + "loss": 0.5497517585754395, + "mean_token_accuracy": 0.776210606098175, + "num_tokens": 7426237.0, + "step": 455 + }, + { + "entropy": 0.5756575465202332, + "epoch": 1.7142857142857144, + "grad_norm": 0.027675755321979523, + "learning_rate": 0.0002, + "loss": 0.5697333812713623, + "mean_token_accuracy": 0.7680118083953857, + "num_tokens": 7442768.0, + "step": 456 + }, + { + "entropy": 0.5417828410863876, + "epoch": 1.718045112781955, + "grad_norm": 0.033404842019081116, + "learning_rate": 0.0002, + "loss": 0.5454074740409851, + "mean_token_accuracy": 0.7808687537908554, + "num_tokens": 7459143.0, + "step": 457 + }, + { + "entropy": 0.5427983999252319, + "epoch": 1.7218045112781954, + "grad_norm": 0.03309955820441246, + "learning_rate": 0.0002, + "loss": 0.5416461825370789, + "mean_token_accuracy": 0.7808773517608643, + "num_tokens": 7475461.0, + "step": 458 + }, + { + "entropy": 0.5505435466766357, + "epoch": 1.725563909774436, + "grad_norm": 0.034179892390966415, + "learning_rate": 0.0002, + "loss": 0.5560557246208191, + "mean_token_accuracy": 0.7720683664083481, + "num_tokens": 7491762.0, + "step": 459 + }, + { + "entropy": 0.5398002862930298, + "epoch": 1.7293233082706767, + "grad_norm": 0.036437805742025375, + "learning_rate": 0.0002, + "loss": 0.5529733896255493, + "mean_token_accuracy": 0.7730463594198227, + "num_tokens": 7507801.0, + "step": 460 + }, + { + "entropy": 0.5538046360015869, + "epoch": 1.7330827067669174, + "grad_norm": 0.038074180483818054, + "learning_rate": 0.0002, + "loss": 0.5474164485931396, + "mean_token_accuracy": 0.7738546878099442, + "num_tokens": 7524195.0, + "step": 461 + }, + { + "entropy": 0.5446304082870483, + "epoch": 1.736842105263158, + "grad_norm": 0.028863312676548958, + "learning_rate": 0.0002, + "loss": 0.534104585647583, + "mean_token_accuracy": 0.7812709957361221, + "num_tokens": 7540346.0, + "step": 462 + }, + { + "entropy": 0.5635255128145218, + "epoch": 1.7406015037593985, + "grad_norm": 0.0377831794321537, + "learning_rate": 0.0002, + "loss": 0.5565074682235718, + "mean_token_accuracy": 0.7726516425609589, + "num_tokens": 7556361.0, + "step": 463 + }, + { + "entropy": 0.5520550906658173, + "epoch": 1.744360902255639, + "grad_norm": 0.027316391468048096, + "learning_rate": 0.0002, + "loss": 0.5496057868003845, + "mean_token_accuracy": 0.7767691016197205, + "num_tokens": 7572407.0, + "step": 464 + }, + { + "entropy": 0.5517378151416779, + "epoch": 1.7481203007518797, + "grad_norm": 0.03549322485923767, + "learning_rate": 0.0002, + "loss": 0.5542277097702026, + "mean_token_accuracy": 0.7771301567554474, + "num_tokens": 7588716.0, + "step": 465 + }, + { + "entropy": 0.5447746813297272, + "epoch": 1.7518796992481203, + "grad_norm": 0.03821020945906639, + "learning_rate": 0.0002, + "loss": 0.558238685131073, + "mean_token_accuracy": 0.7732566744089127, + "num_tokens": 7604921.0, + "step": 466 + }, + { + "entropy": 0.5422779768705368, + "epoch": 1.755639097744361, + "grad_norm": 0.03218455985188484, + "learning_rate": 0.0002, + "loss": 0.549083411693573, + "mean_token_accuracy": 0.7762202769517899, + "num_tokens": 7621109.0, + "step": 467 + }, + { + "entropy": 0.5479860007762909, + "epoch": 1.7593984962406015, + "grad_norm": 0.03186026215553284, + "learning_rate": 0.0002, + "loss": 0.5414553880691528, + "mean_token_accuracy": 0.7800420671701431, + "num_tokens": 7637434.0, + "step": 468 + }, + { + "entropy": 0.5488834828138351, + "epoch": 1.763157894736842, + "grad_norm": 0.030316263437271118, + "learning_rate": 0.0002, + "loss": 0.5371969938278198, + "mean_token_accuracy": 0.7800302803516388, + "num_tokens": 7653708.0, + "step": 469 + }, + { + "entropy": 0.5712478011846542, + "epoch": 1.7669172932330826, + "grad_norm": 0.0292644202709198, + "learning_rate": 0.0002, + "loss": 0.5641398429870605, + "mean_token_accuracy": 0.7701270431280136, + "num_tokens": 7670165.0, + "step": 470 + }, + { + "entropy": 0.5487608909606934, + "epoch": 1.7706766917293233, + "grad_norm": 0.029384015128016472, + "learning_rate": 0.0002, + "loss": 0.5528495907783508, + "mean_token_accuracy": 0.7725293934345245, + "num_tokens": 7686546.0, + "step": 471 + }, + { + "entropy": 0.5485792607069016, + "epoch": 1.774436090225564, + "grad_norm": 0.03848496824502945, + "learning_rate": 0.0002, + "loss": 0.557949960231781, + "mean_token_accuracy": 0.7736170142889023, + "num_tokens": 7703199.0, + "step": 472 + }, + { + "entropy": 0.5328742563724518, + "epoch": 1.7781954887218046, + "grad_norm": 0.029961325228214264, + "learning_rate": 0.0002, + "loss": 0.5426016449928284, + "mean_token_accuracy": 0.7784318327903748, + "num_tokens": 7719414.0, + "step": 473 + }, + { + "entropy": 0.5418206453323364, + "epoch": 1.781954887218045, + "grad_norm": 0.03003692626953125, + "learning_rate": 0.0002, + "loss": 0.543552815914154, + "mean_token_accuracy": 0.777516707777977, + "num_tokens": 7735591.0, + "step": 474 + }, + { + "entropy": 0.5588981062173843, + "epoch": 1.7857142857142856, + "grad_norm": 0.035983212292194366, + "learning_rate": 0.0002, + "loss": 0.5562595725059509, + "mean_token_accuracy": 0.7752551138401031, + "num_tokens": 7751978.0, + "step": 475 + }, + { + "entropy": 0.5337852984666824, + "epoch": 1.7894736842105263, + "grad_norm": 0.030708249658346176, + "learning_rate": 0.0002, + "loss": 0.5263274312019348, + "mean_token_accuracy": 0.7854783833026886, + "num_tokens": 7768537.0, + "step": 476 + }, + { + "entropy": 0.5388501137495041, + "epoch": 1.793233082706767, + "grad_norm": 0.034256935119628906, + "learning_rate": 0.0002, + "loss": 0.5432993173599243, + "mean_token_accuracy": 0.7769720703363419, + "num_tokens": 7784830.0, + "step": 477 + }, + { + "entropy": 0.5526683777570724, + "epoch": 1.7969924812030076, + "grad_norm": 0.030191054567694664, + "learning_rate": 0.0002, + "loss": 0.5529841184616089, + "mean_token_accuracy": 0.774674654006958, + "num_tokens": 7801305.0, + "step": 478 + }, + { + "entropy": 0.5205394625663757, + "epoch": 1.800751879699248, + "grad_norm": 0.03705041483044624, + "learning_rate": 0.0002, + "loss": 0.5320290327072144, + "mean_token_accuracy": 0.7844933271408081, + "num_tokens": 7817468.0, + "step": 479 + }, + { + "entropy": 0.5391060262918472, + "epoch": 1.8045112781954886, + "grad_norm": 0.03425837680697441, + "learning_rate": 0.0002, + "loss": 0.5482912659645081, + "mean_token_accuracy": 0.7772899568080902, + "num_tokens": 7833783.0, + "step": 480 + }, + { + "entropy": 0.5595878064632416, + "epoch": 1.8082706766917294, + "grad_norm": 0.03261560574173927, + "learning_rate": 0.0002, + "loss": 0.5595347881317139, + "mean_token_accuracy": 0.7739517390727997, + "num_tokens": 7850116.0, + "step": 481 + }, + { + "entropy": 0.5623766779899597, + "epoch": 1.8120300751879699, + "grad_norm": 0.030305257067084312, + "learning_rate": 0.0002, + "loss": 0.5494015216827393, + "mean_token_accuracy": 0.7756963670253754, + "num_tokens": 7866336.0, + "step": 482 + }, + { + "entropy": 0.5707903653383255, + "epoch": 1.8157894736842106, + "grad_norm": 0.030717138200998306, + "learning_rate": 0.0002, + "loss": 0.5605000257492065, + "mean_token_accuracy": 0.7702891528606415, + "num_tokens": 7882899.0, + "step": 483 + }, + { + "entropy": 0.5296159312129021, + "epoch": 1.8195488721804511, + "grad_norm": 0.03342661261558533, + "learning_rate": 0.0002, + "loss": 0.5307406783103943, + "mean_token_accuracy": 0.7850563228130341, + "num_tokens": 7899131.0, + "step": 484 + }, + { + "entropy": 0.545372724533081, + "epoch": 1.8233082706766917, + "grad_norm": 0.0327008031308651, + "learning_rate": 0.0002, + "loss": 0.5443350076675415, + "mean_token_accuracy": 0.7800664007663727, + "num_tokens": 7915449.0, + "step": 485 + }, + { + "entropy": 0.5288603901863098, + "epoch": 1.8270676691729322, + "grad_norm": 0.03246629983186722, + "learning_rate": 0.0002, + "loss": 0.5420411229133606, + "mean_token_accuracy": 0.779539629817009, + "num_tokens": 7931703.0, + "step": 486 + }, + { + "entropy": 0.5476890802383423, + "epoch": 1.830827067669173, + "grad_norm": 0.03365527465939522, + "learning_rate": 0.0002, + "loss": 0.5550553798675537, + "mean_token_accuracy": 0.7729549556970596, + "num_tokens": 7948074.0, + "step": 487 + }, + { + "entropy": 0.5389307886362076, + "epoch": 1.8345864661654137, + "grad_norm": 0.036491431295871735, + "learning_rate": 0.0002, + "loss": 0.5469198822975159, + "mean_token_accuracy": 0.7751343995332718, + "num_tokens": 7964150.0, + "step": 488 + }, + { + "entropy": 0.5449552834033966, + "epoch": 1.8383458646616542, + "grad_norm": 0.03082645684480667, + "learning_rate": 0.0002, + "loss": 0.5452861189842224, + "mean_token_accuracy": 0.7780899852514267, + "num_tokens": 7980409.0, + "step": 489 + }, + { + "entropy": 0.5490948259830475, + "epoch": 1.8421052631578947, + "grad_norm": 0.031109903007745743, + "learning_rate": 0.0002, + "loss": 0.5441408157348633, + "mean_token_accuracy": 0.778783529996872, + "num_tokens": 7996889.0, + "step": 490 + }, + { + "entropy": 0.5475451499223709, + "epoch": 1.8458646616541352, + "grad_norm": 0.030056826770305634, + "learning_rate": 0.0002, + "loss": 0.5430116653442383, + "mean_token_accuracy": 0.7810570746660233, + "num_tokens": 8013259.0, + "step": 491 + }, + { + "entropy": 0.559479296207428, + "epoch": 1.849624060150376, + "grad_norm": 0.035820432007312775, + "learning_rate": 0.0002, + "loss": 0.5568897128105164, + "mean_token_accuracy": 0.7710603177547455, + "num_tokens": 8029520.0, + "step": 492 + }, + { + "entropy": 0.5462630242109299, + "epoch": 1.8533834586466167, + "grad_norm": 0.031395427882671356, + "learning_rate": 0.0002, + "loss": 0.5490817427635193, + "mean_token_accuracy": 0.7747374475002289, + "num_tokens": 8045599.0, + "step": 493 + }, + { + "entropy": 0.5427971929311752, + "epoch": 1.8571428571428572, + "grad_norm": 0.032419510185718536, + "learning_rate": 0.0002, + "loss": 0.547596275806427, + "mean_token_accuracy": 0.7759164273738861, + "num_tokens": 8062030.0, + "step": 494 + }, + { + "entropy": 0.5488359779119492, + "epoch": 1.8609022556390977, + "grad_norm": 0.03382895514369011, + "learning_rate": 0.0002, + "loss": 0.5546596646308899, + "mean_token_accuracy": 0.7742781788110733, + "num_tokens": 8078279.0, + "step": 495 + }, + { + "entropy": 0.5563898682594299, + "epoch": 1.8646616541353382, + "grad_norm": 0.030559495091438293, + "learning_rate": 0.0002, + "loss": 0.5596904754638672, + "mean_token_accuracy": 0.7740778177976608, + "num_tokens": 8094627.0, + "step": 496 + }, + { + "entropy": 0.5448739975690842, + "epoch": 1.868421052631579, + "grad_norm": 0.029570002108812332, + "learning_rate": 0.0002, + "loss": 0.5441548824310303, + "mean_token_accuracy": 0.7791137993335724, + "num_tokens": 8111057.0, + "step": 497 + }, + { + "entropy": 0.5403100103139877, + "epoch": 1.8721804511278195, + "grad_norm": 0.028860216960310936, + "learning_rate": 0.0002, + "loss": 0.5392476916313171, + "mean_token_accuracy": 0.7823552191257477, + "num_tokens": 8127458.0, + "step": 498 + }, + { + "entropy": 0.547279953956604, + "epoch": 1.8759398496240602, + "grad_norm": 0.03563547134399414, + "learning_rate": 0.0002, + "loss": 0.5528260469436646, + "mean_token_accuracy": 0.7767119854688644, + "num_tokens": 8143862.0, + "step": 499 + }, + { + "entropy": 0.5525589138269424, + "epoch": 1.8796992481203008, + "grad_norm": 0.03100893273949623, + "learning_rate": 0.0002, + "loss": 0.5514292120933533, + "mean_token_accuracy": 0.7746975123882294, + "num_tokens": 8160155.0, + "step": 500 + }, + { + "entropy": 0.5513135939836502, + "epoch": 1.8834586466165413, + "grad_norm": 0.0315982848405838, + "learning_rate": 0.0002, + "loss": 0.5519658923149109, + "mean_token_accuracy": 0.7756119072437286, + "num_tokens": 8176700.0, + "step": 501 + }, + { + "entropy": 0.5485852658748627, + "epoch": 1.8872180451127818, + "grad_norm": 0.031329069286584854, + "learning_rate": 0.0002, + "loss": 0.5463511347770691, + "mean_token_accuracy": 0.779010608792305, + "num_tokens": 8193245.0, + "step": 502 + }, + { + "entropy": 0.5625745803117752, + "epoch": 1.8909774436090225, + "grad_norm": 0.029315905645489693, + "learning_rate": 0.0002, + "loss": 0.5607528686523438, + "mean_token_accuracy": 0.7741692066192627, + "num_tokens": 8209893.0, + "step": 503 + }, + { + "entropy": 0.5387315452098846, + "epoch": 1.8947368421052633, + "grad_norm": 0.03832435607910156, + "learning_rate": 0.0002, + "loss": 0.5399753451347351, + "mean_token_accuracy": 0.781536191701889, + "num_tokens": 8226239.0, + "step": 504 + }, + { + "entropy": 0.544891282916069, + "epoch": 1.8984962406015038, + "grad_norm": 0.03846210241317749, + "learning_rate": 0.0002, + "loss": 0.5546903610229492, + "mean_token_accuracy": 0.7764989882707596, + "num_tokens": 8242463.0, + "step": 505 + }, + { + "entropy": 0.5383649319410324, + "epoch": 1.9022556390977443, + "grad_norm": 0.029546573758125305, + "learning_rate": 0.0002, + "loss": 0.5443148016929626, + "mean_token_accuracy": 0.7801246345043182, + "num_tokens": 8258870.0, + "step": 506 + }, + { + "entropy": 0.5518875420093536, + "epoch": 1.9060150375939848, + "grad_norm": 0.03868366405367851, + "learning_rate": 0.0002, + "loss": 0.56158447265625, + "mean_token_accuracy": 0.7744181603193283, + "num_tokens": 8275059.0, + "step": 507 + }, + { + "entropy": 0.5304814428091049, + "epoch": 1.9097744360902256, + "grad_norm": 0.030545437708497047, + "learning_rate": 0.0002, + "loss": 0.5301219820976257, + "mean_token_accuracy": 0.7852053344249725, + "num_tokens": 8291105.0, + "step": 508 + }, + { + "entropy": 0.5690664052963257, + "epoch": 1.9135338345864663, + "grad_norm": 0.032348547130823135, + "learning_rate": 0.0002, + "loss": 0.5622092485427856, + "mean_token_accuracy": 0.769376203417778, + "num_tokens": 8307569.0, + "step": 509 + }, + { + "entropy": 0.5624774992465973, + "epoch": 1.9172932330827068, + "grad_norm": 0.02640698291361332, + "learning_rate": 0.0002, + "loss": 0.5545241236686707, + "mean_token_accuracy": 0.7744268774986267, + "num_tokens": 8323912.0, + "step": 510 + }, + { + "entropy": 0.5579835772514343, + "epoch": 1.9210526315789473, + "grad_norm": 0.031412333250045776, + "learning_rate": 0.0002, + "loss": 0.5539452433586121, + "mean_token_accuracy": 0.774582713842392, + "num_tokens": 8340119.0, + "step": 511 + }, + { + "entropy": 0.542325347661972, + "epoch": 1.9248120300751879, + "grad_norm": 0.030913738533854485, + "learning_rate": 0.0002, + "loss": 0.5458105802536011, + "mean_token_accuracy": 0.7775561809539795, + "num_tokens": 8356315.0, + "step": 512 + }, + { + "entropy": 0.529489278793335, + "epoch": 1.9285714285714286, + "grad_norm": 0.029877884313464165, + "learning_rate": 0.0002, + "loss": 0.531100332736969, + "mean_token_accuracy": 0.7838429808616638, + "num_tokens": 8372456.0, + "step": 513 + }, + { + "entropy": 0.5389499813318253, + "epoch": 1.9323308270676691, + "grad_norm": 0.030849065631628036, + "learning_rate": 0.0002, + "loss": 0.5465497374534607, + "mean_token_accuracy": 0.7783443629741669, + "num_tokens": 8388807.0, + "step": 514 + }, + { + "entropy": 0.5628852099180222, + "epoch": 1.9360902255639099, + "grad_norm": 0.03353369981050491, + "learning_rate": 0.0002, + "loss": 0.5644093751907349, + "mean_token_accuracy": 0.7698302865028381, + "num_tokens": 8405066.0, + "step": 515 + }, + { + "entropy": 0.5497677177190781, + "epoch": 1.9398496240601504, + "grad_norm": 0.028165243566036224, + "learning_rate": 0.0002, + "loss": 0.547763466835022, + "mean_token_accuracy": 0.7773574143648148, + "num_tokens": 8421460.0, + "step": 516 + }, + { + "entropy": 0.5606269836425781, + "epoch": 1.943609022556391, + "grad_norm": 0.0319550521671772, + "learning_rate": 0.0002, + "loss": 0.5551348924636841, + "mean_token_accuracy": 0.7739223390817642, + "num_tokens": 8437784.0, + "step": 517 + }, + { + "entropy": 0.5395714491605759, + "epoch": 1.9473684210526314, + "grad_norm": 0.031290777027606964, + "learning_rate": 0.0002, + "loss": 0.5381031036376953, + "mean_token_accuracy": 0.7825980633497238, + "num_tokens": 8453854.0, + "step": 518 + }, + { + "entropy": 0.5344501882791519, + "epoch": 1.9511278195488722, + "grad_norm": 0.03777296468615532, + "learning_rate": 0.0002, + "loss": 0.5455595850944519, + "mean_token_accuracy": 0.7795031368732452, + "num_tokens": 8470272.0, + "step": 519 + }, + { + "entropy": 0.5205538719892502, + "epoch": 1.954887218045113, + "grad_norm": 0.03487836569547653, + "learning_rate": 0.0002, + "loss": 0.5330216288566589, + "mean_token_accuracy": 0.7831091731786728, + "num_tokens": 8486562.0, + "step": 520 + }, + { + "entropy": 0.5428618490695953, + "epoch": 1.9586466165413534, + "grad_norm": 0.030902346596121788, + "learning_rate": 0.0002, + "loss": 0.5495193004608154, + "mean_token_accuracy": 0.7756944447755814, + "num_tokens": 8502887.0, + "step": 521 + }, + { + "entropy": 0.544492781162262, + "epoch": 1.962406015037594, + "grad_norm": 0.03169652447104454, + "learning_rate": 0.0002, + "loss": 0.5453743934631348, + "mean_token_accuracy": 0.7783046513795853, + "num_tokens": 8519068.0, + "step": 522 + }, + { + "entropy": 0.5636335015296936, + "epoch": 1.9661654135338344, + "grad_norm": 0.03021661750972271, + "learning_rate": 0.0002, + "loss": 0.5499917268753052, + "mean_token_accuracy": 0.7781661599874496, + "num_tokens": 8535634.0, + "step": 523 + }, + { + "entropy": 0.55694779753685, + "epoch": 1.9699248120300752, + "grad_norm": 0.03414059802889824, + "learning_rate": 0.0002, + "loss": 0.5477267503738403, + "mean_token_accuracy": 0.7789023220539093, + "num_tokens": 8552014.0, + "step": 524 + }, + { + "entropy": 0.5450517237186432, + "epoch": 1.973684210526316, + "grad_norm": 0.03232225775718689, + "learning_rate": 0.0002, + "loss": 0.5392122268676758, + "mean_token_accuracy": 0.777529314160347, + "num_tokens": 8568141.0, + "step": 525 + }, + { + "entropy": 0.5509356558322906, + "epoch": 1.9774436090225564, + "grad_norm": 0.03768094256520271, + "learning_rate": 0.0002, + "loss": 0.5595051050186157, + "mean_token_accuracy": 0.7724569737911224, + "num_tokens": 8584500.0, + "step": 526 + }, + { + "entropy": 0.5301109999418259, + "epoch": 1.981203007518797, + "grad_norm": 0.033885687589645386, + "learning_rate": 0.0002, + "loss": 0.5360104441642761, + "mean_token_accuracy": 0.7817398905754089, + "num_tokens": 8600622.0, + "step": 527 + }, + { + "entropy": 0.5417920649051666, + "epoch": 1.9849624060150375, + "grad_norm": 0.035579532384872437, + "learning_rate": 0.0002, + "loss": 0.5494239926338196, + "mean_token_accuracy": 0.7785082012414932, + "num_tokens": 8616969.0, + "step": 528 + }, + { + "entropy": 0.5376323908567429, + "epoch": 1.9887218045112782, + "grad_norm": 0.0296316035091877, + "learning_rate": 0.0002, + "loss": 0.5373918414115906, + "mean_token_accuracy": 0.7816532105207443, + "num_tokens": 8633437.0, + "step": 529 + }, + { + "entropy": 0.5412444472312927, + "epoch": 1.9924812030075187, + "grad_norm": 0.03037526085972786, + "learning_rate": 0.0002, + "loss": 0.539776086807251, + "mean_token_accuracy": 0.7808452993631363, + "num_tokens": 8649560.0, + "step": 530 + }, + { + "entropy": 0.554906353354454, + "epoch": 1.9962406015037595, + "grad_norm": 0.03048609383404255, + "learning_rate": 0.0002, + "loss": 0.5531030893325806, + "mean_token_accuracy": 0.7767332792282104, + "num_tokens": 8665828.0, + "step": 531 + }, + { + "entropy": 0.5544924587011337, + "epoch": 2.0, + "grad_norm": 0.03117205761373043, + "learning_rate": 0.0002, + "loss": 0.5525693893432617, + "mean_token_accuracy": 0.775643989443779, + "num_tokens": 8682083.0, + "step": 532 + }, + { + "entropy": 0.5393226593732834, + "epoch": 2.0037593984962405, + "grad_norm": 0.034238528460264206, + "learning_rate": 0.0002, + "loss": 0.527999222278595, + "mean_token_accuracy": 0.7866329997777939, + "num_tokens": 8698342.0, + "step": 533 + }, + { + "entropy": 0.5444916188716888, + "epoch": 2.007518796992481, + "grad_norm": 0.03761903941631317, + "learning_rate": 0.0002, + "loss": 0.5434718132019043, + "mean_token_accuracy": 0.7761547416448593, + "num_tokens": 8714741.0, + "step": 534 + }, + { + "entropy": 0.5060115680098534, + "epoch": 2.011278195488722, + "grad_norm": 0.036343637853860855, + "learning_rate": 0.0002, + "loss": 0.5168589353561401, + "mean_token_accuracy": 0.7898426353931427, + "num_tokens": 8731100.0, + "step": 535 + }, + { + "entropy": 0.5210407823324203, + "epoch": 2.0150375939849625, + "grad_norm": 0.04487035050988197, + "learning_rate": 0.0002, + "loss": 0.5338425040245056, + "mean_token_accuracy": 0.783848226070404, + "num_tokens": 8747374.0, + "step": 536 + }, + { + "entropy": 0.5411355942487717, + "epoch": 2.018796992481203, + "grad_norm": 0.030216895043849945, + "learning_rate": 0.0002, + "loss": 0.5343786478042603, + "mean_token_accuracy": 0.785404697060585, + "num_tokens": 8763878.0, + "step": 537 + }, + { + "entropy": 0.5372739881277084, + "epoch": 2.0225563909774436, + "grad_norm": 0.028337521478533745, + "learning_rate": 0.0002, + "loss": 0.5299405455589294, + "mean_token_accuracy": 0.7845199257135391, + "num_tokens": 8780220.0, + "step": 538 + }, + { + "entropy": 0.5464906841516495, + "epoch": 2.026315789473684, + "grad_norm": 0.036913856863975525, + "learning_rate": 0.0002, + "loss": 0.5415371656417847, + "mean_token_accuracy": 0.7804137766361237, + "num_tokens": 8796472.0, + "step": 539 + }, + { + "entropy": 0.5379135385155678, + "epoch": 2.030075187969925, + "grad_norm": 0.03262462466955185, + "learning_rate": 0.0002, + "loss": 0.5289930701255798, + "mean_token_accuracy": 0.7824063748121262, + "num_tokens": 8812711.0, + "step": 540 + }, + { + "entropy": 0.5565919727087021, + "epoch": 2.0338345864661656, + "grad_norm": 0.04293256625533104, + "learning_rate": 0.0002, + "loss": 0.5547116994857788, + "mean_token_accuracy": 0.7729399651288986, + "num_tokens": 8829053.0, + "step": 541 + }, + { + "entropy": 0.5241617634892464, + "epoch": 2.037593984962406, + "grad_norm": 0.038099389523267746, + "learning_rate": 0.0002, + "loss": 0.5281400084495544, + "mean_token_accuracy": 0.7854866534471512, + "num_tokens": 8845272.0, + "step": 542 + }, + { + "entropy": 0.5125209540128708, + "epoch": 2.0413533834586466, + "grad_norm": 0.0444987453520298, + "learning_rate": 0.0002, + "loss": 0.5245556235313416, + "mean_token_accuracy": 0.7865463197231293, + "num_tokens": 8861604.0, + "step": 543 + }, + { + "entropy": 0.5151898711919785, + "epoch": 2.045112781954887, + "grad_norm": 0.03733397275209427, + "learning_rate": 0.0002, + "loss": 0.5251218676567078, + "mean_token_accuracy": 0.7850091606378555, + "num_tokens": 8878258.0, + "step": 544 + }, + { + "entropy": 0.5284005552530289, + "epoch": 2.0488721804511276, + "grad_norm": 0.03852412849664688, + "learning_rate": 0.0002, + "loss": 0.5298153758049011, + "mean_token_accuracy": 0.7847720235586166, + "num_tokens": 8894539.0, + "step": 545 + }, + { + "entropy": 0.54307721555233, + "epoch": 2.0526315789473686, + "grad_norm": 0.033771906048059464, + "learning_rate": 0.0002, + "loss": 0.5370909571647644, + "mean_token_accuracy": 0.7825237512588501, + "num_tokens": 8910872.0, + "step": 546 + }, + { + "entropy": 0.5492400974035263, + "epoch": 2.056390977443609, + "grad_norm": 0.03574720397591591, + "learning_rate": 0.0002, + "loss": 0.5408341884613037, + "mean_token_accuracy": 0.778035119175911, + "num_tokens": 8927218.0, + "step": 547 + }, + { + "entropy": 0.5240911245346069, + "epoch": 2.0601503759398496, + "grad_norm": 0.02964242920279503, + "learning_rate": 0.0002, + "loss": 0.5206458568572998, + "mean_token_accuracy": 0.7880397886037827, + "num_tokens": 8943483.0, + "step": 548 + }, + { + "entropy": 0.5402092635631561, + "epoch": 2.06390977443609, + "grad_norm": 0.030025213956832886, + "learning_rate": 0.0002, + "loss": 0.5365015864372253, + "mean_token_accuracy": 0.7826483398675919, + "num_tokens": 8959806.0, + "step": 549 + }, + { + "entropy": 0.5332436561584473, + "epoch": 2.0676691729323307, + "grad_norm": 0.04115639254450798, + "learning_rate": 0.0002, + "loss": 0.5445111393928528, + "mean_token_accuracy": 0.7822862267494202, + "num_tokens": 8976089.0, + "step": 550 + }, + { + "entropy": 0.5036703869700432, + "epoch": 2.0714285714285716, + "grad_norm": 0.04966175556182861, + "learning_rate": 0.0002, + "loss": 0.5189836025238037, + "mean_token_accuracy": 0.7873758524656296, + "num_tokens": 8992377.0, + "step": 551 + }, + { + "entropy": 0.5350762009620667, + "epoch": 2.075187969924812, + "grad_norm": 0.03549731895327568, + "learning_rate": 0.0002, + "loss": 0.5327733755111694, + "mean_token_accuracy": 0.7879746407270432, + "num_tokens": 9008811.0, + "step": 552 + }, + { + "entropy": 0.5646320134401321, + "epoch": 2.0789473684210527, + "grad_norm": 0.03737547621130943, + "learning_rate": 0.0002, + "loss": 0.5554011464118958, + "mean_token_accuracy": 0.7747785001993179, + "num_tokens": 9025308.0, + "step": 553 + }, + { + "entropy": 0.5232708752155304, + "epoch": 2.082706766917293, + "grad_norm": 0.0358981154859066, + "learning_rate": 0.0002, + "loss": 0.5174283385276794, + "mean_token_accuracy": 0.790026530623436, + "num_tokens": 9041525.0, + "step": 554 + }, + { + "entropy": 0.5285665988922119, + "epoch": 2.0864661654135337, + "grad_norm": 0.03469764441251755, + "learning_rate": 0.0002, + "loss": 0.5286591649055481, + "mean_token_accuracy": 0.7858238369226456, + "num_tokens": 9058016.0, + "step": 555 + }, + { + "entropy": 0.5281644910573959, + "epoch": 2.090225563909774, + "grad_norm": 0.0453813299536705, + "learning_rate": 0.0002, + "loss": 0.5388556718826294, + "mean_token_accuracy": 0.7807898968458176, + "num_tokens": 9074200.0, + "step": 556 + }, + { + "entropy": 0.5271690487861633, + "epoch": 2.093984962406015, + "grad_norm": 0.032550517469644547, + "learning_rate": 0.0002, + "loss": 0.5312079787254333, + "mean_token_accuracy": 0.7843631505966187, + "num_tokens": 9090441.0, + "step": 557 + }, + { + "entropy": 0.5335165411233902, + "epoch": 2.0977443609022557, + "grad_norm": 0.045913904905319214, + "learning_rate": 0.0002, + "loss": 0.5417532324790955, + "mean_token_accuracy": 0.7792288213968277, + "num_tokens": 9106701.0, + "step": 558 + }, + { + "entropy": 0.5311940237879753, + "epoch": 2.101503759398496, + "grad_norm": 0.03551177680492401, + "learning_rate": 0.0002, + "loss": 0.5270295143127441, + "mean_token_accuracy": 0.7884976118803024, + "num_tokens": 9122828.0, + "step": 559 + }, + { + "entropy": 0.5543871223926544, + "epoch": 2.1052631578947367, + "grad_norm": 0.04049575328826904, + "learning_rate": 0.0002, + "loss": 0.5416486859321594, + "mean_token_accuracy": 0.7811383605003357, + "num_tokens": 9139283.0, + "step": 560 + }, + { + "entropy": 0.5340919494628906, + "epoch": 2.1090225563909772, + "grad_norm": 0.039224181324243546, + "learning_rate": 0.0002, + "loss": 0.5327409505844116, + "mean_token_accuracy": 0.7838027775287628, + "num_tokens": 9155474.0, + "step": 561 + }, + { + "entropy": 0.5298718512058258, + "epoch": 2.112781954887218, + "grad_norm": 0.05099140852689743, + "learning_rate": 0.0002, + "loss": 0.5340836644172668, + "mean_token_accuracy": 0.783194437623024, + "num_tokens": 9171817.0, + "step": 562 + }, + { + "entropy": 0.5186150521039963, + "epoch": 2.1165413533834587, + "grad_norm": 0.03965724632143974, + "learning_rate": 0.0002, + "loss": 0.5235821604728699, + "mean_token_accuracy": 0.7888422161340714, + "num_tokens": 9188257.0, + "step": 563 + }, + { + "entropy": 0.5331820994615555, + "epoch": 2.1203007518796992, + "grad_norm": 0.04237478971481323, + "learning_rate": 0.0002, + "loss": 0.5393993258476257, + "mean_token_accuracy": 0.7827252298593521, + "num_tokens": 9204541.0, + "step": 564 + }, + { + "entropy": 0.540572926402092, + "epoch": 2.1240601503759398, + "grad_norm": 0.04164816811680794, + "learning_rate": 0.0002, + "loss": 0.5408675670623779, + "mean_token_accuracy": 0.7807533591985703, + "num_tokens": 9220820.0, + "step": 565 + }, + { + "entropy": 0.5385376363992691, + "epoch": 2.1278195488721803, + "grad_norm": 0.036260150372982025, + "learning_rate": 0.0002, + "loss": 0.5364916324615479, + "mean_token_accuracy": 0.7820783704519272, + "num_tokens": 9237023.0, + "step": 566 + }, + { + "entropy": 0.5336015373468399, + "epoch": 2.1315789473684212, + "grad_norm": 0.037857089191675186, + "learning_rate": 0.0002, + "loss": 0.5315621495246887, + "mean_token_accuracy": 0.785429060459137, + "num_tokens": 9253551.0, + "step": 567 + }, + { + "entropy": 0.5323529243469238, + "epoch": 2.1353383458646618, + "grad_norm": 0.037011366337537766, + "learning_rate": 0.0002, + "loss": 0.5320927500724792, + "mean_token_accuracy": 0.7860363125801086, + "num_tokens": 9270061.0, + "step": 568 + }, + { + "entropy": 0.5342943072319031, + "epoch": 2.1390977443609023, + "grad_norm": 0.04501970484852791, + "learning_rate": 0.0002, + "loss": 0.541400134563446, + "mean_token_accuracy": 0.7824247628450394, + "num_tokens": 9286644.0, + "step": 569 + }, + { + "entropy": 0.5125101208686829, + "epoch": 2.142857142857143, + "grad_norm": 0.03982450067996979, + "learning_rate": 0.0002, + "loss": 0.5186954736709595, + "mean_token_accuracy": 0.7895647883415222, + "num_tokens": 9302779.0, + "step": 570 + }, + { + "entropy": 0.5302434861660004, + "epoch": 2.1466165413533833, + "grad_norm": 0.04483801871538162, + "learning_rate": 0.0002, + "loss": 0.5331039428710938, + "mean_token_accuracy": 0.7822313755750656, + "num_tokens": 9318908.0, + "step": 571 + }, + { + "entropy": 0.541576087474823, + "epoch": 2.1503759398496243, + "grad_norm": 0.04227382317185402, + "learning_rate": 0.0002, + "loss": 0.5322229862213135, + "mean_token_accuracy": 0.7839206904172897, + "num_tokens": 9335280.0, + "step": 572 + }, + { + "entropy": 0.5349045842885971, + "epoch": 2.154135338345865, + "grad_norm": 0.039713822305202484, + "learning_rate": 0.0002, + "loss": 0.5306118726730347, + "mean_token_accuracy": 0.7863682806491852, + "num_tokens": 9351717.0, + "step": 573 + }, + { + "entropy": 0.538109079003334, + "epoch": 2.1578947368421053, + "grad_norm": 0.043392788618803024, + "learning_rate": 0.0002, + "loss": 0.5441777110099792, + "mean_token_accuracy": 0.7800941169261932, + "num_tokens": 9367925.0, + "step": 574 + }, + { + "entropy": 0.543743833899498, + "epoch": 2.161654135338346, + "grad_norm": 0.036299366503953934, + "learning_rate": 0.0002, + "loss": 0.5443440675735474, + "mean_token_accuracy": 0.7788920700550079, + "num_tokens": 9384356.0, + "step": 575 + }, + { + "entropy": 0.5299166440963745, + "epoch": 2.1654135338345863, + "grad_norm": 0.04222200810909271, + "learning_rate": 0.0002, + "loss": 0.5267676711082458, + "mean_token_accuracy": 0.7834489941596985, + "num_tokens": 9400653.0, + "step": 576 + }, + { + "entropy": 0.5201265513896942, + "epoch": 2.169172932330827, + "grad_norm": 0.034343086183071136, + "learning_rate": 0.0002, + "loss": 0.5234291553497314, + "mean_token_accuracy": 0.7866221219301224, + "num_tokens": 9416889.0, + "step": 577 + }, + { + "entropy": 0.5227823704481125, + "epoch": 2.172932330827068, + "grad_norm": 0.05559639260172844, + "learning_rate": 0.0002, + "loss": 0.5304789543151855, + "mean_token_accuracy": 0.7860793620347977, + "num_tokens": 9433083.0, + "step": 578 + }, + { + "entropy": 0.5409391671419144, + "epoch": 2.1766917293233083, + "grad_norm": 0.03534764051437378, + "learning_rate": 0.0002, + "loss": 0.5437344908714294, + "mean_token_accuracy": 0.7797643393278122, + "num_tokens": 9449666.0, + "step": 579 + }, + { + "entropy": 0.5353062897920609, + "epoch": 2.180451127819549, + "grad_norm": 0.0366806834936142, + "learning_rate": 0.0002, + "loss": 0.5361766815185547, + "mean_token_accuracy": 0.7838302254676819, + "num_tokens": 9465971.0, + "step": 580 + }, + { + "entropy": 0.5455628782510757, + "epoch": 2.1842105263157894, + "grad_norm": 0.04078822582960129, + "learning_rate": 0.0002, + "loss": 0.5446187257766724, + "mean_token_accuracy": 0.7786186188459396, + "num_tokens": 9482331.0, + "step": 581 + }, + { + "entropy": 0.5441193133592606, + "epoch": 2.18796992481203, + "grad_norm": 0.03562629595398903, + "learning_rate": 0.0002, + "loss": 0.538811981678009, + "mean_token_accuracy": 0.7832597941160202, + "num_tokens": 9498498.0, + "step": 582 + }, + { + "entropy": 0.519161731004715, + "epoch": 2.191729323308271, + "grad_norm": 0.04350278899073601, + "learning_rate": 0.0002, + "loss": 0.5223026275634766, + "mean_token_accuracy": 0.7909857630729675, + "num_tokens": 9514937.0, + "step": 583 + }, + { + "entropy": 0.5520303696393967, + "epoch": 2.1954887218045114, + "grad_norm": 0.04176495969295502, + "learning_rate": 0.0002, + "loss": 0.5509821772575378, + "mean_token_accuracy": 0.7763593196868896, + "num_tokens": 9531256.0, + "step": 584 + }, + { + "entropy": 0.5262609422206879, + "epoch": 2.199248120300752, + "grad_norm": 0.07633325457572937, + "learning_rate": 0.0002, + "loss": 0.5259430408477783, + "mean_token_accuracy": 0.7863292992115021, + "num_tokens": 9547509.0, + "step": 585 + }, + { + "entropy": 0.53122878074646, + "epoch": 2.2030075187969924, + "grad_norm": 0.04210652410984039, + "learning_rate": 0.0002, + "loss": 0.531125545501709, + "mean_token_accuracy": 0.7854439616203308, + "num_tokens": 9563675.0, + "step": 586 + }, + { + "entropy": 0.5309283137321472, + "epoch": 2.206766917293233, + "grad_norm": 0.042596347630023956, + "learning_rate": 0.0002, + "loss": 0.5361312627792358, + "mean_token_accuracy": 0.7840573638677597, + "num_tokens": 9580247.0, + "step": 587 + }, + { + "entropy": 0.523199625313282, + "epoch": 2.2105263157894735, + "grad_norm": 0.06264178454875946, + "learning_rate": 0.0002, + "loss": 0.5371831655502319, + "mean_token_accuracy": 0.7846156805753708, + "num_tokens": 9596084.0, + "step": 588 + }, + { + "entropy": 0.5497414767742157, + "epoch": 2.2142857142857144, + "grad_norm": 0.049970485270023346, + "learning_rate": 0.0002, + "loss": 0.5482587218284607, + "mean_token_accuracy": 0.7772606760263443, + "num_tokens": 9612439.0, + "step": 589 + }, + { + "entropy": 0.5475651770830154, + "epoch": 2.218045112781955, + "grad_norm": 0.047052860260009766, + "learning_rate": 0.0002, + "loss": 0.5382542610168457, + "mean_token_accuracy": 0.7837767452001572, + "num_tokens": 9628574.0, + "step": 590 + }, + { + "entropy": 0.5442479848861694, + "epoch": 2.2218045112781954, + "grad_norm": 0.03252498432993889, + "learning_rate": 0.0002, + "loss": 0.5315850973129272, + "mean_token_accuracy": 0.7825820297002792, + "num_tokens": 9644837.0, + "step": 591 + }, + { + "entropy": 0.5471898764371872, + "epoch": 2.225563909774436, + "grad_norm": 0.048182275146245956, + "learning_rate": 0.0002, + "loss": 0.5472801923751831, + "mean_token_accuracy": 0.776175931096077, + "num_tokens": 9661070.0, + "step": 592 + }, + { + "entropy": 0.5267005264759064, + "epoch": 2.2293233082706765, + "grad_norm": 0.04179242253303528, + "learning_rate": 0.0002, + "loss": 0.5309768319129944, + "mean_token_accuracy": 0.7826364785432816, + "num_tokens": 9677378.0, + "step": 593 + }, + { + "entropy": 0.5416758507490158, + "epoch": 2.2330827067669174, + "grad_norm": 0.04981589689850807, + "learning_rate": 0.0002, + "loss": 0.549900472164154, + "mean_token_accuracy": 0.7765727639198303, + "num_tokens": 9693819.0, + "step": 594 + }, + { + "entropy": 0.5369458198547363, + "epoch": 2.236842105263158, + "grad_norm": 0.051439523696899414, + "learning_rate": 0.0002, + "loss": 0.5440854430198669, + "mean_token_accuracy": 0.7789760231971741, + "num_tokens": 9710189.0, + "step": 595 + }, + { + "entropy": 0.5342868715524673, + "epoch": 2.2406015037593985, + "grad_norm": 0.04235680773854256, + "learning_rate": 0.0002, + "loss": 0.5430835485458374, + "mean_token_accuracy": 0.7785050868988037, + "num_tokens": 9726526.0, + "step": 596 + }, + { + "entropy": 0.5481905192136765, + "epoch": 2.244360902255639, + "grad_norm": 0.044252388179302216, + "learning_rate": 0.0002, + "loss": 0.5456714034080505, + "mean_token_accuracy": 0.7800015658140182, + "num_tokens": 9742892.0, + "step": 597 + }, + { + "entropy": 0.5490403324365616, + "epoch": 2.2481203007518795, + "grad_norm": 0.036522816866636276, + "learning_rate": 0.0002, + "loss": 0.5348387956619263, + "mean_token_accuracy": 0.7838009893894196, + "num_tokens": 9759316.0, + "step": 598 + }, + { + "entropy": 0.5373188108205795, + "epoch": 2.2518796992481205, + "grad_norm": 0.0484786219894886, + "learning_rate": 0.0002, + "loss": 0.5393818616867065, + "mean_token_accuracy": 0.7799521684646606, + "num_tokens": 9775422.0, + "step": 599 + }, + { + "entropy": 0.5350137799978256, + "epoch": 2.255639097744361, + "grad_norm": 0.03971916437149048, + "learning_rate": 0.0002, + "loss": 0.5390014052391052, + "mean_token_accuracy": 0.7825258076190948, + "num_tokens": 9791645.0, + "step": 600 + }, + { + "entropy": 0.529654249548912, + "epoch": 2.2593984962406015, + "grad_norm": 0.03677717223763466, + "learning_rate": 0.0002, + "loss": 0.5347926020622253, + "mean_token_accuracy": 0.7820286452770233, + "num_tokens": 9807863.0, + "step": 601 + }, + { + "entropy": 0.5160931199789047, + "epoch": 2.263157894736842, + "grad_norm": 0.04103193059563637, + "learning_rate": 0.0002, + "loss": 0.5219160914421082, + "mean_token_accuracy": 0.7898968160152435, + "num_tokens": 9823834.0, + "step": 602 + }, + { + "entropy": 0.547026053071022, + "epoch": 2.2669172932330826, + "grad_norm": 0.035431135445833206, + "learning_rate": 0.0002, + "loss": 0.5403215289115906, + "mean_token_accuracy": 0.7804599404335022, + "num_tokens": 9840527.0, + "step": 603 + }, + { + "entropy": 0.5330915451049805, + "epoch": 2.2706766917293235, + "grad_norm": 0.03688134625554085, + "learning_rate": 0.0002, + "loss": 0.5308654308319092, + "mean_token_accuracy": 0.7851675152778625, + "num_tokens": 9856677.0, + "step": 604 + }, + { + "entropy": 0.5384332090616226, + "epoch": 2.274436090225564, + "grad_norm": 0.04168199747800827, + "learning_rate": 0.0002, + "loss": 0.5318323373794556, + "mean_token_accuracy": 0.7833025008440018, + "num_tokens": 9872958.0, + "step": 605 + }, + { + "entropy": 0.5483455657958984, + "epoch": 2.2781954887218046, + "grad_norm": 0.0458533950150013, + "learning_rate": 0.0002, + "loss": 0.5497722625732422, + "mean_token_accuracy": 0.7783730030059814, + "num_tokens": 9889301.0, + "step": 606 + }, + { + "entropy": 0.5242274850606918, + "epoch": 2.281954887218045, + "grad_norm": 0.03992198407649994, + "learning_rate": 0.0002, + "loss": 0.5323127508163452, + "mean_token_accuracy": 0.7856701463460922, + "num_tokens": 9905738.0, + "step": 607 + }, + { + "entropy": 0.5306910574436188, + "epoch": 2.2857142857142856, + "grad_norm": 0.03714906424283981, + "learning_rate": 0.0002, + "loss": 0.5334057807922363, + "mean_token_accuracy": 0.7845153957605362, + "num_tokens": 9922153.0, + "step": 608 + }, + { + "entropy": 0.5255761742591858, + "epoch": 2.2894736842105265, + "grad_norm": 0.037783432751894, + "learning_rate": 0.0002, + "loss": 0.5267370343208313, + "mean_token_accuracy": 0.7860815078020096, + "num_tokens": 9938520.0, + "step": 609 + }, + { + "entropy": 0.528737261891365, + "epoch": 2.293233082706767, + "grad_norm": 0.03467050567269325, + "learning_rate": 0.0002, + "loss": 0.5269864797592163, + "mean_token_accuracy": 0.789274126291275, + "num_tokens": 9954806.0, + "step": 610 + }, + { + "entropy": 0.5392419397830963, + "epoch": 2.2969924812030076, + "grad_norm": 0.03630411997437477, + "learning_rate": 0.0002, + "loss": 0.5344975590705872, + "mean_token_accuracy": 0.7834292352199554, + "num_tokens": 9971123.0, + "step": 611 + }, + { + "entropy": 0.5148891359567642, + "epoch": 2.300751879699248, + "grad_norm": 0.03637854382395744, + "learning_rate": 0.0002, + "loss": 0.5145090222358704, + "mean_token_accuracy": 0.7894360274076462, + "num_tokens": 9987229.0, + "step": 612 + }, + { + "entropy": 0.538021132349968, + "epoch": 2.3045112781954886, + "grad_norm": 0.03751857578754425, + "learning_rate": 0.0002, + "loss": 0.541398286819458, + "mean_token_accuracy": 0.7807863056659698, + "num_tokens": 10003519.0, + "step": 613 + }, + { + "entropy": 0.5272123515605927, + "epoch": 2.308270676691729, + "grad_norm": 0.04051438719034195, + "learning_rate": 0.0002, + "loss": 0.5344090461730957, + "mean_token_accuracy": 0.7857641130685806, + "num_tokens": 10019993.0, + "step": 614 + }, + { + "entropy": 0.5179824233055115, + "epoch": 2.31203007518797, + "grad_norm": 0.04479973390698433, + "learning_rate": 0.0002, + "loss": 0.5279502272605896, + "mean_token_accuracy": 0.7859090268611908, + "num_tokens": 10036196.0, + "step": 615 + }, + { + "entropy": 0.5467290729284286, + "epoch": 2.3157894736842106, + "grad_norm": 0.03927797079086304, + "learning_rate": 0.0002, + "loss": 0.5486882328987122, + "mean_token_accuracy": 0.7768010795116425, + "num_tokens": 10052474.0, + "step": 616 + }, + { + "entropy": 0.5408567190170288, + "epoch": 2.319548872180451, + "grad_norm": 0.03986404091119766, + "learning_rate": 0.0002, + "loss": 0.5317103862762451, + "mean_token_accuracy": 0.7851662039756775, + "num_tokens": 10068775.0, + "step": 617 + }, + { + "entropy": 0.5392286479473114, + "epoch": 2.3233082706766917, + "grad_norm": 0.03838985413312912, + "learning_rate": 0.0002, + "loss": 0.530458927154541, + "mean_token_accuracy": 0.7848429083824158, + "num_tokens": 10084946.0, + "step": 618 + }, + { + "entropy": 0.5223991498351097, + "epoch": 2.327067669172932, + "grad_norm": 0.03357016295194626, + "learning_rate": 0.0002, + "loss": 0.5164550542831421, + "mean_token_accuracy": 0.7903633117675781, + "num_tokens": 10101221.0, + "step": 619 + }, + { + "entropy": 0.5287820845842361, + "epoch": 2.3308270676691727, + "grad_norm": 0.041184201836586, + "learning_rate": 0.0002, + "loss": 0.5312986373901367, + "mean_token_accuracy": 0.7844579666852951, + "num_tokens": 10117440.0, + "step": 620 + }, + { + "entropy": 0.5136409252882004, + "epoch": 2.3345864661654137, + "grad_norm": 0.044375885277986526, + "learning_rate": 0.0002, + "loss": 0.5256669521331787, + "mean_token_accuracy": 0.7870495319366455, + "num_tokens": 10133537.0, + "step": 621 + }, + { + "entropy": 0.5296864807605743, + "epoch": 2.338345864661654, + "grad_norm": 0.043142594397068024, + "learning_rate": 0.0002, + "loss": 0.5372653007507324, + "mean_token_accuracy": 0.7797198593616486, + "num_tokens": 10149832.0, + "step": 622 + }, + { + "entropy": 0.5296363830566406, + "epoch": 2.3421052631578947, + "grad_norm": 0.04168247431516647, + "learning_rate": 0.0002, + "loss": 0.5342837572097778, + "mean_token_accuracy": 0.7827459424734116, + "num_tokens": 10166206.0, + "step": 623 + }, + { + "entropy": 0.5279521271586418, + "epoch": 2.345864661654135, + "grad_norm": 0.03668156638741493, + "learning_rate": 0.0002, + "loss": 0.5243417024612427, + "mean_token_accuracy": 0.7867815494537354, + "num_tokens": 10182574.0, + "step": 624 + }, + { + "entropy": 0.5396132320165634, + "epoch": 2.3496240601503757, + "grad_norm": 0.040590520948171616, + "learning_rate": 0.0002, + "loss": 0.534129798412323, + "mean_token_accuracy": 0.7840494364500046, + "num_tokens": 10198963.0, + "step": 625 + }, + { + "entropy": 0.5384691059589386, + "epoch": 2.3533834586466167, + "grad_norm": 0.03799832612276077, + "learning_rate": 0.0002, + "loss": 0.5275224447250366, + "mean_token_accuracy": 0.788055807352066, + "num_tokens": 10215363.0, + "step": 626 + }, + { + "entropy": 0.5355971157550812, + "epoch": 2.357142857142857, + "grad_norm": 0.03812744468450546, + "learning_rate": 0.0002, + "loss": 0.5373313426971436, + "mean_token_accuracy": 0.7830821126699448, + "num_tokens": 10231721.0, + "step": 627 + }, + { + "entropy": 0.5379942953586578, + "epoch": 2.3609022556390977, + "grad_norm": 0.04219618812203407, + "learning_rate": 0.0002, + "loss": 0.5430394411087036, + "mean_token_accuracy": 0.779607817530632, + "num_tokens": 10248150.0, + "step": 628 + }, + { + "entropy": 0.5369090437889099, + "epoch": 2.3646616541353382, + "grad_norm": 0.04251544550061226, + "learning_rate": 0.0002, + "loss": 0.5445953011512756, + "mean_token_accuracy": 0.778522789478302, + "num_tokens": 10264414.0, + "step": 629 + }, + { + "entropy": 0.5455975085496902, + "epoch": 2.3684210526315788, + "grad_norm": 0.04128441959619522, + "learning_rate": 0.0002, + "loss": 0.5464663505554199, + "mean_token_accuracy": 0.7782220393419266, + "num_tokens": 10280655.0, + "step": 630 + }, + { + "entropy": 0.5499599725008011, + "epoch": 2.3721804511278197, + "grad_norm": 0.0386635959148407, + "learning_rate": 0.0002, + "loss": 0.542563259601593, + "mean_token_accuracy": 0.7798319011926651, + "num_tokens": 10297357.0, + "step": 631 + }, + { + "entropy": 0.5534010380506516, + "epoch": 2.3759398496240602, + "grad_norm": 0.040974393486976624, + "learning_rate": 0.0002, + "loss": 0.5562258362770081, + "mean_token_accuracy": 0.7761926651000977, + "num_tokens": 10313788.0, + "step": 632 + }, + { + "entropy": 0.5357997566461563, + "epoch": 2.3796992481203008, + "grad_norm": 0.03751135990023613, + "learning_rate": 0.0002, + "loss": 0.5311724543571472, + "mean_token_accuracy": 0.7860594242811203, + "num_tokens": 10330164.0, + "step": 633 + }, + { + "entropy": 0.5399480760097504, + "epoch": 2.3834586466165413, + "grad_norm": 0.0392535962164402, + "learning_rate": 0.0002, + "loss": 0.5405341982841492, + "mean_token_accuracy": 0.782960519194603, + "num_tokens": 10346587.0, + "step": 634 + }, + { + "entropy": 0.5351511463522911, + "epoch": 2.387218045112782, + "grad_norm": 0.04137985408306122, + "learning_rate": 0.0002, + "loss": 0.5435580611228943, + "mean_token_accuracy": 0.7791251242160797, + "num_tokens": 10362964.0, + "step": 635 + }, + { + "entropy": 0.5337197929620743, + "epoch": 2.3909774436090228, + "grad_norm": 0.04529615119099617, + "learning_rate": 0.0002, + "loss": 0.54475998878479, + "mean_token_accuracy": 0.7794527411460876, + "num_tokens": 10379194.0, + "step": 636 + }, + { + "entropy": 0.5295632779598236, + "epoch": 2.3947368421052633, + "grad_norm": 0.03818366676568985, + "learning_rate": 0.0002, + "loss": 0.53121417760849, + "mean_token_accuracy": 0.7843088060617447, + "num_tokens": 10395289.0, + "step": 637 + }, + { + "entropy": 0.5338181853294373, + "epoch": 2.398496240601504, + "grad_norm": 0.04155934602022171, + "learning_rate": 0.0002, + "loss": 0.5273146033287048, + "mean_token_accuracy": 0.7871305495500565, + "num_tokens": 10411478.0, + "step": 638 + }, + { + "entropy": 0.5275490283966064, + "epoch": 2.4022556390977443, + "grad_norm": 0.03884044289588928, + "learning_rate": 0.0002, + "loss": 0.5259033441543579, + "mean_token_accuracy": 0.7865510582923889, + "num_tokens": 10428000.0, + "step": 639 + }, + { + "entropy": 0.5296481549739838, + "epoch": 2.406015037593985, + "grad_norm": 0.03892350569367409, + "learning_rate": 0.0002, + "loss": 0.5338611602783203, + "mean_token_accuracy": 0.7841958701610565, + "num_tokens": 10444531.0, + "step": 640 + }, + { + "entropy": 0.5326656997203827, + "epoch": 2.409774436090226, + "grad_norm": 0.04130466282367706, + "learning_rate": 0.0002, + "loss": 0.5334239602088928, + "mean_token_accuracy": 0.7844693660736084, + "num_tokens": 10460884.0, + "step": 641 + }, + { + "entropy": 0.5167141184210777, + "epoch": 2.4135338345864663, + "grad_norm": 0.04298912361264229, + "learning_rate": 0.0002, + "loss": 0.5224160552024841, + "mean_token_accuracy": 0.790846198797226, + "num_tokens": 10476946.0, + "step": 642 + }, + { + "entropy": 0.5394491106271744, + "epoch": 2.417293233082707, + "grad_norm": 0.0389692522585392, + "learning_rate": 0.0002, + "loss": 0.5456172823905945, + "mean_token_accuracy": 0.7784712016582489, + "num_tokens": 10493157.0, + "step": 643 + }, + { + "entropy": 0.5317131578922272, + "epoch": 2.4210526315789473, + "grad_norm": 0.03282848745584488, + "learning_rate": 0.0002, + "loss": 0.5272088050842285, + "mean_token_accuracy": 0.7835191786289215, + "num_tokens": 10509339.0, + "step": 644 + }, + { + "entropy": 0.5249821543693542, + "epoch": 2.424812030075188, + "grad_norm": 0.03486508131027222, + "learning_rate": 0.0002, + "loss": 0.5219942927360535, + "mean_token_accuracy": 0.787269338965416, + "num_tokens": 10525556.0, + "step": 645 + }, + { + "entropy": 0.5392860472202301, + "epoch": 2.4285714285714284, + "grad_norm": 0.03448896110057831, + "learning_rate": 0.0002, + "loss": 0.5338496565818787, + "mean_token_accuracy": 0.7829862833023071, + "num_tokens": 10541761.0, + "step": 646 + }, + { + "entropy": 0.5386904329061508, + "epoch": 2.4323308270676693, + "grad_norm": 0.037768758833408356, + "learning_rate": 0.0002, + "loss": 0.5425961017608643, + "mean_token_accuracy": 0.7781831622123718, + "num_tokens": 10558311.0, + "step": 647 + }, + { + "entropy": 0.5251231044530869, + "epoch": 2.43609022556391, + "grad_norm": 0.03807547688484192, + "learning_rate": 0.0002, + "loss": 0.5291208624839783, + "mean_token_accuracy": 0.783474326133728, + "num_tokens": 10574696.0, + "step": 648 + }, + { + "entropy": 0.5356583297252655, + "epoch": 2.4398496240601504, + "grad_norm": 0.03421357646584511, + "learning_rate": 0.0002, + "loss": 0.5309426188468933, + "mean_token_accuracy": 0.7826003879308701, + "num_tokens": 10591225.0, + "step": 649 + }, + { + "entropy": 0.5321584492921829, + "epoch": 2.443609022556391, + "grad_norm": 0.04219021648168564, + "learning_rate": 0.0002, + "loss": 0.5343624353408813, + "mean_token_accuracy": 0.7819913923740387, + "num_tokens": 10607648.0, + "step": 650 + }, + { + "entropy": 0.5409150719642639, + "epoch": 2.4473684210526314, + "grad_norm": 0.039848409593105316, + "learning_rate": 0.0002, + "loss": 0.5406517386436462, + "mean_token_accuracy": 0.7809206694364548, + "num_tokens": 10623965.0, + "step": 651 + }, + { + "entropy": 0.5184071511030197, + "epoch": 2.451127819548872, + "grad_norm": 0.04401297867298126, + "learning_rate": 0.0002, + "loss": 0.5264937877655029, + "mean_token_accuracy": 0.7875054776668549, + "num_tokens": 10640111.0, + "step": 652 + }, + { + "entropy": 0.5153327658772469, + "epoch": 2.454887218045113, + "grad_norm": 0.037109002470970154, + "learning_rate": 0.0002, + "loss": 0.5220255255699158, + "mean_token_accuracy": 0.7878341674804688, + "num_tokens": 10656391.0, + "step": 653 + }, + { + "entropy": 0.534611888229847, + "epoch": 2.4586466165413534, + "grad_norm": 0.047087740153074265, + "learning_rate": 0.0002, + "loss": 0.5327281951904297, + "mean_token_accuracy": 0.7858874797821045, + "num_tokens": 10672550.0, + "step": 654 + }, + { + "entropy": 0.5468750447034836, + "epoch": 2.462406015037594, + "grad_norm": 0.03793250396847725, + "learning_rate": 0.0002, + "loss": 0.5467609167098999, + "mean_token_accuracy": 0.7752472460269928, + "num_tokens": 10688678.0, + "step": 655 + }, + { + "entropy": 0.5618661195039749, + "epoch": 2.4661654135338344, + "grad_norm": 0.043232064694166183, + "learning_rate": 0.0002, + "loss": 0.557094395160675, + "mean_token_accuracy": 0.7767215073108673, + "num_tokens": 10705231.0, + "step": 656 + }, + { + "entropy": 0.5481238514184952, + "epoch": 2.469924812030075, + "grad_norm": 0.04276246577501297, + "learning_rate": 0.0002, + "loss": 0.5488662719726562, + "mean_token_accuracy": 0.780038595199585, + "num_tokens": 10721712.0, + "step": 657 + }, + { + "entropy": 0.5505738407373428, + "epoch": 2.473684210526316, + "grad_norm": 0.040987517684698105, + "learning_rate": 0.0002, + "loss": 0.5510429739952087, + "mean_token_accuracy": 0.7774406224489212, + "num_tokens": 10737970.0, + "step": 658 + }, + { + "entropy": 0.5473013371229172, + "epoch": 2.4774436090225564, + "grad_norm": 0.051042236387729645, + "learning_rate": 0.0002, + "loss": 0.5507328510284424, + "mean_token_accuracy": 0.7794748395681381, + "num_tokens": 10754101.0, + "step": 659 + }, + { + "entropy": 0.5286405235528946, + "epoch": 2.481203007518797, + "grad_norm": 0.04263005033135414, + "learning_rate": 0.0002, + "loss": 0.5302000045776367, + "mean_token_accuracy": 0.7844719737768173, + "num_tokens": 10770357.0, + "step": 660 + }, + { + "entropy": 0.5383267849683762, + "epoch": 2.4849624060150375, + "grad_norm": 0.03854911029338837, + "learning_rate": 0.0002, + "loss": 0.54207444190979, + "mean_token_accuracy": 0.7791945487260818, + "num_tokens": 10786804.0, + "step": 661 + }, + { + "entropy": 0.5230704694986343, + "epoch": 2.488721804511278, + "grad_norm": 0.04200039431452751, + "learning_rate": 0.0002, + "loss": 0.5254136919975281, + "mean_token_accuracy": 0.7850333154201508, + "num_tokens": 10802992.0, + "step": 662 + }, + { + "entropy": 0.5294183790683746, + "epoch": 2.492481203007519, + "grad_norm": 0.04227717965841293, + "learning_rate": 0.0002, + "loss": 0.5372048616409302, + "mean_token_accuracy": 0.7844373136758804, + "num_tokens": 10819187.0, + "step": 663 + }, + { + "entropy": 0.5186149403452873, + "epoch": 2.4962406015037595, + "grad_norm": 0.03944484889507294, + "learning_rate": 0.0002, + "loss": 0.5234470367431641, + "mean_token_accuracy": 0.7857441008090973, + "num_tokens": 10835170.0, + "step": 664 + }, + { + "entropy": 0.5416997969150543, + "epoch": 2.5, + "grad_norm": 0.043196793645620346, + "learning_rate": 0.0002, + "loss": 0.5474759936332703, + "mean_token_accuracy": 0.7749510407447815, + "num_tokens": 10851563.0, + "step": 665 + }, + { + "entropy": 0.5275483727455139, + "epoch": 2.5037593984962405, + "grad_norm": 0.03911745548248291, + "learning_rate": 0.0002, + "loss": 0.5205013155937195, + "mean_token_accuracy": 0.7898803949356079, + "num_tokens": 10867571.0, + "step": 666 + }, + { + "entropy": 0.5302275121212006, + "epoch": 2.507518796992481, + "grad_norm": 0.03766452148556709, + "learning_rate": 0.0002, + "loss": 0.5310875773429871, + "mean_token_accuracy": 0.7819045037031174, + "num_tokens": 10883849.0, + "step": 667 + }, + { + "entropy": 0.5416832715272903, + "epoch": 2.511278195488722, + "grad_norm": 0.03993174061179161, + "learning_rate": 0.0002, + "loss": 0.5426294207572937, + "mean_token_accuracy": 0.7777436971664429, + "num_tokens": 10900103.0, + "step": 668 + }, + { + "entropy": 0.5554288029670715, + "epoch": 2.5150375939849625, + "grad_norm": 0.046043481677770615, + "learning_rate": 0.0002, + "loss": 0.5500344634056091, + "mean_token_accuracy": 0.7746063023805618, + "num_tokens": 10916472.0, + "step": 669 + }, + { + "entropy": 0.5500206649303436, + "epoch": 2.518796992481203, + "grad_norm": 0.04341411218047142, + "learning_rate": 0.0002, + "loss": 0.5484751462936401, + "mean_token_accuracy": 0.7778518944978714, + "num_tokens": 10932960.0, + "step": 670 + }, + { + "entropy": 0.5585402101278305, + "epoch": 2.5225563909774436, + "grad_norm": 0.04927565157413483, + "learning_rate": 0.0002, + "loss": 0.5563656091690063, + "mean_token_accuracy": 0.7734353542327881, + "num_tokens": 10949340.0, + "step": 671 + }, + { + "entropy": 0.5314253345131874, + "epoch": 2.526315789473684, + "grad_norm": 0.04110320657491684, + "learning_rate": 0.0002, + "loss": 0.5281319618225098, + "mean_token_accuracy": 0.7881615608930588, + "num_tokens": 10965640.0, + "step": 672 + }, + { + "entropy": 0.519628070294857, + "epoch": 2.530075187969925, + "grad_norm": 0.03798144683241844, + "learning_rate": 0.0002, + "loss": 0.5186299085617065, + "mean_token_accuracy": 0.7885057926177979, + "num_tokens": 10982162.0, + "step": 673 + }, + { + "entropy": 0.5199308693408966, + "epoch": 2.5338345864661656, + "grad_norm": 0.04168830066919327, + "learning_rate": 0.0002, + "loss": 0.5289560556411743, + "mean_token_accuracy": 0.7860239744186401, + "num_tokens": 10998283.0, + "step": 674 + }, + { + "entropy": 0.5352334305644035, + "epoch": 2.537593984962406, + "grad_norm": 0.04851493611931801, + "learning_rate": 0.0002, + "loss": 0.5395171642303467, + "mean_token_accuracy": 0.781098335981369, + "num_tokens": 11014541.0, + "step": 675 + }, + { + "entropy": 0.5220839083194733, + "epoch": 2.5413533834586466, + "grad_norm": 0.03901033103466034, + "learning_rate": 0.0002, + "loss": 0.5202946662902832, + "mean_token_accuracy": 0.7897375226020813, + "num_tokens": 11030626.0, + "step": 676 + }, + { + "entropy": 0.5660356432199478, + "epoch": 2.545112781954887, + "grad_norm": 0.040614161640405655, + "learning_rate": 0.0002, + "loss": 0.5683348774909973, + "mean_token_accuracy": 0.7686392664909363, + "num_tokens": 11047170.0, + "step": 677 + }, + { + "entropy": 0.5248497724533081, + "epoch": 2.548872180451128, + "grad_norm": 0.050087373703718185, + "learning_rate": 0.0002, + "loss": 0.5326120257377625, + "mean_token_accuracy": 0.7856886386871338, + "num_tokens": 11063651.0, + "step": 678 + }, + { + "entropy": 0.5423640608787537, + "epoch": 2.5526315789473686, + "grad_norm": 0.05331513658165932, + "learning_rate": 0.0002, + "loss": 0.5449936389923096, + "mean_token_accuracy": 0.778554379940033, + "num_tokens": 11080048.0, + "step": 679 + }, + { + "entropy": 0.5384076982736588, + "epoch": 2.556390977443609, + "grad_norm": 0.04410131275653839, + "learning_rate": 0.0002, + "loss": 0.5350104570388794, + "mean_token_accuracy": 0.7837571948766708, + "num_tokens": 11096391.0, + "step": 680 + }, + { + "entropy": 0.529449462890625, + "epoch": 2.5601503759398496, + "grad_norm": 0.03738116845488548, + "learning_rate": 0.0002, + "loss": 0.5299030542373657, + "mean_token_accuracy": 0.7870044708251953, + "num_tokens": 11112709.0, + "step": 681 + }, + { + "entropy": 0.5311971455812454, + "epoch": 2.56390977443609, + "grad_norm": 0.04492153227329254, + "learning_rate": 0.0002, + "loss": 0.5362582206726074, + "mean_token_accuracy": 0.780634418129921, + "num_tokens": 11129093.0, + "step": 682 + }, + { + "entropy": 0.5400303602218628, + "epoch": 2.567669172932331, + "grad_norm": 0.036020781844854355, + "learning_rate": 0.0002, + "loss": 0.5404684543609619, + "mean_token_accuracy": 0.7825169265270233, + "num_tokens": 11145314.0, + "step": 683 + }, + { + "entropy": 0.5410858988761902, + "epoch": 2.571428571428571, + "grad_norm": 0.04276980832219124, + "learning_rate": 0.0002, + "loss": 0.5423122048377991, + "mean_token_accuracy": 0.7814541161060333, + "num_tokens": 11161581.0, + "step": 684 + }, + { + "entropy": 0.5380300432443619, + "epoch": 2.575187969924812, + "grad_norm": 0.03481379151344299, + "learning_rate": 0.0002, + "loss": 0.5358370542526245, + "mean_token_accuracy": 0.7818766683340073, + "num_tokens": 11177989.0, + "step": 685 + }, + { + "entropy": 0.5248596295714378, + "epoch": 2.5789473684210527, + "grad_norm": 0.036602359265089035, + "learning_rate": 0.0002, + "loss": 0.5253828763961792, + "mean_token_accuracy": 0.7854669690132141, + "num_tokens": 11194032.0, + "step": 686 + }, + { + "entropy": 0.5219234973192215, + "epoch": 2.582706766917293, + "grad_norm": 0.040489669889211655, + "learning_rate": 0.0002, + "loss": 0.5243583917617798, + "mean_token_accuracy": 0.786599799990654, + "num_tokens": 11210092.0, + "step": 687 + }, + { + "entropy": 0.5334769785404205, + "epoch": 2.5864661654135337, + "grad_norm": 0.03958981856703758, + "learning_rate": 0.0002, + "loss": 0.5376310348510742, + "mean_token_accuracy": 0.7825024574995041, + "num_tokens": 11226462.0, + "step": 688 + }, + { + "entropy": 0.5297794789075851, + "epoch": 2.590225563909774, + "grad_norm": 0.039997756481170654, + "learning_rate": 0.0002, + "loss": 0.5335977077484131, + "mean_token_accuracy": 0.7828920185565948, + "num_tokens": 11242781.0, + "step": 689 + }, + { + "entropy": 0.535497397184372, + "epoch": 2.593984962406015, + "grad_norm": 0.03865867853164673, + "learning_rate": 0.0002, + "loss": 0.5379775762557983, + "mean_token_accuracy": 0.7825619131326675, + "num_tokens": 11259131.0, + "step": 690 + }, + { + "entropy": 0.5340843796730042, + "epoch": 2.5977443609022557, + "grad_norm": 0.037679754197597504, + "learning_rate": 0.0002, + "loss": 0.5335901975631714, + "mean_token_accuracy": 0.7848968952894211, + "num_tokens": 11275370.0, + "step": 691 + }, + { + "entropy": 0.5506868213415146, + "epoch": 2.601503759398496, + "grad_norm": 0.04139415919780731, + "learning_rate": 0.0002, + "loss": 0.5515389442443848, + "mean_token_accuracy": 0.7779832780361176, + "num_tokens": 11291675.0, + "step": 692 + }, + { + "entropy": 0.5458535552024841, + "epoch": 2.6052631578947367, + "grad_norm": 0.03914312273263931, + "learning_rate": 0.0002, + "loss": 0.5428761839866638, + "mean_token_accuracy": 0.7802267819643021, + "num_tokens": 11308082.0, + "step": 693 + }, + { + "entropy": 0.5242106392979622, + "epoch": 2.6090225563909772, + "grad_norm": 0.03517727553844452, + "learning_rate": 0.0002, + "loss": 0.5183535218238831, + "mean_token_accuracy": 0.7899799644947052, + "num_tokens": 11324349.0, + "step": 694 + }, + { + "entropy": 0.527122899889946, + "epoch": 2.612781954887218, + "grad_norm": 0.03646351397037506, + "learning_rate": 0.0002, + "loss": 0.5237759351730347, + "mean_token_accuracy": 0.7876067459583282, + "num_tokens": 11340804.0, + "step": 695 + }, + { + "entropy": 0.5334932953119278, + "epoch": 2.6165413533834587, + "grad_norm": 0.03501564636826515, + "learning_rate": 0.0002, + "loss": 0.5345377326011658, + "mean_token_accuracy": 0.7828026562929153, + "num_tokens": 11357207.0, + "step": 696 + }, + { + "entropy": 0.5264469981193542, + "epoch": 2.6203007518796992, + "grad_norm": 0.042768895626068115, + "learning_rate": 0.0002, + "loss": 0.5306587219238281, + "mean_token_accuracy": 0.7863332629203796, + "num_tokens": 11373543.0, + "step": 697 + }, + { + "entropy": 0.5400331318378448, + "epoch": 2.6240601503759398, + "grad_norm": 0.03265206515789032, + "learning_rate": 0.0002, + "loss": 0.5402212142944336, + "mean_token_accuracy": 0.7809455096721649, + "num_tokens": 11390155.0, + "step": 698 + }, + { + "entropy": 0.5565398335456848, + "epoch": 2.6278195488721803, + "grad_norm": 0.04417556896805763, + "learning_rate": 0.0002, + "loss": 0.5573287010192871, + "mean_token_accuracy": 0.7738644480705261, + "num_tokens": 11406739.0, + "step": 699 + }, + { + "entropy": 0.5443829298019409, + "epoch": 2.6315789473684212, + "grad_norm": 0.03721097856760025, + "learning_rate": 0.0002, + "loss": 0.5420445799827576, + "mean_token_accuracy": 0.7787856310606003, + "num_tokens": 11423213.0, + "step": 700 + }, + { + "entropy": 0.5284033268690109, + "epoch": 2.6353383458646618, + "grad_norm": 0.041038673371076584, + "learning_rate": 0.0002, + "loss": 0.5301244258880615, + "mean_token_accuracy": 0.7856591492891312, + "num_tokens": 11439231.0, + "step": 701 + }, + { + "entropy": 0.5442045629024506, + "epoch": 2.6390977443609023, + "grad_norm": 0.03640377148985863, + "learning_rate": 0.0002, + "loss": 0.5464366674423218, + "mean_token_accuracy": 0.7776281535625458, + "num_tokens": 11455738.0, + "step": 702 + }, + { + "entropy": 0.5383570641279221, + "epoch": 2.642857142857143, + "grad_norm": 0.04412476718425751, + "learning_rate": 0.0002, + "loss": 0.544456422328949, + "mean_token_accuracy": 0.7783865183591843, + "num_tokens": 11471797.0, + "step": 703 + }, + { + "entropy": 0.5191052407026291, + "epoch": 2.6466165413533833, + "grad_norm": 0.035958074033260345, + "learning_rate": 0.0002, + "loss": 0.5193113088607788, + "mean_token_accuracy": 0.7863477617502213, + "num_tokens": 11487876.0, + "step": 704 + }, + { + "entropy": 0.5466601550579071, + "epoch": 2.6503759398496243, + "grad_norm": 0.048238396644592285, + "learning_rate": 0.0002, + "loss": 0.5443681478500366, + "mean_token_accuracy": 0.7801824659109116, + "num_tokens": 11504122.0, + "step": 705 + }, + { + "entropy": 0.5602389425039291, + "epoch": 2.654135338345865, + "grad_norm": 0.0392533615231514, + "learning_rate": 0.0002, + "loss": 0.5607460141181946, + "mean_token_accuracy": 0.7710349410772324, + "num_tokens": 11520493.0, + "step": 706 + }, + { + "entropy": 0.5393271297216415, + "epoch": 2.6578947368421053, + "grad_norm": 0.046152085065841675, + "learning_rate": 0.0002, + "loss": 0.5473223924636841, + "mean_token_accuracy": 0.7810050994157791, + "num_tokens": 11536519.0, + "step": 707 + }, + { + "entropy": 0.5321537107229233, + "epoch": 2.661654135338346, + "grad_norm": 0.038532763719558716, + "learning_rate": 0.0002, + "loss": 0.5388097763061523, + "mean_token_accuracy": 0.7796639204025269, + "num_tokens": 11552787.0, + "step": 708 + }, + { + "entropy": 0.5336644947528839, + "epoch": 2.6654135338345863, + "grad_norm": 0.043611474335193634, + "learning_rate": 0.0002, + "loss": 0.5328789949417114, + "mean_token_accuracy": 0.7849068492650986, + "num_tokens": 11569073.0, + "step": 709 + }, + { + "entropy": 0.5428521186113358, + "epoch": 2.6691729323308273, + "grad_norm": 0.03883448615670204, + "learning_rate": 0.0002, + "loss": 0.5391871333122253, + "mean_token_accuracy": 0.781522735953331, + "num_tokens": 11585504.0, + "step": 710 + }, + { + "entropy": 0.5335109233856201, + "epoch": 2.672932330827068, + "grad_norm": 0.03785593435168266, + "learning_rate": 0.0002, + "loss": 0.5298542976379395, + "mean_token_accuracy": 0.7834679186344147, + "num_tokens": 11601813.0, + "step": 711 + }, + { + "entropy": 0.527670718729496, + "epoch": 2.6766917293233083, + "grad_norm": 0.036839164793491364, + "learning_rate": 0.0002, + "loss": 0.5316509008407593, + "mean_token_accuracy": 0.7826409935951233, + "num_tokens": 11618283.0, + "step": 712 + }, + { + "entropy": 0.5326329097151756, + "epoch": 2.680451127819549, + "grad_norm": 0.04807848483324051, + "learning_rate": 0.0002, + "loss": 0.5426601767539978, + "mean_token_accuracy": 0.7812999188899994, + "num_tokens": 11634632.0, + "step": 713 + }, + { + "entropy": 0.5393012017011642, + "epoch": 2.6842105263157894, + "grad_norm": 0.038986288011074066, + "learning_rate": 0.0002, + "loss": 0.5428729057312012, + "mean_token_accuracy": 0.7807578444480896, + "num_tokens": 11650999.0, + "step": 714 + }, + { + "entropy": 0.5483723729848862, + "epoch": 2.6879699248120303, + "grad_norm": 0.03780362382531166, + "learning_rate": 0.0002, + "loss": 0.5442914366722107, + "mean_token_accuracy": 0.7784056067466736, + "num_tokens": 11667151.0, + "step": 715 + }, + { + "entropy": 0.547231912612915, + "epoch": 2.6917293233082704, + "grad_norm": 0.045203741639852524, + "learning_rate": 0.0002, + "loss": 0.5431523323059082, + "mean_token_accuracy": 0.7817295789718628, + "num_tokens": 11683514.0, + "step": 716 + }, + { + "entropy": 0.5371780097484589, + "epoch": 2.6954887218045114, + "grad_norm": 0.03749014437198639, + "learning_rate": 0.0002, + "loss": 0.5376321077346802, + "mean_token_accuracy": 0.7811625152826309, + "num_tokens": 11699727.0, + "step": 717 + }, + { + "entropy": 0.5319441854953766, + "epoch": 2.699248120300752, + "grad_norm": 0.04130973294377327, + "learning_rate": 0.0002, + "loss": 0.5348937511444092, + "mean_token_accuracy": 0.784428283572197, + "num_tokens": 11716234.0, + "step": 718 + }, + { + "entropy": 0.5342800319194794, + "epoch": 2.7030075187969924, + "grad_norm": 0.04313354194164276, + "learning_rate": 0.0002, + "loss": 0.5452970266342163, + "mean_token_accuracy": 0.7770380526781082, + "num_tokens": 11732506.0, + "step": 719 + }, + { + "entropy": 0.5398904979228973, + "epoch": 2.706766917293233, + "grad_norm": 0.04417818412184715, + "learning_rate": 0.0002, + "loss": 0.5421609878540039, + "mean_token_accuracy": 0.7809232920408249, + "num_tokens": 11748768.0, + "step": 720 + }, + { + "entropy": 0.5440465807914734, + "epoch": 2.7105263157894735, + "grad_norm": 0.036389391869306564, + "learning_rate": 0.0002, + "loss": 0.5376783609390259, + "mean_token_accuracy": 0.7818926721811295, + "num_tokens": 11765164.0, + "step": 721 + }, + { + "entropy": 0.5312932878732681, + "epoch": 2.7142857142857144, + "grad_norm": 0.037032727152109146, + "learning_rate": 0.0002, + "loss": 0.5279201865196228, + "mean_token_accuracy": 0.7845446914434433, + "num_tokens": 11781577.0, + "step": 722 + }, + { + "entropy": 0.5704400539398193, + "epoch": 2.718045112781955, + "grad_norm": 0.03669275715947151, + "learning_rate": 0.0002, + "loss": 0.5670531988143921, + "mean_token_accuracy": 0.7707259953022003, + "num_tokens": 11798120.0, + "step": 723 + }, + { + "entropy": 0.5271944850683212, + "epoch": 2.7218045112781954, + "grad_norm": 0.04460054636001587, + "learning_rate": 0.0002, + "loss": 0.531152606010437, + "mean_token_accuracy": 0.7819943279027939, + "num_tokens": 11814241.0, + "step": 724 + }, + { + "entropy": 0.5407906174659729, + "epoch": 2.725563909774436, + "grad_norm": 0.04240792244672775, + "learning_rate": 0.0002, + "loss": 0.5359742045402527, + "mean_token_accuracy": 0.7843276411294937, + "num_tokens": 11830762.0, + "step": 725 + }, + { + "entropy": 0.538364827632904, + "epoch": 2.7293233082706765, + "grad_norm": 0.04200772941112518, + "learning_rate": 0.0002, + "loss": 0.5396072864532471, + "mean_token_accuracy": 0.7798211723566055, + "num_tokens": 11847252.0, + "step": 726 + }, + { + "entropy": 0.5308995842933655, + "epoch": 2.7330827067669174, + "grad_norm": 0.03762137144804001, + "learning_rate": 0.0002, + "loss": 0.5341114401817322, + "mean_token_accuracy": 0.7839807718992233, + "num_tokens": 11863535.0, + "step": 727 + }, + { + "entropy": 0.5268086791038513, + "epoch": 2.736842105263158, + "grad_norm": 0.03609534725546837, + "learning_rate": 0.0002, + "loss": 0.5221338868141174, + "mean_token_accuracy": 0.789483904838562, + "num_tokens": 11879928.0, + "step": 728 + }, + { + "entropy": 0.5412466526031494, + "epoch": 2.7406015037593985, + "grad_norm": 0.040453530848026276, + "learning_rate": 0.0002, + "loss": 0.5429666042327881, + "mean_token_accuracy": 0.7812945246696472, + "num_tokens": 11896142.0, + "step": 729 + }, + { + "entropy": 0.5352004170417786, + "epoch": 2.744360902255639, + "grad_norm": 0.044242773205041885, + "learning_rate": 0.0002, + "loss": 0.536725640296936, + "mean_token_accuracy": 0.7831927388906479, + "num_tokens": 11912241.0, + "step": 730 + }, + { + "entropy": 0.5453604012727737, + "epoch": 2.7481203007518795, + "grad_norm": 0.0423831045627594, + "learning_rate": 0.0002, + "loss": 0.5527924299240112, + "mean_token_accuracy": 0.7745030075311661, + "num_tokens": 11928611.0, + "step": 731 + }, + { + "entropy": 0.5306564420461655, + "epoch": 2.7518796992481205, + "grad_norm": 0.0449826754629612, + "learning_rate": 0.0002, + "loss": 0.5404161214828491, + "mean_token_accuracy": 0.7825066149234772, + "num_tokens": 11944963.0, + "step": 732 + }, + { + "entropy": 0.5378609150648117, + "epoch": 2.755639097744361, + "grad_norm": 0.04047499597072601, + "learning_rate": 0.0002, + "loss": 0.5455936193466187, + "mean_token_accuracy": 0.7781111598014832, + "num_tokens": 11961304.0, + "step": 733 + }, + { + "entropy": 0.5367683172225952, + "epoch": 2.7593984962406015, + "grad_norm": 0.04174184799194336, + "learning_rate": 0.0002, + "loss": 0.5363747477531433, + "mean_token_accuracy": 0.7800599485635757, + "num_tokens": 11977719.0, + "step": 734 + }, + { + "entropy": 0.5561744570732117, + "epoch": 2.763157894736842, + "grad_norm": 0.04008743166923523, + "learning_rate": 0.0002, + "loss": 0.552983283996582, + "mean_token_accuracy": 0.7766020447015762, + "num_tokens": 11993844.0, + "step": 735 + }, + { + "entropy": 0.5463001132011414, + "epoch": 2.7669172932330826, + "grad_norm": 0.03661397472023964, + "learning_rate": 0.0002, + "loss": 0.5395646691322327, + "mean_token_accuracy": 0.7784713059663773, + "num_tokens": 12010281.0, + "step": 736 + }, + { + "entropy": 0.5210074186325073, + "epoch": 2.7706766917293235, + "grad_norm": 0.03591572865843773, + "learning_rate": 0.0002, + "loss": 0.5220502018928528, + "mean_token_accuracy": 0.7874239087104797, + "num_tokens": 12026530.0, + "step": 737 + }, + { + "entropy": 0.5433954000473022, + "epoch": 2.774436090225564, + "grad_norm": 0.04104798287153244, + "learning_rate": 0.0002, + "loss": 0.5510661005973816, + "mean_token_accuracy": 0.7753429859876633, + "num_tokens": 12042889.0, + "step": 738 + }, + { + "entropy": 0.5119400694966316, + "epoch": 2.7781954887218046, + "grad_norm": 0.039529718458652496, + "learning_rate": 0.0002, + "loss": 0.5171459317207336, + "mean_token_accuracy": 0.7895881831645966, + "num_tokens": 12059138.0, + "step": 739 + }, + { + "entropy": 0.5456018000841141, + "epoch": 2.781954887218045, + "grad_norm": 0.03834446892142296, + "learning_rate": 0.0002, + "loss": 0.5516197681427002, + "mean_token_accuracy": 0.7791079431772232, + "num_tokens": 12075629.0, + "step": 740 + }, + { + "entropy": 0.5416502356529236, + "epoch": 2.7857142857142856, + "grad_norm": 0.03950374945998192, + "learning_rate": 0.0002, + "loss": 0.541545033454895, + "mean_token_accuracy": 0.7776272892951965, + "num_tokens": 12091966.0, + "step": 741 + }, + { + "entropy": 0.5439035892486572, + "epoch": 2.7894736842105265, + "grad_norm": 0.03714444488286972, + "learning_rate": 0.0002, + "loss": 0.5373456478118896, + "mean_token_accuracy": 0.7819632142782211, + "num_tokens": 12108429.0, + "step": 742 + }, + { + "entropy": 0.5513075590133667, + "epoch": 2.793233082706767, + "grad_norm": 0.03567977994680405, + "learning_rate": 0.0002, + "loss": 0.5416471362113953, + "mean_token_accuracy": 0.7816196233034134, + "num_tokens": 12124997.0, + "step": 743 + }, + { + "entropy": 0.5525044798851013, + "epoch": 2.7969924812030076, + "grad_norm": 0.036792755126953125, + "learning_rate": 0.0002, + "loss": 0.5522248148918152, + "mean_token_accuracy": 0.7766036689281464, + "num_tokens": 12141338.0, + "step": 744 + }, + { + "entropy": 0.522551566362381, + "epoch": 2.800751879699248, + "grad_norm": 0.03983981907367706, + "learning_rate": 0.0002, + "loss": 0.5232869982719421, + "mean_token_accuracy": 0.7857565432786942, + "num_tokens": 12157683.0, + "step": 745 + }, + { + "entropy": 0.5314129739999771, + "epoch": 2.8045112781954886, + "grad_norm": 0.03918331488966942, + "learning_rate": 0.0002, + "loss": 0.5321224927902222, + "mean_token_accuracy": 0.7834707945585251, + "num_tokens": 12174145.0, + "step": 746 + }, + { + "entropy": 0.5208713561296463, + "epoch": 2.8082706766917296, + "grad_norm": 0.03813806548714638, + "learning_rate": 0.0002, + "loss": 0.5278118848800659, + "mean_token_accuracy": 0.7842634320259094, + "num_tokens": 12190434.0, + "step": 747 + }, + { + "entropy": 0.5349813252687454, + "epoch": 2.8120300751879697, + "grad_norm": 0.04137561097741127, + "learning_rate": 0.0002, + "loss": 0.5378336906433105, + "mean_token_accuracy": 0.7831988483667374, + "num_tokens": 12206552.0, + "step": 748 + }, + { + "entropy": 0.529716819524765, + "epoch": 2.8157894736842106, + "grad_norm": 0.037089038640260696, + "learning_rate": 0.0002, + "loss": 0.530727744102478, + "mean_token_accuracy": 0.787126213312149, + "num_tokens": 12222985.0, + "step": 749 + }, + { + "entropy": 0.5329919755458832, + "epoch": 2.819548872180451, + "grad_norm": 0.03868598863482475, + "learning_rate": 0.0002, + "loss": 0.535510241985321, + "mean_token_accuracy": 0.7821749895811081, + "num_tokens": 12239387.0, + "step": 750 + }, + { + "entropy": 0.5512770563364029, + "epoch": 2.8233082706766917, + "grad_norm": 0.03504098951816559, + "learning_rate": 0.0002, + "loss": 0.5498230457305908, + "mean_token_accuracy": 0.77789406478405, + "num_tokens": 12255678.0, + "step": 751 + }, + { + "entropy": 0.5387983024120331, + "epoch": 2.827067669172932, + "grad_norm": 0.04012952372431755, + "learning_rate": 0.0002, + "loss": 0.5449475049972534, + "mean_token_accuracy": 0.7773616015911102, + "num_tokens": 12271735.0, + "step": 752 + }, + { + "entropy": 0.5438449382781982, + "epoch": 2.8308270676691727, + "grad_norm": 0.04448486492037773, + "learning_rate": 0.0002, + "loss": 0.5473355650901794, + "mean_token_accuracy": 0.7765258699655533, + "num_tokens": 12288034.0, + "step": 753 + }, + { + "entropy": 0.5242600291967392, + "epoch": 2.8345864661654137, + "grad_norm": 0.03874325752258301, + "learning_rate": 0.0002, + "loss": 0.5232968330383301, + "mean_token_accuracy": 0.7877610623836517, + "num_tokens": 12304188.0, + "step": 754 + }, + { + "entropy": 0.5431344211101532, + "epoch": 2.838345864661654, + "grad_norm": 0.04510108754038811, + "learning_rate": 0.0002, + "loss": 0.5374618768692017, + "mean_token_accuracy": 0.783510684967041, + "num_tokens": 12320210.0, + "step": 755 + }, + { + "entropy": 0.566683366894722, + "epoch": 2.8421052631578947, + "grad_norm": 0.038339611142873764, + "learning_rate": 0.0002, + "loss": 0.5602604746818542, + "mean_token_accuracy": 0.7746738642454147, + "num_tokens": 12336736.0, + "step": 756 + }, + { + "entropy": 0.5256731361150742, + "epoch": 2.845864661654135, + "grad_norm": 0.04725516587495804, + "learning_rate": 0.0002, + "loss": 0.5308937430381775, + "mean_token_accuracy": 0.7819661647081375, + "num_tokens": 12353304.0, + "step": 757 + }, + { + "entropy": 0.5368983596563339, + "epoch": 2.8496240601503757, + "grad_norm": 0.04469098895788193, + "learning_rate": 0.0002, + "loss": 0.5494676828384399, + "mean_token_accuracy": 0.7781397998332977, + "num_tokens": 12369897.0, + "step": 758 + }, + { + "entropy": 0.5407442450523376, + "epoch": 2.8533834586466167, + "grad_norm": 0.04544219374656677, + "learning_rate": 0.0002, + "loss": 0.5484528541564941, + "mean_token_accuracy": 0.7776692062616348, + "num_tokens": 12385920.0, + "step": 759 + }, + { + "entropy": 0.5232048332691193, + "epoch": 2.857142857142857, + "grad_norm": 0.03687431663274765, + "learning_rate": 0.0002, + "loss": 0.5165009498596191, + "mean_token_accuracy": 0.789492592215538, + "num_tokens": 12402444.0, + "step": 760 + }, + { + "entropy": 0.5273272693157196, + "epoch": 2.8609022556390977, + "grad_norm": 0.037794262170791626, + "learning_rate": 0.0002, + "loss": 0.5232701301574707, + "mean_token_accuracy": 0.788696900010109, + "num_tokens": 12418988.0, + "step": 761 + }, + { + "entropy": 0.5304031819105148, + "epoch": 2.8646616541353382, + "grad_norm": 0.038420420140028, + "learning_rate": 0.0002, + "loss": 0.5247512459754944, + "mean_token_accuracy": 0.7857597023248672, + "num_tokens": 12435536.0, + "step": 762 + }, + { + "entropy": 0.5269620269536972, + "epoch": 2.8684210526315788, + "grad_norm": 0.04084121063351631, + "learning_rate": 0.0002, + "loss": 0.5284534692764282, + "mean_token_accuracy": 0.7831205129623413, + "num_tokens": 12451737.0, + "step": 763 + }, + { + "entropy": 0.5162742882966995, + "epoch": 2.8721804511278197, + "grad_norm": 0.04410441219806671, + "learning_rate": 0.0002, + "loss": 0.5282053351402283, + "mean_token_accuracy": 0.7836557477712631, + "num_tokens": 12467925.0, + "step": 764 + }, + { + "entropy": 0.5351501703262329, + "epoch": 2.8759398496240602, + "grad_norm": 0.04215250536799431, + "learning_rate": 0.0002, + "loss": 0.5436667799949646, + "mean_token_accuracy": 0.7797116935253143, + "num_tokens": 12484385.0, + "step": 765 + }, + { + "entropy": 0.5445809066295624, + "epoch": 2.8796992481203008, + "grad_norm": 0.039003774523735046, + "learning_rate": 0.0002, + "loss": 0.5466570854187012, + "mean_token_accuracy": 0.7810900658369064, + "num_tokens": 12500782.0, + "step": 766 + }, + { + "entropy": 0.5677538812160492, + "epoch": 2.8834586466165413, + "grad_norm": 0.038001179695129395, + "learning_rate": 0.0002, + "loss": 0.5561648011207581, + "mean_token_accuracy": 0.7711465805768967, + "num_tokens": 12517241.0, + "step": 767 + }, + { + "entropy": 0.5477330982685089, + "epoch": 2.887218045112782, + "grad_norm": 0.03719984367489815, + "learning_rate": 0.0002, + "loss": 0.5399020910263062, + "mean_token_accuracy": 0.7845228165388107, + "num_tokens": 12533645.0, + "step": 768 + }, + { + "entropy": 0.5322476327419281, + "epoch": 2.8909774436090228, + "grad_norm": 0.04132302105426788, + "learning_rate": 0.0002, + "loss": 0.5327161550521851, + "mean_token_accuracy": 0.7837435156106949, + "num_tokens": 12550190.0, + "step": 769 + }, + { + "entropy": 0.5217838287353516, + "epoch": 2.8947368421052633, + "grad_norm": 0.041548822075128555, + "learning_rate": 0.0002, + "loss": 0.5239148139953613, + "mean_token_accuracy": 0.7885714769363403, + "num_tokens": 12566418.0, + "step": 770 + }, + { + "entropy": 0.5343627035617828, + "epoch": 2.898496240601504, + "grad_norm": 0.04029269516468048, + "learning_rate": 0.0002, + "loss": 0.5422418117523193, + "mean_token_accuracy": 0.7791919559240341, + "num_tokens": 12582647.0, + "step": 771 + }, + { + "entropy": 0.5284289866685867, + "epoch": 2.9022556390977443, + "grad_norm": 0.04448118433356285, + "learning_rate": 0.0002, + "loss": 0.5392597913742065, + "mean_token_accuracy": 0.7816968858242035, + "num_tokens": 12598795.0, + "step": 772 + }, + { + "entropy": 0.5162788778543472, + "epoch": 2.906015037593985, + "grad_norm": 0.04028403386473656, + "learning_rate": 0.0002, + "loss": 0.521114706993103, + "mean_token_accuracy": 0.7890318781137466, + "num_tokens": 12615105.0, + "step": 773 + }, + { + "entropy": 0.5632917135953903, + "epoch": 2.909774436090226, + "grad_norm": 0.04001300409436226, + "learning_rate": 0.0002, + "loss": 0.5603697299957275, + "mean_token_accuracy": 0.7751758396625519, + "num_tokens": 12631390.0, + "step": 774 + }, + { + "entropy": 0.5503305643796921, + "epoch": 2.9135338345864663, + "grad_norm": 0.03347298875451088, + "learning_rate": 0.0002, + "loss": 0.5459069609642029, + "mean_token_accuracy": 0.7786167114973068, + "num_tokens": 12647885.0, + "step": 775 + }, + { + "entropy": 0.5473008453845978, + "epoch": 2.917293233082707, + "grad_norm": 0.03752491995692253, + "learning_rate": 0.0002, + "loss": 0.5333649516105652, + "mean_token_accuracy": 0.7828412652015686, + "num_tokens": 12664120.0, + "step": 776 + }, + { + "entropy": 0.5354459285736084, + "epoch": 2.9210526315789473, + "grad_norm": 0.04058157652616501, + "learning_rate": 0.0002, + "loss": 0.5341867208480835, + "mean_token_accuracy": 0.7867896258831024, + "num_tokens": 12680500.0, + "step": 777 + }, + { + "entropy": 0.5142473876476288, + "epoch": 2.924812030075188, + "grad_norm": 0.04209408536553383, + "learning_rate": 0.0002, + "loss": 0.5206042528152466, + "mean_token_accuracy": 0.7850682884454727, + "num_tokens": 12696593.0, + "step": 778 + }, + { + "entropy": 0.5365364253520966, + "epoch": 2.928571428571429, + "grad_norm": 0.04453515261411667, + "learning_rate": 0.0002, + "loss": 0.545800507068634, + "mean_token_accuracy": 0.7796301394701004, + "num_tokens": 12712691.0, + "step": 779 + }, + { + "entropy": 0.542564183473587, + "epoch": 2.932330827067669, + "grad_norm": 0.03840424865484238, + "learning_rate": 0.0002, + "loss": 0.5449208617210388, + "mean_token_accuracy": 0.778635174036026, + "num_tokens": 12729062.0, + "step": 780 + }, + { + "entropy": 0.5423157215118408, + "epoch": 2.93609022556391, + "grad_norm": 0.0474003404378891, + "learning_rate": 0.0002, + "loss": 0.5478240251541138, + "mean_token_accuracy": 0.7766861170530319, + "num_tokens": 12745381.0, + "step": 781 + }, + { + "entropy": 0.5361933559179306, + "epoch": 2.9398496240601504, + "grad_norm": 0.037907540798187256, + "learning_rate": 0.0002, + "loss": 0.5324196815490723, + "mean_token_accuracy": 0.7846821397542953, + "num_tokens": 12761688.0, + "step": 782 + }, + { + "entropy": 0.5589640736579895, + "epoch": 2.943609022556391, + "grad_norm": 0.04339439421892166, + "learning_rate": 0.0002, + "loss": 0.5444428324699402, + "mean_token_accuracy": 0.7806793451309204, + "num_tokens": 12778289.0, + "step": 783 + }, + { + "entropy": 0.5389928370714188, + "epoch": 2.9473684210526314, + "grad_norm": 0.03586737811565399, + "learning_rate": 0.0002, + "loss": 0.5383816957473755, + "mean_token_accuracy": 0.7810381203889847, + "num_tokens": 12794954.0, + "step": 784 + }, + { + "entropy": 0.5266241282224655, + "epoch": 2.951127819548872, + "grad_norm": 0.03784513846039772, + "learning_rate": 0.0002, + "loss": 0.5282174348831177, + "mean_token_accuracy": 0.7867349982261658, + "num_tokens": 12811150.0, + "step": 785 + }, + { + "entropy": 0.5349175482988358, + "epoch": 2.954887218045113, + "grad_norm": 0.04314623400568962, + "learning_rate": 0.0002, + "loss": 0.5450260043144226, + "mean_token_accuracy": 0.7768904566764832, + "num_tokens": 12827293.0, + "step": 786 + }, + { + "entropy": 0.5137490779161453, + "epoch": 2.9586466165413534, + "grad_norm": 0.04252813383936882, + "learning_rate": 0.0002, + "loss": 0.5246796011924744, + "mean_token_accuracy": 0.7863982170820236, + "num_tokens": 12843307.0, + "step": 787 + }, + { + "entropy": 0.5352135896682739, + "epoch": 2.962406015037594, + "grad_norm": 0.045887961983680725, + "learning_rate": 0.0002, + "loss": 0.5371412634849548, + "mean_token_accuracy": 0.7804872691631317, + "num_tokens": 12859595.0, + "step": 788 + }, + { + "entropy": 0.5446542203426361, + "epoch": 2.9661654135338344, + "grad_norm": 0.04673901945352554, + "learning_rate": 0.0002, + "loss": 0.5501778721809387, + "mean_token_accuracy": 0.7773697823286057, + "num_tokens": 12875931.0, + "step": 789 + }, + { + "entropy": 0.5408057272434235, + "epoch": 2.969924812030075, + "grad_norm": 0.0367148295044899, + "learning_rate": 0.0002, + "loss": 0.5386841297149658, + "mean_token_accuracy": 0.779689833521843, + "num_tokens": 12892289.0, + "step": 790 + }, + { + "entropy": 0.538294106721878, + "epoch": 2.973684210526316, + "grad_norm": 0.035284459590911865, + "learning_rate": 0.0002, + "loss": 0.5302733778953552, + "mean_token_accuracy": 0.7843924909830093, + "num_tokens": 12908646.0, + "step": 791 + }, + { + "entropy": 0.5408864170312881, + "epoch": 2.9774436090225564, + "grad_norm": 0.03952067717909813, + "learning_rate": 0.0002, + "loss": 0.5328561663627625, + "mean_token_accuracy": 0.7823582589626312, + "num_tokens": 12924940.0, + "step": 792 + }, + { + "entropy": 0.5341958701610565, + "epoch": 2.981203007518797, + "grad_norm": 0.03711646795272827, + "learning_rate": 0.0002, + "loss": 0.5313258767127991, + "mean_token_accuracy": 0.7841775417327881, + "num_tokens": 12941104.0, + "step": 793 + }, + { + "entropy": 0.5351585075259209, + "epoch": 2.9849624060150375, + "grad_norm": 0.04043775424361229, + "learning_rate": 0.0002, + "loss": 0.5411684513092041, + "mean_token_accuracy": 0.7801253944635391, + "num_tokens": 12957327.0, + "step": 794 + }, + { + "entropy": 0.5278606861829758, + "epoch": 2.988721804511278, + "grad_norm": 0.04125319793820381, + "learning_rate": 0.0002, + "loss": 0.5394368171691895, + "mean_token_accuracy": 0.7814257442951202, + "num_tokens": 12973968.0, + "step": 795 + }, + { + "entropy": 0.5424105674028397, + "epoch": 2.992481203007519, + "grad_norm": 0.04019284248352051, + "learning_rate": 0.0002, + "loss": 0.5428224802017212, + "mean_token_accuracy": 0.7811149209737778, + "num_tokens": 12990151.0, + "step": 796 + }, + { + "entropy": 0.526485025882721, + "epoch": 2.9962406015037595, + "grad_norm": 0.04355369135737419, + "learning_rate": 0.0002, + "loss": 0.524267315864563, + "mean_token_accuracy": 0.7883585393428802, + "num_tokens": 13006619.0, + "step": 797 + }, + { + "entropy": 0.5499685406684875, + "epoch": 3.0, + "grad_norm": 0.04084917902946472, + "learning_rate": 0.0002, + "loss": 0.5499616265296936, + "mean_token_accuracy": 0.7766987532377243, + "num_tokens": 13023154.0, + "step": 798 + } + ], + "logging_steps": 1, + "max_steps": 798, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.2137387169173996e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}