clean-subliminal-learning-otters / trainer_state.json
eac123's picture
Upload final checkpoint (checkpoint-798)
8e7c8e7 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 798,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.1405175626277924,
"epoch": 0.0037593984962406013,
"grad_norm": 0.40029582381248474,
"learning_rate": 0.0002,
"loss": 2.4748640060424805,
"mean_token_accuracy": 0.5338118821382523,
"num_tokens": 16246.0,
"step": 1
},
{
"entropy": 1.2275302708148956,
"epoch": 0.007518796992481203,
"grad_norm": 0.36828649044036865,
"learning_rate": 0.0002,
"loss": 2.125943660736084,
"mean_token_accuracy": 0.5713680684566498,
"num_tokens": 32716.0,
"step": 2
},
{
"entropy": 1.4195487797260284,
"epoch": 0.011278195488721804,
"grad_norm": 0.29105839133262634,
"learning_rate": 0.0002,
"loss": 1.735130786895752,
"mean_token_accuracy": 0.5909573882818222,
"num_tokens": 48967.0,
"step": 3
},
{
"entropy": 1.3783348500728607,
"epoch": 0.015037593984962405,
"grad_norm": 0.2323397547006607,
"learning_rate": 0.0002,
"loss": 1.4040782451629639,
"mean_token_accuracy": 0.6318088620901108,
"num_tokens": 65467.0,
"step": 4
},
{
"entropy": 1.3656240701675415,
"epoch": 0.018796992481203006,
"grad_norm": 0.2868480384349823,
"learning_rate": 0.0002,
"loss": 1.3035261631011963,
"mean_token_accuracy": 0.6341304779052734,
"num_tokens": 81665.0,
"step": 5
},
{
"entropy": 1.264964371919632,
"epoch": 0.022556390977443608,
"grad_norm": 0.14605936408042908,
"learning_rate": 0.0002,
"loss": 1.1722630262374878,
"mean_token_accuracy": 0.6646067351102829,
"num_tokens": 97913.0,
"step": 6
},
{
"entropy": 1.1983447670936584,
"epoch": 0.02631578947368421,
"grad_norm": 0.10632229596376419,
"learning_rate": 0.0002,
"loss": 1.1054309606552124,
"mean_token_accuracy": 0.6686217486858368,
"num_tokens": 113953.0,
"step": 7
},
{
"entropy": 1.1218359470367432,
"epoch": 0.03007518796992481,
"grad_norm": 0.09761745482683182,
"learning_rate": 0.0002,
"loss": 1.0230426788330078,
"mean_token_accuracy": 0.676657035946846,
"num_tokens": 130177.0,
"step": 8
},
{
"entropy": 1.0549319684505463,
"epoch": 0.03383458646616541,
"grad_norm": 0.1231616735458374,
"learning_rate": 0.0002,
"loss": 0.9938599467277527,
"mean_token_accuracy": 0.6875758469104767,
"num_tokens": 146621.0,
"step": 9
},
{
"entropy": 0.987179160118103,
"epoch": 0.03759398496240601,
"grad_norm": 0.11966806650161743,
"learning_rate": 0.0002,
"loss": 0.9243900775909424,
"mean_token_accuracy": 0.6994709670543671,
"num_tokens": 162843.0,
"step": 10
},
{
"entropy": 0.935651957988739,
"epoch": 0.041353383458646614,
"grad_norm": 0.10380394756793976,
"learning_rate": 0.0002,
"loss": 0.866508960723877,
"mean_token_accuracy": 0.7096800655126572,
"num_tokens": 179313.0,
"step": 11
},
{
"entropy": 0.9110619872808456,
"epoch": 0.045112781954887216,
"grad_norm": 0.10094986110925674,
"learning_rate": 0.0002,
"loss": 0.832156240940094,
"mean_token_accuracy": 0.7104088068008423,
"num_tokens": 195785.0,
"step": 12
},
{
"entropy": 0.855834111571312,
"epoch": 0.04887218045112782,
"grad_norm": 0.37487563490867615,
"learning_rate": 0.0002,
"loss": 0.8014079332351685,
"mean_token_accuracy": 0.7197864800691605,
"num_tokens": 212026.0,
"step": 13
},
{
"entropy": 0.7773148268461227,
"epoch": 0.05263157894736842,
"grad_norm": 0.09044307470321655,
"learning_rate": 0.0002,
"loss": 0.7479192614555359,
"mean_token_accuracy": 0.7304967045783997,
"num_tokens": 228294.0,
"step": 14
},
{
"entropy": 0.7414887696504593,
"epoch": 0.05639097744360902,
"grad_norm": 0.11246141791343689,
"learning_rate": 0.0002,
"loss": 0.7355879545211792,
"mean_token_accuracy": 0.7314187586307526,
"num_tokens": 244681.0,
"step": 15
},
{
"entropy": 0.7010335773229599,
"epoch": 0.06015037593984962,
"grad_norm": 0.11098679155111313,
"learning_rate": 0.0002,
"loss": 0.6920604109764099,
"mean_token_accuracy": 0.7372281551361084,
"num_tokens": 261053.0,
"step": 16
},
{
"entropy": 0.6938799321651459,
"epoch": 0.06390977443609022,
"grad_norm": 0.08114200830459595,
"learning_rate": 0.0002,
"loss": 0.6897510886192322,
"mean_token_accuracy": 0.7408226281404495,
"num_tokens": 277338.0,
"step": 17
},
{
"entropy": 0.6835978478193283,
"epoch": 0.06766917293233082,
"grad_norm": 0.08077364414930344,
"learning_rate": 0.0002,
"loss": 0.6768285632133484,
"mean_token_accuracy": 0.740087628364563,
"num_tokens": 293709.0,
"step": 18
},
{
"entropy": 0.6589517742395401,
"epoch": 0.07142857142857142,
"grad_norm": 0.0879955068230629,
"learning_rate": 0.0002,
"loss": 0.65667724609375,
"mean_token_accuracy": 0.7443644404411316,
"num_tokens": 310128.0,
"step": 19
},
{
"entropy": 0.6506444960832596,
"epoch": 0.07518796992481203,
"grad_norm": 0.080411896109581,
"learning_rate": 0.0002,
"loss": 0.641387403011322,
"mean_token_accuracy": 0.7495939880609512,
"num_tokens": 326607.0,
"step": 20
},
{
"entropy": 0.6619953960180283,
"epoch": 0.07894736842105263,
"grad_norm": 0.0845642164349556,
"learning_rate": 0.0002,
"loss": 0.6475294232368469,
"mean_token_accuracy": 0.7457321733236313,
"num_tokens": 342774.0,
"step": 21
},
{
"entropy": 0.6577392071485519,
"epoch": 0.08270676691729323,
"grad_norm": 0.07965292036533356,
"learning_rate": 0.0002,
"loss": 0.6407521367073059,
"mean_token_accuracy": 0.7490587830543518,
"num_tokens": 359099.0,
"step": 22
},
{
"entropy": 0.6155381500720978,
"epoch": 0.08646616541353383,
"grad_norm": 0.07591664046049118,
"learning_rate": 0.0002,
"loss": 0.6092519760131836,
"mean_token_accuracy": 0.7603109776973724,
"num_tokens": 375179.0,
"step": 23
},
{
"entropy": 0.5885609835386276,
"epoch": 0.09022556390977443,
"grad_norm": 0.06627360731363297,
"learning_rate": 0.0002,
"loss": 0.5951059460639954,
"mean_token_accuracy": 0.7678095996379852,
"num_tokens": 391354.0,
"step": 24
},
{
"entropy": 0.5992416590452194,
"epoch": 0.09398496240601503,
"grad_norm": 0.08137614279985428,
"learning_rate": 0.0002,
"loss": 0.6067847013473511,
"mean_token_accuracy": 0.7620100975036621,
"num_tokens": 407719.0,
"step": 25
},
{
"entropy": 0.6116904020309448,
"epoch": 0.09774436090225563,
"grad_norm": 0.06891811639070511,
"learning_rate": 0.0002,
"loss": 0.6175057888031006,
"mean_token_accuracy": 0.7556122690439224,
"num_tokens": 424041.0,
"step": 26
},
{
"entropy": 0.6106788814067841,
"epoch": 0.10150375939849623,
"grad_norm": 0.059570278972387314,
"learning_rate": 0.0002,
"loss": 0.5937588214874268,
"mean_token_accuracy": 0.7666491121053696,
"num_tokens": 440295.0,
"step": 27
},
{
"entropy": 0.6181164085865021,
"epoch": 0.10526315789473684,
"grad_norm": 0.07394946366548538,
"learning_rate": 0.0002,
"loss": 0.6043965220451355,
"mean_token_accuracy": 0.7635089755058289,
"num_tokens": 456614.0,
"step": 28
},
{
"entropy": 0.6283685266971588,
"epoch": 0.10902255639097744,
"grad_norm": 0.07618279755115509,
"learning_rate": 0.0002,
"loss": 0.6195181608200073,
"mean_token_accuracy": 0.752281054854393,
"num_tokens": 472965.0,
"step": 29
},
{
"entropy": 0.5851932466030121,
"epoch": 0.11278195488721804,
"grad_norm": 0.05518079921603203,
"learning_rate": 0.0002,
"loss": 0.5881266593933105,
"mean_token_accuracy": 0.7650770843029022,
"num_tokens": 489391.0,
"step": 30
},
{
"entropy": 0.5895522385835648,
"epoch": 0.11654135338345864,
"grad_norm": 0.06688102334737778,
"learning_rate": 0.0002,
"loss": 0.6028741002082825,
"mean_token_accuracy": 0.7601553350687027,
"num_tokens": 505837.0,
"step": 31
},
{
"entropy": 0.5878616869449615,
"epoch": 0.12030075187969924,
"grad_norm": 0.059780046343803406,
"learning_rate": 0.0002,
"loss": 0.6033408045768738,
"mean_token_accuracy": 0.7582006454467773,
"num_tokens": 522243.0,
"step": 32
},
{
"entropy": 0.5838498622179031,
"epoch": 0.12406015037593984,
"grad_norm": 0.04929976165294647,
"learning_rate": 0.0002,
"loss": 0.5896713137626648,
"mean_token_accuracy": 0.761729434132576,
"num_tokens": 538731.0,
"step": 33
},
{
"entropy": 0.5691559016704559,
"epoch": 0.12781954887218044,
"grad_norm": 0.06266291439533234,
"learning_rate": 0.0002,
"loss": 0.5734342932701111,
"mean_token_accuracy": 0.7672057747840881,
"num_tokens": 554848.0,
"step": 34
},
{
"entropy": 0.5915598571300507,
"epoch": 0.13157894736842105,
"grad_norm": 0.06152564287185669,
"learning_rate": 0.0002,
"loss": 0.5912453532218933,
"mean_token_accuracy": 0.7633904218673706,
"num_tokens": 571057.0,
"step": 35
},
{
"entropy": 0.597556471824646,
"epoch": 0.13533834586466165,
"grad_norm": 0.04998990520834923,
"learning_rate": 0.0002,
"loss": 0.5882090330123901,
"mean_token_accuracy": 0.7643049657344818,
"num_tokens": 587326.0,
"step": 36
},
{
"entropy": 0.5905885845422745,
"epoch": 0.13909774436090225,
"grad_norm": 0.049017250537872314,
"learning_rate": 0.0002,
"loss": 0.5855776071548462,
"mean_token_accuracy": 0.7655442655086517,
"num_tokens": 603538.0,
"step": 37
},
{
"entropy": 0.586976170539856,
"epoch": 0.14285714285714285,
"grad_norm": 0.046413078904151917,
"learning_rate": 0.0002,
"loss": 0.5790608525276184,
"mean_token_accuracy": 0.767949789762497,
"num_tokens": 619734.0,
"step": 38
},
{
"entropy": 0.5844197869300842,
"epoch": 0.14661654135338345,
"grad_norm": 0.04495161026716232,
"learning_rate": 0.0002,
"loss": 0.5842206478118896,
"mean_token_accuracy": 0.7648505717515945,
"num_tokens": 636104.0,
"step": 39
},
{
"entropy": 0.5523269921541214,
"epoch": 0.15037593984962405,
"grad_norm": 0.04233352467417717,
"learning_rate": 0.0002,
"loss": 0.5523208975791931,
"mean_token_accuracy": 0.7776841074228287,
"num_tokens": 652478.0,
"step": 40
},
{
"entropy": 0.569878563284874,
"epoch": 0.15413533834586465,
"grad_norm": 0.04850724712014198,
"learning_rate": 0.0002,
"loss": 0.5725483298301697,
"mean_token_accuracy": 0.7687844336032867,
"num_tokens": 669008.0,
"step": 41
},
{
"entropy": 0.5655312091112137,
"epoch": 0.15789473684210525,
"grad_norm": 0.04192538931965828,
"learning_rate": 0.0002,
"loss": 0.5679923892021179,
"mean_token_accuracy": 0.7717834413051605,
"num_tokens": 685165.0,
"step": 42
},
{
"entropy": 0.5601242333650589,
"epoch": 0.16165413533834586,
"grad_norm": 0.042079195380210876,
"learning_rate": 0.0002,
"loss": 0.5594381093978882,
"mean_token_accuracy": 0.7740506827831268,
"num_tokens": 701529.0,
"step": 43
},
{
"entropy": 0.575413703918457,
"epoch": 0.16541353383458646,
"grad_norm": 0.04416325315833092,
"learning_rate": 0.0002,
"loss": 0.5747635364532471,
"mean_token_accuracy": 0.7721781879663467,
"num_tokens": 717922.0,
"step": 44
},
{
"entropy": 0.5668691843748093,
"epoch": 0.16917293233082706,
"grad_norm": 0.05360032618045807,
"learning_rate": 0.0002,
"loss": 0.5617860555648804,
"mean_token_accuracy": 0.7762805074453354,
"num_tokens": 733933.0,
"step": 45
},
{
"entropy": 0.5761540979146957,
"epoch": 0.17293233082706766,
"grad_norm": 0.040452998131513596,
"learning_rate": 0.0002,
"loss": 0.5704891085624695,
"mean_token_accuracy": 0.7709734439849854,
"num_tokens": 750555.0,
"step": 46
},
{
"entropy": 0.5610938370227814,
"epoch": 0.17669172932330826,
"grad_norm": 0.04221005737781525,
"learning_rate": 0.0002,
"loss": 0.5613417029380798,
"mean_token_accuracy": 0.7761952430009842,
"num_tokens": 766693.0,
"step": 47
},
{
"entropy": 0.5707991421222687,
"epoch": 0.18045112781954886,
"grad_norm": 0.03976718708872795,
"learning_rate": 0.0002,
"loss": 0.5678077936172485,
"mean_token_accuracy": 0.7737486809492111,
"num_tokens": 783330.0,
"step": 48
},
{
"entropy": 0.5475099235773087,
"epoch": 0.18421052631578946,
"grad_norm": 0.04141751676797867,
"learning_rate": 0.0002,
"loss": 0.5536777973175049,
"mean_token_accuracy": 0.7761508077383041,
"num_tokens": 799528.0,
"step": 49
},
{
"entropy": 0.5602568089962006,
"epoch": 0.18796992481203006,
"grad_norm": 0.04497222229838371,
"learning_rate": 0.0002,
"loss": 0.5695174336433411,
"mean_token_accuracy": 0.7716410309076309,
"num_tokens": 815957.0,
"step": 50
},
{
"entropy": 0.5643552988767624,
"epoch": 0.19172932330827067,
"grad_norm": 0.041956499218940735,
"learning_rate": 0.0002,
"loss": 0.5748574733734131,
"mean_token_accuracy": 0.7680526673793793,
"num_tokens": 832365.0,
"step": 51
},
{
"entropy": 0.5510173141956329,
"epoch": 0.19548872180451127,
"grad_norm": 0.04074239730834961,
"learning_rate": 0.0002,
"loss": 0.5555428266525269,
"mean_token_accuracy": 0.775487020611763,
"num_tokens": 848532.0,
"step": 52
},
{
"entropy": 0.5738573223352432,
"epoch": 0.19924812030075187,
"grad_norm": 0.036227982491254807,
"learning_rate": 0.0002,
"loss": 0.5651305913925171,
"mean_token_accuracy": 0.7725107222795486,
"num_tokens": 864646.0,
"step": 53
},
{
"entropy": 0.5808417797088623,
"epoch": 0.20300751879699247,
"grad_norm": 0.03816494345664978,
"learning_rate": 0.0002,
"loss": 0.5638910531997681,
"mean_token_accuracy": 0.7741686254739761,
"num_tokens": 881239.0,
"step": 54
},
{
"entropy": 0.5693863034248352,
"epoch": 0.20676691729323307,
"grad_norm": 0.035037554800510406,
"learning_rate": 0.0002,
"loss": 0.5701916813850403,
"mean_token_accuracy": 0.7687424123287201,
"num_tokens": 897601.0,
"step": 55
},
{
"entropy": 0.5595564395189285,
"epoch": 0.21052631578947367,
"grad_norm": 0.038008302450180054,
"learning_rate": 0.0002,
"loss": 0.5662519931793213,
"mean_token_accuracy": 0.7714412808418274,
"num_tokens": 914184.0,
"step": 56
},
{
"entropy": 0.5745149552822113,
"epoch": 0.21428571428571427,
"grad_norm": 0.03566848114132881,
"learning_rate": 0.0002,
"loss": 0.5779574513435364,
"mean_token_accuracy": 0.7686354070901871,
"num_tokens": 930380.0,
"step": 57
},
{
"entropy": 0.5675694793462753,
"epoch": 0.21804511278195488,
"grad_norm": 0.03368304297327995,
"learning_rate": 0.0002,
"loss": 0.5728892087936401,
"mean_token_accuracy": 0.7670125216245651,
"num_tokens": 946749.0,
"step": 58
},
{
"entropy": 0.5651668012142181,
"epoch": 0.22180451127819548,
"grad_norm": 0.035859547555446625,
"learning_rate": 0.0002,
"loss": 0.5706139802932739,
"mean_token_accuracy": 0.7697967290878296,
"num_tokens": 963053.0,
"step": 59
},
{
"entropy": 0.5670004636049271,
"epoch": 0.22556390977443608,
"grad_norm": 0.03998008742928505,
"learning_rate": 0.0002,
"loss": 0.5656613111495972,
"mean_token_accuracy": 0.7728914767503738,
"num_tokens": 979368.0,
"step": 60
},
{
"entropy": 0.5696548968553543,
"epoch": 0.22932330827067668,
"grad_norm": 0.04078423231840134,
"learning_rate": 0.0002,
"loss": 0.5716832280158997,
"mean_token_accuracy": 0.7699559330940247,
"num_tokens": 995406.0,
"step": 61
},
{
"entropy": 0.590179905295372,
"epoch": 0.23308270676691728,
"grad_norm": 0.0332336388528347,
"learning_rate": 0.0002,
"loss": 0.5876976847648621,
"mean_token_accuracy": 0.7626538276672363,
"num_tokens": 1011804.0,
"step": 62
},
{
"entropy": 0.5567612648010254,
"epoch": 0.23684210526315788,
"grad_norm": 0.033585552126169205,
"learning_rate": 0.0002,
"loss": 0.552665650844574,
"mean_token_accuracy": 0.7773807644844055,
"num_tokens": 1027984.0,
"step": 63
},
{
"entropy": 0.5729009807109833,
"epoch": 0.24060150375939848,
"grad_norm": 0.037177689373493195,
"learning_rate": 0.0002,
"loss": 0.5675500631332397,
"mean_token_accuracy": 0.7715246975421906,
"num_tokens": 1044274.0,
"step": 64
},
{
"entropy": 0.5565147399902344,
"epoch": 0.24436090225563908,
"grad_norm": 0.034301500767469406,
"learning_rate": 0.0002,
"loss": 0.5531203150749207,
"mean_token_accuracy": 0.7778400331735611,
"num_tokens": 1060650.0,
"step": 65
},
{
"entropy": 0.5595405846834183,
"epoch": 0.24812030075187969,
"grad_norm": 0.032111674547195435,
"learning_rate": 0.0002,
"loss": 0.5613226294517517,
"mean_token_accuracy": 0.7748188674449921,
"num_tokens": 1077082.0,
"step": 66
},
{
"entropy": 0.5684429109096527,
"epoch": 0.2518796992481203,
"grad_norm": 0.036634527146816254,
"learning_rate": 0.0002,
"loss": 0.5726494789123535,
"mean_token_accuracy": 0.7709641754627228,
"num_tokens": 1093328.0,
"step": 67
},
{
"entropy": 0.5331402271986008,
"epoch": 0.2556390977443609,
"grad_norm": 0.03533982113003731,
"learning_rate": 0.0002,
"loss": 0.5389207601547241,
"mean_token_accuracy": 0.7816744297742844,
"num_tokens": 1109550.0,
"step": 68
},
{
"entropy": 0.5601552575826645,
"epoch": 0.2593984962406015,
"grad_norm": 0.03249680623412132,
"learning_rate": 0.0002,
"loss": 0.5670143961906433,
"mean_token_accuracy": 0.7690982818603516,
"num_tokens": 1125670.0,
"step": 69
},
{
"entropy": 0.5491845458745956,
"epoch": 0.2631578947368421,
"grad_norm": 0.03275011479854584,
"learning_rate": 0.0002,
"loss": 0.5448943972587585,
"mean_token_accuracy": 0.7807547152042389,
"num_tokens": 1141797.0,
"step": 70
},
{
"entropy": 0.5585113912820816,
"epoch": 0.2669172932330827,
"grad_norm": 0.03664859011769295,
"learning_rate": 0.0002,
"loss": 0.560217022895813,
"mean_token_accuracy": 0.7755073606967926,
"num_tokens": 1158252.0,
"step": 71
},
{
"entropy": 0.5534943342208862,
"epoch": 0.2706766917293233,
"grad_norm": 0.03374176472425461,
"learning_rate": 0.0002,
"loss": 0.5520960688591003,
"mean_token_accuracy": 0.7764160335063934,
"num_tokens": 1174369.0,
"step": 72
},
{
"entropy": 0.5600117444992065,
"epoch": 0.2744360902255639,
"grad_norm": 0.033763986080884933,
"learning_rate": 0.0002,
"loss": 0.5588683485984802,
"mean_token_accuracy": 0.7761770337820053,
"num_tokens": 1190928.0,
"step": 73
},
{
"entropy": 0.5625056624412537,
"epoch": 0.2781954887218045,
"grad_norm": 0.034332193434238434,
"learning_rate": 0.0002,
"loss": 0.5600336790084839,
"mean_token_accuracy": 0.7748808860778809,
"num_tokens": 1207372.0,
"step": 74
},
{
"entropy": 0.5520483404397964,
"epoch": 0.2819548872180451,
"grad_norm": 0.03450694680213928,
"learning_rate": 0.0002,
"loss": 0.5558054447174072,
"mean_token_accuracy": 0.7750240415334702,
"num_tokens": 1223643.0,
"step": 75
},
{
"entropy": 0.5441252887248993,
"epoch": 0.2857142857142857,
"grad_norm": 0.03436208888888359,
"learning_rate": 0.0002,
"loss": 0.5533716678619385,
"mean_token_accuracy": 0.7759858965873718,
"num_tokens": 1239688.0,
"step": 76
},
{
"entropy": 0.5603705495595932,
"epoch": 0.2894736842105263,
"grad_norm": 0.03493620082736015,
"learning_rate": 0.0002,
"loss": 0.5694956183433533,
"mean_token_accuracy": 0.7717721164226532,
"num_tokens": 1255884.0,
"step": 77
},
{
"entropy": 0.5612094402313232,
"epoch": 0.2932330827067669,
"grad_norm": 0.03372187912464142,
"learning_rate": 0.0002,
"loss": 0.5608274936676025,
"mean_token_accuracy": 0.7747389078140259,
"num_tokens": 1271939.0,
"step": 78
},
{
"entropy": 0.5706307291984558,
"epoch": 0.29699248120300753,
"grad_norm": 0.0331907719373703,
"learning_rate": 0.0002,
"loss": 0.5624843239784241,
"mean_token_accuracy": 0.7734071314334869,
"num_tokens": 1288328.0,
"step": 79
},
{
"entropy": 0.5670299082994461,
"epoch": 0.3007518796992481,
"grad_norm": 0.033556245267391205,
"learning_rate": 0.0002,
"loss": 0.560691237449646,
"mean_token_accuracy": 0.7734449654817581,
"num_tokens": 1304760.0,
"step": 80
},
{
"entropy": 0.5619105398654938,
"epoch": 0.30451127819548873,
"grad_norm": 0.034520749002695084,
"learning_rate": 0.0002,
"loss": 0.5578286647796631,
"mean_token_accuracy": 0.774708941578865,
"num_tokens": 1321100.0,
"step": 81
},
{
"entropy": 0.5670763552188873,
"epoch": 0.3082706766917293,
"grad_norm": 0.04056672751903534,
"learning_rate": 0.0002,
"loss": 0.5737652778625488,
"mean_token_accuracy": 0.76849165558815,
"num_tokens": 1337796.0,
"step": 82
},
{
"entropy": 0.5314440876245499,
"epoch": 0.31203007518796994,
"grad_norm": 0.03262212499976158,
"learning_rate": 0.0002,
"loss": 0.535086989402771,
"mean_token_accuracy": 0.7845727354288101,
"num_tokens": 1354331.0,
"step": 83
},
{
"entropy": 0.5603013932704926,
"epoch": 0.3157894736842105,
"grad_norm": 0.036167021840810776,
"learning_rate": 0.0002,
"loss": 0.5675747394561768,
"mean_token_accuracy": 0.771581381559372,
"num_tokens": 1370543.0,
"step": 84
},
{
"entropy": 0.5526834577322006,
"epoch": 0.31954887218045114,
"grad_norm": 0.03807472810149193,
"learning_rate": 0.0002,
"loss": 0.5507928729057312,
"mean_token_accuracy": 0.7803521752357483,
"num_tokens": 1386874.0,
"step": 85
},
{
"entropy": 0.5730793476104736,
"epoch": 0.3233082706766917,
"grad_norm": 0.03474927321076393,
"learning_rate": 0.0002,
"loss": 0.5660271644592285,
"mean_token_accuracy": 0.7727594673633575,
"num_tokens": 1403110.0,
"step": 86
},
{
"entropy": 0.563334196805954,
"epoch": 0.32706766917293234,
"grad_norm": 0.03167711943387985,
"learning_rate": 0.0002,
"loss": 0.56499844789505,
"mean_token_accuracy": 0.7736751586198807,
"num_tokens": 1419614.0,
"step": 87
},
{
"entropy": 0.5451017022132874,
"epoch": 0.3308270676691729,
"grad_norm": 0.03233160078525543,
"learning_rate": 0.0002,
"loss": 0.5535646677017212,
"mean_token_accuracy": 0.7740109711885452,
"num_tokens": 1436028.0,
"step": 88
},
{
"entropy": 0.5493156313896179,
"epoch": 0.33458646616541354,
"grad_norm": 0.039253026247024536,
"learning_rate": 0.0002,
"loss": 0.5615313649177551,
"mean_token_accuracy": 0.7725273966789246,
"num_tokens": 1452644.0,
"step": 89
},
{
"entropy": 0.5737167149782181,
"epoch": 0.3383458646616541,
"grad_norm": 0.032968465238809586,
"learning_rate": 0.0002,
"loss": 0.5743820667266846,
"mean_token_accuracy": 0.7698662877082825,
"num_tokens": 1469108.0,
"step": 90
},
{
"entropy": 0.5741334408521652,
"epoch": 0.34210526315789475,
"grad_norm": 0.040047451853752136,
"learning_rate": 0.0002,
"loss": 0.5673686265945435,
"mean_token_accuracy": 0.7704142928123474,
"num_tokens": 1485445.0,
"step": 91
},
{
"entropy": 0.5617086589336395,
"epoch": 0.3458646616541353,
"grad_norm": 0.03181539848446846,
"learning_rate": 0.0002,
"loss": 0.5534920692443848,
"mean_token_accuracy": 0.7758883982896805,
"num_tokens": 1501801.0,
"step": 92
},
{
"entropy": 0.5597693920135498,
"epoch": 0.34962406015037595,
"grad_norm": 0.03365252912044525,
"learning_rate": 0.0002,
"loss": 0.5625807046890259,
"mean_token_accuracy": 0.7725406587123871,
"num_tokens": 1518047.0,
"step": 93
},
{
"entropy": 0.5496240109205246,
"epoch": 0.3533834586466165,
"grad_norm": 0.0320061519742012,
"learning_rate": 0.0002,
"loss": 0.5572867393493652,
"mean_token_accuracy": 0.7759815156459808,
"num_tokens": 1534447.0,
"step": 94
},
{
"entropy": 0.5630564987659454,
"epoch": 0.35714285714285715,
"grad_norm": 0.03503059223294258,
"learning_rate": 0.0002,
"loss": 0.5757870674133301,
"mean_token_accuracy": 0.766523465514183,
"num_tokens": 1550660.0,
"step": 95
},
{
"entropy": 0.5605316013097763,
"epoch": 0.3609022556390977,
"grad_norm": 0.032678134739398956,
"learning_rate": 0.0002,
"loss": 0.5634536743164062,
"mean_token_accuracy": 0.7716304063796997,
"num_tokens": 1566883.0,
"step": 96
},
{
"entropy": 0.5838266015052795,
"epoch": 0.36466165413533835,
"grad_norm": 0.030517758801579475,
"learning_rate": 0.0002,
"loss": 0.5759112238883972,
"mean_token_accuracy": 0.7689571380615234,
"num_tokens": 1583221.0,
"step": 97
},
{
"entropy": 0.575135201215744,
"epoch": 0.3684210526315789,
"grad_norm": 0.03620682284235954,
"learning_rate": 0.0002,
"loss": 0.5637581944465637,
"mean_token_accuracy": 0.7740969359874725,
"num_tokens": 1599392.0,
"step": 98
},
{
"entropy": 0.5724876075983047,
"epoch": 0.37218045112781956,
"grad_norm": 0.029337450861930847,
"learning_rate": 0.0002,
"loss": 0.5643174052238464,
"mean_token_accuracy": 0.77228944003582,
"num_tokens": 1615899.0,
"step": 99
},
{
"entropy": 0.5502088665962219,
"epoch": 0.37593984962406013,
"grad_norm": 0.03381618484854698,
"learning_rate": 0.0002,
"loss": 0.5598064661026001,
"mean_token_accuracy": 0.7747711390256882,
"num_tokens": 1632274.0,
"step": 100
},
{
"entropy": 0.5598712712526321,
"epoch": 0.37969924812030076,
"grad_norm": 0.03598952665925026,
"learning_rate": 0.0002,
"loss": 0.5719908475875854,
"mean_token_accuracy": 0.7700261324644089,
"num_tokens": 1648688.0,
"step": 101
},
{
"entropy": 0.5630699545145035,
"epoch": 0.38345864661654133,
"grad_norm": 0.031423430889844894,
"learning_rate": 0.0002,
"loss": 0.565830409526825,
"mean_token_accuracy": 0.7715611904859543,
"num_tokens": 1665258.0,
"step": 102
},
{
"entropy": 0.5845702290534973,
"epoch": 0.38721804511278196,
"grad_norm": 0.02941996045410633,
"learning_rate": 0.0002,
"loss": 0.5816816687583923,
"mean_token_accuracy": 0.7648696899414062,
"num_tokens": 1681639.0,
"step": 103
},
{
"entropy": 0.57722607254982,
"epoch": 0.39097744360902253,
"grad_norm": 0.034051019698381424,
"learning_rate": 0.0002,
"loss": 0.5756963491439819,
"mean_token_accuracy": 0.7672083526849747,
"num_tokens": 1698010.0,
"step": 104
},
{
"entropy": 0.5672426074743271,
"epoch": 0.39473684210526316,
"grad_norm": 0.03516025468707085,
"learning_rate": 0.0002,
"loss": 0.5597167015075684,
"mean_token_accuracy": 0.7757037431001663,
"num_tokens": 1714351.0,
"step": 105
},
{
"entropy": 0.5414413064718246,
"epoch": 0.39849624060150374,
"grad_norm": 0.03341100364923477,
"learning_rate": 0.0002,
"loss": 0.5480563640594482,
"mean_token_accuracy": 0.7781668901443481,
"num_tokens": 1730536.0,
"step": 106
},
{
"entropy": 0.5462717562913895,
"epoch": 0.40225563909774437,
"grad_norm": 0.03385477513074875,
"learning_rate": 0.0002,
"loss": 0.5512043833732605,
"mean_token_accuracy": 0.7787721008062363,
"num_tokens": 1746896.0,
"step": 107
},
{
"entropy": 0.5501613169908524,
"epoch": 0.40601503759398494,
"grad_norm": 0.035874005407094955,
"learning_rate": 0.0002,
"loss": 0.561366081237793,
"mean_token_accuracy": 0.7721621990203857,
"num_tokens": 1763235.0,
"step": 108
},
{
"entropy": 0.5445860922336578,
"epoch": 0.40977443609022557,
"grad_norm": 0.030480582267045975,
"learning_rate": 0.0002,
"loss": 0.5476114153862,
"mean_token_accuracy": 0.7789607793092728,
"num_tokens": 1779550.0,
"step": 109
},
{
"entropy": 0.5542454719543457,
"epoch": 0.41353383458646614,
"grad_norm": 0.0321124792098999,
"learning_rate": 0.0002,
"loss": 0.5565616488456726,
"mean_token_accuracy": 0.7755739092826843,
"num_tokens": 1795761.0,
"step": 110
},
{
"entropy": 0.5581567585468292,
"epoch": 0.41729323308270677,
"grad_norm": 0.0360286608338356,
"learning_rate": 0.0002,
"loss": 0.5496086478233337,
"mean_token_accuracy": 0.775969922542572,
"num_tokens": 1811759.0,
"step": 111
},
{
"entropy": 0.549008384346962,
"epoch": 0.42105263157894735,
"grad_norm": 0.029972167685627937,
"learning_rate": 0.0002,
"loss": 0.5420917272567749,
"mean_token_accuracy": 0.7787465006113052,
"num_tokens": 1827840.0,
"step": 112
},
{
"entropy": 0.5631350576877594,
"epoch": 0.424812030075188,
"grad_norm": 0.028662627562880516,
"learning_rate": 0.0002,
"loss": 0.5532713532447815,
"mean_token_accuracy": 0.7749679088592529,
"num_tokens": 1844167.0,
"step": 113
},
{
"entropy": 0.5277586579322815,
"epoch": 0.42857142857142855,
"grad_norm": 0.03287903964519501,
"learning_rate": 0.0002,
"loss": 0.5350267887115479,
"mean_token_accuracy": 0.7830938249826431,
"num_tokens": 1860530.0,
"step": 114
},
{
"entropy": 0.5497393310070038,
"epoch": 0.4323308270676692,
"grad_norm": 0.03770268335938454,
"learning_rate": 0.0002,
"loss": 0.5615973472595215,
"mean_token_accuracy": 0.7720151543617249,
"num_tokens": 1876970.0,
"step": 115
},
{
"entropy": 0.5729877650737762,
"epoch": 0.43609022556390975,
"grad_norm": 0.033978965133428574,
"learning_rate": 0.0002,
"loss": 0.5777981877326965,
"mean_token_accuracy": 0.7680597454309464,
"num_tokens": 1893575.0,
"step": 116
},
{
"entropy": 0.5504349619150162,
"epoch": 0.4398496240601504,
"grad_norm": 0.03185052052140236,
"learning_rate": 0.0002,
"loss": 0.5459255576133728,
"mean_token_accuracy": 0.7792946100234985,
"num_tokens": 1909809.0,
"step": 117
},
{
"entropy": 0.5565227419137955,
"epoch": 0.44360902255639095,
"grad_norm": 0.028807369992136955,
"learning_rate": 0.0002,
"loss": 0.551781177520752,
"mean_token_accuracy": 0.7776060104370117,
"num_tokens": 1925981.0,
"step": 118
},
{
"entropy": 0.5547512769699097,
"epoch": 0.4473684210526316,
"grad_norm": 0.0315021388232708,
"learning_rate": 0.0002,
"loss": 0.5484083890914917,
"mean_token_accuracy": 0.7798104882240295,
"num_tokens": 1942636.0,
"step": 119
},
{
"entropy": 0.5606597065925598,
"epoch": 0.45112781954887216,
"grad_norm": 0.02974752150475979,
"learning_rate": 0.0002,
"loss": 0.5633252263069153,
"mean_token_accuracy": 0.7710647433996201,
"num_tokens": 1959143.0,
"step": 120
},
{
"entropy": 0.5621981024742126,
"epoch": 0.4548872180451128,
"grad_norm": 0.03396495804190636,
"learning_rate": 0.0002,
"loss": 0.5700369477272034,
"mean_token_accuracy": 0.7708666622638702,
"num_tokens": 1975709.0,
"step": 121
},
{
"entropy": 0.5484206080436707,
"epoch": 0.45864661654135336,
"grad_norm": 0.03273981064558029,
"learning_rate": 0.0002,
"loss": 0.5635251998901367,
"mean_token_accuracy": 0.7709483653306961,
"num_tokens": 1992105.0,
"step": 122
},
{
"entropy": 0.5378261581063271,
"epoch": 0.462406015037594,
"grad_norm": 0.03221985325217247,
"learning_rate": 0.0002,
"loss": 0.5449070334434509,
"mean_token_accuracy": 0.7815380096435547,
"num_tokens": 2008467.0,
"step": 123
},
{
"entropy": 0.5606098920106888,
"epoch": 0.46616541353383456,
"grad_norm": 0.03314457833766937,
"learning_rate": 0.0002,
"loss": 0.563465416431427,
"mean_token_accuracy": 0.7709829658269882,
"num_tokens": 2024710.0,
"step": 124
},
{
"entropy": 0.5656619518995285,
"epoch": 0.4699248120300752,
"grad_norm": 0.03133262321352959,
"learning_rate": 0.0002,
"loss": 0.5610048174858093,
"mean_token_accuracy": 0.7718383222818375,
"num_tokens": 2040853.0,
"step": 125
},
{
"entropy": 0.5635328441858292,
"epoch": 0.47368421052631576,
"grad_norm": 0.030308736488223076,
"learning_rate": 0.0002,
"loss": 0.5604254007339478,
"mean_token_accuracy": 0.7731337696313858,
"num_tokens": 2057006.0,
"step": 126
},
{
"entropy": 0.57016222178936,
"epoch": 0.4774436090225564,
"grad_norm": 0.03194103017449379,
"learning_rate": 0.0002,
"loss": 0.5620253086090088,
"mean_token_accuracy": 0.7717723101377487,
"num_tokens": 2073332.0,
"step": 127
},
{
"entropy": 0.5490193665027618,
"epoch": 0.48120300751879697,
"grad_norm": 0.02910369262099266,
"learning_rate": 0.0002,
"loss": 0.5538103580474854,
"mean_token_accuracy": 0.7780880033969879,
"num_tokens": 2089495.0,
"step": 128
},
{
"entropy": 0.5662434548139572,
"epoch": 0.4849624060150376,
"grad_norm": 0.029468489810824394,
"learning_rate": 0.0002,
"loss": 0.5681107044219971,
"mean_token_accuracy": 0.7689958661794662,
"num_tokens": 2106114.0,
"step": 129
},
{
"entropy": 0.5431465953588486,
"epoch": 0.48872180451127817,
"grad_norm": 0.03223656490445137,
"learning_rate": 0.0002,
"loss": 0.5507116317749023,
"mean_token_accuracy": 0.7764191329479218,
"num_tokens": 2122567.0,
"step": 130
},
{
"entropy": 0.5563855171203613,
"epoch": 0.4924812030075188,
"grad_norm": 0.028281886130571365,
"learning_rate": 0.0002,
"loss": 0.5583161115646362,
"mean_token_accuracy": 0.7736326307058334,
"num_tokens": 2139083.0,
"step": 131
},
{
"entropy": 0.5674906224012375,
"epoch": 0.49624060150375937,
"grad_norm": 0.02878589555621147,
"learning_rate": 0.0002,
"loss": 0.564136803150177,
"mean_token_accuracy": 0.7724441289901733,
"num_tokens": 2155542.0,
"step": 132
},
{
"entropy": 0.5472439229488373,
"epoch": 0.5,
"grad_norm": 0.029321735724806786,
"learning_rate": 0.0002,
"loss": 0.5442805290222168,
"mean_token_accuracy": 0.7798047512769699,
"num_tokens": 2171801.0,
"step": 133
},
{
"entropy": 0.565643772482872,
"epoch": 0.5037593984962406,
"grad_norm": 0.028855223208665848,
"learning_rate": 0.0002,
"loss": 0.5595606565475464,
"mean_token_accuracy": 0.774070993065834,
"num_tokens": 2188167.0,
"step": 134
},
{
"entropy": 0.5532195568084717,
"epoch": 0.5075187969924813,
"grad_norm": 0.03198866546154022,
"learning_rate": 0.0002,
"loss": 0.5570374131202698,
"mean_token_accuracy": 0.7740880846977234,
"num_tokens": 2204470.0,
"step": 135
},
{
"entropy": 0.5408245772123337,
"epoch": 0.5112781954887218,
"grad_norm": 0.030379725620150566,
"learning_rate": 0.0002,
"loss": 0.5514412522315979,
"mean_token_accuracy": 0.7769049108028412,
"num_tokens": 2220739.0,
"step": 136
},
{
"entropy": 0.5346933305263519,
"epoch": 0.5150375939849624,
"grad_norm": 0.03085665963590145,
"learning_rate": 0.0002,
"loss": 0.5364114046096802,
"mean_token_accuracy": 0.7843690663576126,
"num_tokens": 2237147.0,
"step": 137
},
{
"entropy": 0.5493077784776688,
"epoch": 0.518796992481203,
"grad_norm": 0.02923487313091755,
"learning_rate": 0.0002,
"loss": 0.5560771822929382,
"mean_token_accuracy": 0.7737279832363129,
"num_tokens": 2253415.0,
"step": 138
},
{
"entropy": 0.5472232103347778,
"epoch": 0.5225563909774437,
"grad_norm": 0.031521063297986984,
"learning_rate": 0.0002,
"loss": 0.5497399568557739,
"mean_token_accuracy": 0.777409166097641,
"num_tokens": 2269589.0,
"step": 139
},
{
"entropy": 0.5515349954366684,
"epoch": 0.5263157894736842,
"grad_norm": 0.02956547960639,
"learning_rate": 0.0002,
"loss": 0.5464341640472412,
"mean_token_accuracy": 0.7794498354196548,
"num_tokens": 2285953.0,
"step": 140
},
{
"entropy": 0.5558236241340637,
"epoch": 0.5300751879699248,
"grad_norm": 0.02974775619804859,
"learning_rate": 0.0002,
"loss": 0.5577874779701233,
"mean_token_accuracy": 0.7712955176830292,
"num_tokens": 2302120.0,
"step": 141
},
{
"entropy": 0.5856722742319107,
"epoch": 0.5338345864661654,
"grad_norm": 0.03199459984898567,
"learning_rate": 0.0002,
"loss": 0.5856820940971375,
"mean_token_accuracy": 0.7616758495569229,
"num_tokens": 2318555.0,
"step": 142
},
{
"entropy": 0.5560419261455536,
"epoch": 0.5375939849624061,
"grad_norm": 0.03210260346531868,
"learning_rate": 0.0002,
"loss": 0.5606544613838196,
"mean_token_accuracy": 0.7734680622816086,
"num_tokens": 2334764.0,
"step": 143
},
{
"entropy": 0.5652720183134079,
"epoch": 0.5413533834586466,
"grad_norm": 0.025965852662920952,
"learning_rate": 0.0002,
"loss": 0.562166690826416,
"mean_token_accuracy": 0.77190200984478,
"num_tokens": 2351198.0,
"step": 144
},
{
"entropy": 0.531855046749115,
"epoch": 0.5451127819548872,
"grad_norm": 0.029480863362550735,
"learning_rate": 0.0002,
"loss": 0.5261865854263306,
"mean_token_accuracy": 0.7886765003204346,
"num_tokens": 2367340.0,
"step": 145
},
{
"entropy": 0.5517164468765259,
"epoch": 0.5488721804511278,
"grad_norm": 0.03105936385691166,
"learning_rate": 0.0002,
"loss": 0.5542973875999451,
"mean_token_accuracy": 0.7738576829433441,
"num_tokens": 2383605.0,
"step": 146
},
{
"entropy": 0.5376151502132416,
"epoch": 0.5526315789473685,
"grad_norm": 0.03337828442454338,
"learning_rate": 0.0002,
"loss": 0.5453506708145142,
"mean_token_accuracy": 0.7788939327001572,
"num_tokens": 2399719.0,
"step": 147
},
{
"entropy": 0.5623980462551117,
"epoch": 0.556390977443609,
"grad_norm": 0.028280731290578842,
"learning_rate": 0.0002,
"loss": 0.560990035533905,
"mean_token_accuracy": 0.7726676762104034,
"num_tokens": 2416182.0,
"step": 148
},
{
"entropy": 0.5573243647813797,
"epoch": 0.5601503759398496,
"grad_norm": 0.032505616545677185,
"learning_rate": 0.0002,
"loss": 0.5568500757217407,
"mean_token_accuracy": 0.7742682248353958,
"num_tokens": 2432558.0,
"step": 149
},
{
"entropy": 0.5573329925537109,
"epoch": 0.5639097744360902,
"grad_norm": 0.03238248452544212,
"learning_rate": 0.0002,
"loss": 0.5538819432258606,
"mean_token_accuracy": 0.777379959821701,
"num_tokens": 2448908.0,
"step": 150
},
{
"entropy": 0.5407138615846634,
"epoch": 0.5676691729323309,
"grad_norm": 0.02900576777756214,
"learning_rate": 0.0002,
"loss": 0.5466345548629761,
"mean_token_accuracy": 0.7775551229715347,
"num_tokens": 2465270.0,
"step": 151
},
{
"entropy": 0.554168626666069,
"epoch": 0.5714285714285714,
"grad_norm": 0.0312657356262207,
"learning_rate": 0.0002,
"loss": 0.5629188418388367,
"mean_token_accuracy": 0.7751999050378799,
"num_tokens": 2481577.0,
"step": 152
},
{
"entropy": 0.5447106957435608,
"epoch": 0.575187969924812,
"grad_norm": 0.02679499238729477,
"learning_rate": 0.0002,
"loss": 0.5434100031852722,
"mean_token_accuracy": 0.7805473357439041,
"num_tokens": 2498025.0,
"step": 153
},
{
"entropy": 0.5469905585050583,
"epoch": 0.5789473684210527,
"grad_norm": 0.03267526254057884,
"learning_rate": 0.0002,
"loss": 0.5438751578330994,
"mean_token_accuracy": 0.7798020392656326,
"num_tokens": 2514245.0,
"step": 154
},
{
"entropy": 0.5860631912946701,
"epoch": 0.5827067669172933,
"grad_norm": 0.03039904497563839,
"learning_rate": 0.0002,
"loss": 0.5810500383377075,
"mean_token_accuracy": 0.7673344761133194,
"num_tokens": 2530676.0,
"step": 155
},
{
"entropy": 0.5545631796121597,
"epoch": 0.5864661654135338,
"grad_norm": 0.028710732236504555,
"learning_rate": 0.0002,
"loss": 0.5573135614395142,
"mean_token_accuracy": 0.7758313864469528,
"num_tokens": 2547029.0,
"step": 156
},
{
"entropy": 0.5309299975633621,
"epoch": 0.5902255639097744,
"grad_norm": 0.037456102669239044,
"learning_rate": 0.0002,
"loss": 0.5443962812423706,
"mean_token_accuracy": 0.7781406342983246,
"num_tokens": 2563337.0,
"step": 157
},
{
"entropy": 0.5590629875659943,
"epoch": 0.5939849624060151,
"grad_norm": 0.03138922527432442,
"learning_rate": 0.0002,
"loss": 0.570573627948761,
"mean_token_accuracy": 0.7692520618438721,
"num_tokens": 2579699.0,
"step": 158
},
{
"entropy": 0.5507991015911102,
"epoch": 0.5977443609022557,
"grad_norm": 0.031148385256528854,
"learning_rate": 0.0002,
"loss": 0.549103856086731,
"mean_token_accuracy": 0.7769458442926407,
"num_tokens": 2596012.0,
"step": 159
},
{
"entropy": 0.5691386461257935,
"epoch": 0.6015037593984962,
"grad_norm": 0.03321440890431404,
"learning_rate": 0.0002,
"loss": 0.5682097673416138,
"mean_token_accuracy": 0.7695286124944687,
"num_tokens": 2612192.0,
"step": 160
},
{
"entropy": 0.5378303825855255,
"epoch": 0.6052631578947368,
"grad_norm": 0.029134051874279976,
"learning_rate": 0.0002,
"loss": 0.5314258337020874,
"mean_token_accuracy": 0.7879271060228348,
"num_tokens": 2628354.0,
"step": 161
},
{
"entropy": 0.5507005900144577,
"epoch": 0.6090225563909775,
"grad_norm": 0.028996866196393967,
"learning_rate": 0.0002,
"loss": 0.5531865358352661,
"mean_token_accuracy": 0.7761473655700684,
"num_tokens": 2644501.0,
"step": 162
},
{
"entropy": 0.5587231516838074,
"epoch": 0.6127819548872181,
"grad_norm": 0.03128351643681526,
"learning_rate": 0.0002,
"loss": 0.5601255297660828,
"mean_token_accuracy": 0.7728810757398605,
"num_tokens": 2660638.0,
"step": 163
},
{
"entropy": 0.5519489645957947,
"epoch": 0.6165413533834586,
"grad_norm": 0.03436357155442238,
"learning_rate": 0.0002,
"loss": 0.5580562949180603,
"mean_token_accuracy": 0.7739841938018799,
"num_tokens": 2676953.0,
"step": 164
},
{
"entropy": 0.5486033111810684,
"epoch": 0.6203007518796992,
"grad_norm": 0.030973074957728386,
"learning_rate": 0.0002,
"loss": 0.5505262613296509,
"mean_token_accuracy": 0.7756275236606598,
"num_tokens": 2693031.0,
"step": 165
},
{
"entropy": 0.5522639453411102,
"epoch": 0.6240601503759399,
"grad_norm": 0.03254729509353638,
"learning_rate": 0.0002,
"loss": 0.5508989095687866,
"mean_token_accuracy": 0.7748342007398605,
"num_tokens": 2709299.0,
"step": 166
},
{
"entropy": 0.5678143799304962,
"epoch": 0.6278195488721805,
"grad_norm": 0.027512261644005775,
"learning_rate": 0.0002,
"loss": 0.5593494772911072,
"mean_token_accuracy": 0.7736407816410065,
"num_tokens": 2725613.0,
"step": 167
},
{
"entropy": 0.5474298894405365,
"epoch": 0.631578947368421,
"grad_norm": 0.02777693048119545,
"learning_rate": 0.0002,
"loss": 0.5416566729545593,
"mean_token_accuracy": 0.7782540619373322,
"num_tokens": 2741762.0,
"step": 168
},
{
"entropy": 0.5676318109035492,
"epoch": 0.6353383458646616,
"grad_norm": 0.029206767678260803,
"learning_rate": 0.0002,
"loss": 0.5748559832572937,
"mean_token_accuracy": 0.7664623707532883,
"num_tokens": 2757964.0,
"step": 169
},
{
"entropy": 0.5471738129854202,
"epoch": 0.6390977443609023,
"grad_norm": 0.03809071704745293,
"learning_rate": 0.0002,
"loss": 0.5600809454917908,
"mean_token_accuracy": 0.7715400904417038,
"num_tokens": 2774260.0,
"step": 170
},
{
"entropy": 0.5543518960475922,
"epoch": 0.6428571428571429,
"grad_norm": 0.029330087825655937,
"learning_rate": 0.0002,
"loss": 0.5620079040527344,
"mean_token_accuracy": 0.7744479775428772,
"num_tokens": 2790354.0,
"step": 171
},
{
"entropy": 0.5556869655847549,
"epoch": 0.6466165413533834,
"grad_norm": 0.03219934552907944,
"learning_rate": 0.0002,
"loss": 0.5567511916160583,
"mean_token_accuracy": 0.7723055630922318,
"num_tokens": 2806411.0,
"step": 172
},
{
"entropy": 0.5598954260349274,
"epoch": 0.650375939849624,
"grad_norm": 0.03049585595726967,
"learning_rate": 0.0002,
"loss": 0.5581772923469543,
"mean_token_accuracy": 0.7723381072282791,
"num_tokens": 2822457.0,
"step": 173
},
{
"entropy": 0.5619530379772186,
"epoch": 0.6541353383458647,
"grad_norm": 0.029140042141079903,
"learning_rate": 0.0002,
"loss": 0.5565066337585449,
"mean_token_accuracy": 0.7765934616327286,
"num_tokens": 2838821.0,
"step": 174
},
{
"entropy": 0.5609161257743835,
"epoch": 0.6578947368421053,
"grad_norm": 0.03307173773646355,
"learning_rate": 0.0002,
"loss": 0.5584904551506042,
"mean_token_accuracy": 0.7731504142284393,
"num_tokens": 2854976.0,
"step": 175
},
{
"entropy": 0.5472587794065475,
"epoch": 0.6616541353383458,
"grad_norm": 0.027935896068811417,
"learning_rate": 0.0002,
"loss": 0.5532994270324707,
"mean_token_accuracy": 0.7745202481746674,
"num_tokens": 2871053.0,
"step": 176
},
{
"entropy": 0.5559375882148743,
"epoch": 0.6654135338345865,
"grad_norm": 0.028821157291531563,
"learning_rate": 0.0002,
"loss": 0.5584789514541626,
"mean_token_accuracy": 0.7747485786676407,
"num_tokens": 2887600.0,
"step": 177
},
{
"entropy": 0.5338730216026306,
"epoch": 0.6691729323308271,
"grad_norm": 0.026577429845929146,
"learning_rate": 0.0002,
"loss": 0.5381085276603699,
"mean_token_accuracy": 0.7791920751333237,
"num_tokens": 2903970.0,
"step": 178
},
{
"entropy": 0.556627482175827,
"epoch": 0.6729323308270677,
"grad_norm": 0.028157442808151245,
"learning_rate": 0.0002,
"loss": 0.5612574219703674,
"mean_token_accuracy": 0.7728701531887054,
"num_tokens": 2920095.0,
"step": 179
},
{
"entropy": 0.5468809902667999,
"epoch": 0.6766917293233082,
"grad_norm": 0.026617249473929405,
"learning_rate": 0.0002,
"loss": 0.5438866019248962,
"mean_token_accuracy": 0.776974618434906,
"num_tokens": 2936400.0,
"step": 180
},
{
"entropy": 0.5707015246152878,
"epoch": 0.6804511278195489,
"grad_norm": 0.03165828064084053,
"learning_rate": 0.0002,
"loss": 0.5632250905036926,
"mean_token_accuracy": 0.7731919437646866,
"num_tokens": 2952758.0,
"step": 181
},
{
"entropy": 0.5669363737106323,
"epoch": 0.6842105263157895,
"grad_norm": 0.03147813677787781,
"learning_rate": 0.0002,
"loss": 0.5652462840080261,
"mean_token_accuracy": 0.7679423987865448,
"num_tokens": 2969082.0,
"step": 182
},
{
"entropy": 0.5380169749259949,
"epoch": 0.6879699248120301,
"grad_norm": 0.027151955291628838,
"learning_rate": 0.0002,
"loss": 0.5455905795097351,
"mean_token_accuracy": 0.7796274274587631,
"num_tokens": 2985183.0,
"step": 183
},
{
"entropy": 0.5574334859848022,
"epoch": 0.6917293233082706,
"grad_norm": 0.03327858820557594,
"learning_rate": 0.0002,
"loss": 0.5695413947105408,
"mean_token_accuracy": 0.7701131999492645,
"num_tokens": 3001508.0,
"step": 184
},
{
"entropy": 0.5463923811912537,
"epoch": 0.6954887218045113,
"grad_norm": 0.07987584918737411,
"learning_rate": 0.0002,
"loss": 0.5507839918136597,
"mean_token_accuracy": 0.7769906222820282,
"num_tokens": 3017824.0,
"step": 185
},
{
"entropy": 0.5602079033851624,
"epoch": 0.6992481203007519,
"grad_norm": 0.032177284359931946,
"learning_rate": 0.0002,
"loss": 0.5561395883560181,
"mean_token_accuracy": 0.7731778472661972,
"num_tokens": 3034234.0,
"step": 186
},
{
"entropy": 0.5552242249250412,
"epoch": 0.7030075187969925,
"grad_norm": 0.17276985943317413,
"learning_rate": 0.0002,
"loss": 0.5665730237960815,
"mean_token_accuracy": 0.7776633650064468,
"num_tokens": 3050476.0,
"step": 187
},
{
"entropy": 0.5759404450654984,
"epoch": 0.706766917293233,
"grad_norm": 0.03187716379761696,
"learning_rate": 0.0002,
"loss": 0.5729998350143433,
"mean_token_accuracy": 0.7687390595674515,
"num_tokens": 3066888.0,
"step": 188
},
{
"entropy": 0.5559865832328796,
"epoch": 0.7105263157894737,
"grad_norm": 0.03442467749118805,
"learning_rate": 0.0002,
"loss": 0.5568963885307312,
"mean_token_accuracy": 0.7721963822841644,
"num_tokens": 3083234.0,
"step": 189
},
{
"entropy": 0.5560625046491623,
"epoch": 0.7142857142857143,
"grad_norm": 0.033102214336395264,
"learning_rate": 0.0002,
"loss": 0.5556387305259705,
"mean_token_accuracy": 0.7737521678209305,
"num_tokens": 3099426.0,
"step": 190
},
{
"entropy": 0.5532419383525848,
"epoch": 0.7180451127819549,
"grad_norm": 0.03335823863744736,
"learning_rate": 0.0002,
"loss": 0.5556282997131348,
"mean_token_accuracy": 0.7746775895357132,
"num_tokens": 3115788.0,
"step": 191
},
{
"entropy": 0.5511862933635712,
"epoch": 0.7218045112781954,
"grad_norm": 0.04099865257740021,
"learning_rate": 0.0002,
"loss": 0.564994752407074,
"mean_token_accuracy": 0.7689872086048126,
"num_tokens": 3132132.0,
"step": 192
},
{
"entropy": 0.5518632382154465,
"epoch": 0.7255639097744361,
"grad_norm": 0.03417513892054558,
"learning_rate": 0.0002,
"loss": 0.5622019171714783,
"mean_token_accuracy": 0.7704385071992874,
"num_tokens": 3148387.0,
"step": 193
},
{
"entropy": 0.5632559806108475,
"epoch": 0.7293233082706767,
"grad_norm": 0.030820859596133232,
"learning_rate": 0.0002,
"loss": 0.5607547163963318,
"mean_token_accuracy": 0.7714632153511047,
"num_tokens": 3164505.0,
"step": 194
},
{
"entropy": 0.589142233133316,
"epoch": 0.7330827067669173,
"grad_norm": 0.029547762125730515,
"learning_rate": 0.0002,
"loss": 0.5773433446884155,
"mean_token_accuracy": 0.7666076868772507,
"num_tokens": 3180879.0,
"step": 195
},
{
"entropy": 0.5543933212757111,
"epoch": 0.7368421052631579,
"grad_norm": 0.03714846074581146,
"learning_rate": 0.0002,
"loss": 0.5530077219009399,
"mean_token_accuracy": 0.7751282453536987,
"num_tokens": 3196997.0,
"step": 196
},
{
"entropy": 0.5504618287086487,
"epoch": 0.7406015037593985,
"grad_norm": 0.03167671337723732,
"learning_rate": 0.0002,
"loss": 0.5446099042892456,
"mean_token_accuracy": 0.7800730615854263,
"num_tokens": 3213232.0,
"step": 197
},
{
"entropy": 0.5440194606781006,
"epoch": 0.7443609022556391,
"grad_norm": 0.028702866286039352,
"learning_rate": 0.0002,
"loss": 0.5420858860015869,
"mean_token_accuracy": 0.780303880572319,
"num_tokens": 3229429.0,
"step": 198
},
{
"entropy": 0.5432772487401962,
"epoch": 0.7481203007518797,
"grad_norm": 0.04096582531929016,
"learning_rate": 0.0002,
"loss": 0.5523824095726013,
"mean_token_accuracy": 0.7756204158067703,
"num_tokens": 3245679.0,
"step": 199
},
{
"entropy": 0.5610463172197342,
"epoch": 0.7518796992481203,
"grad_norm": 0.036679867655038834,
"learning_rate": 0.0002,
"loss": 0.5655776262283325,
"mean_token_accuracy": 0.7715456783771515,
"num_tokens": 3262189.0,
"step": 200
},
{
"entropy": 0.5549308806657791,
"epoch": 0.7556390977443609,
"grad_norm": 0.02466488443315029,
"learning_rate": 0.0002,
"loss": 0.5475676655769348,
"mean_token_accuracy": 0.7779862135648727,
"num_tokens": 3278554.0,
"step": 201
},
{
"entropy": 0.5799617767333984,
"epoch": 0.7593984962406015,
"grad_norm": 0.028492242097854614,
"learning_rate": 0.0002,
"loss": 0.5770009160041809,
"mean_token_accuracy": 0.768639862537384,
"num_tokens": 3295063.0,
"step": 202
},
{
"entropy": 0.5529991090297699,
"epoch": 0.7631578947368421,
"grad_norm": 0.034728050231933594,
"learning_rate": 0.0002,
"loss": 0.5533767938613892,
"mean_token_accuracy": 0.7767061442136765,
"num_tokens": 3311348.0,
"step": 203
},
{
"entropy": 0.5689148902893066,
"epoch": 0.7669172932330827,
"grad_norm": 0.026985110715031624,
"learning_rate": 0.0002,
"loss": 0.5640019774436951,
"mean_token_accuracy": 0.7733623534440994,
"num_tokens": 3327811.0,
"step": 204
},
{
"entropy": 0.5497773736715317,
"epoch": 0.7706766917293233,
"grad_norm": 0.026469919830560684,
"learning_rate": 0.0002,
"loss": 0.5544072389602661,
"mean_token_accuracy": 0.7730964869260788,
"num_tokens": 3344190.0,
"step": 205
},
{
"entropy": 0.5487343817949295,
"epoch": 0.7744360902255639,
"grad_norm": 0.03394508361816406,
"learning_rate": 0.0002,
"loss": 0.5584373474121094,
"mean_token_accuracy": 0.7742648869752884,
"num_tokens": 3360318.0,
"step": 206
},
{
"entropy": 0.5593785345554352,
"epoch": 0.7781954887218046,
"grad_norm": 0.032090939581394196,
"learning_rate": 0.0002,
"loss": 0.5755316019058228,
"mean_token_accuracy": 0.7676598578691483,
"num_tokens": 3376652.0,
"step": 207
},
{
"entropy": 0.5540517121553421,
"epoch": 0.7819548872180451,
"grad_norm": 0.029152996838092804,
"learning_rate": 0.0002,
"loss": 0.553016185760498,
"mean_token_accuracy": 0.7774887681007385,
"num_tokens": 3392915.0,
"step": 208
},
{
"entropy": 0.5617629438638687,
"epoch": 0.7857142857142857,
"grad_norm": 0.029667040333151817,
"learning_rate": 0.0002,
"loss": 0.5602532625198364,
"mean_token_accuracy": 0.7753290235996246,
"num_tokens": 3409209.0,
"step": 209
},
{
"entropy": 0.5676616579294205,
"epoch": 0.7894736842105263,
"grad_norm": 0.03213479742407799,
"learning_rate": 0.0002,
"loss": 0.5651354789733887,
"mean_token_accuracy": 0.7729621976613998,
"num_tokens": 3425474.0,
"step": 210
},
{
"entropy": 0.5594458729028702,
"epoch": 0.793233082706767,
"grad_norm": 0.029152261093258858,
"learning_rate": 0.0002,
"loss": 0.5545633435249329,
"mean_token_accuracy": 0.7748460322618484,
"num_tokens": 3441810.0,
"step": 211
},
{
"entropy": 0.5657470673322678,
"epoch": 0.7969924812030075,
"grad_norm": 0.030394772067666054,
"learning_rate": 0.0002,
"loss": 0.5634792447090149,
"mean_token_accuracy": 0.7723300457000732,
"num_tokens": 3458017.0,
"step": 212
},
{
"entropy": 0.5386789590120316,
"epoch": 0.8007518796992481,
"grad_norm": 0.030803421512246132,
"learning_rate": 0.0002,
"loss": 0.543491780757904,
"mean_token_accuracy": 0.7788570076227188,
"num_tokens": 3474394.0,
"step": 213
},
{
"entropy": 0.5462117493152618,
"epoch": 0.8045112781954887,
"grad_norm": 0.032262928783893585,
"learning_rate": 0.0002,
"loss": 0.5550574064254761,
"mean_token_accuracy": 0.7757156640291214,
"num_tokens": 3490659.0,
"step": 214
},
{
"entropy": 0.5618492513895035,
"epoch": 0.8082706766917294,
"grad_norm": 0.030515553429722786,
"learning_rate": 0.0002,
"loss": 0.5604183673858643,
"mean_token_accuracy": 0.7713865786790848,
"num_tokens": 3507047.0,
"step": 215
},
{
"entropy": 0.5674788951873779,
"epoch": 0.8120300751879699,
"grad_norm": 0.03319476544857025,
"learning_rate": 0.0002,
"loss": 0.5704171657562256,
"mean_token_accuracy": 0.7660792618989944,
"num_tokens": 3523740.0,
"step": 216
},
{
"entropy": 0.5655016303062439,
"epoch": 0.8157894736842105,
"grad_norm": 0.025443432852625847,
"learning_rate": 0.0002,
"loss": 0.5628257989883423,
"mean_token_accuracy": 0.7704775929450989,
"num_tokens": 3540342.0,
"step": 217
},
{
"entropy": 0.5403912216424942,
"epoch": 0.8195488721804511,
"grad_norm": 0.03260233253240585,
"learning_rate": 0.0002,
"loss": 0.542536735534668,
"mean_token_accuracy": 0.7788421809673309,
"num_tokens": 3556623.0,
"step": 218
},
{
"entropy": 0.5680458843708038,
"epoch": 0.8233082706766918,
"grad_norm": 0.034483131021261215,
"learning_rate": 0.0002,
"loss": 0.5691131353378296,
"mean_token_accuracy": 0.76755091547966,
"num_tokens": 3573182.0,
"step": 219
},
{
"entropy": 0.5689092427492142,
"epoch": 0.8270676691729323,
"grad_norm": 0.027871334925293922,
"learning_rate": 0.0002,
"loss": 0.5706035494804382,
"mean_token_accuracy": 0.768176794052124,
"num_tokens": 3589235.0,
"step": 220
},
{
"entropy": 0.563735768198967,
"epoch": 0.8308270676691729,
"grad_norm": 0.02944294363260269,
"learning_rate": 0.0002,
"loss": 0.5672820806503296,
"mean_token_accuracy": 0.7710028737783432,
"num_tokens": 3605593.0,
"step": 221
},
{
"entropy": 0.5397096872329712,
"epoch": 0.8345864661654135,
"grad_norm": 0.030527444556355476,
"learning_rate": 0.0002,
"loss": 0.5446432828903198,
"mean_token_accuracy": 0.7779533118009567,
"num_tokens": 3621959.0,
"step": 222
},
{
"entropy": 0.5514500439167023,
"epoch": 0.8383458646616542,
"grad_norm": 0.029658010229468346,
"learning_rate": 0.0002,
"loss": 0.5571471452713013,
"mean_token_accuracy": 0.7720492035150528,
"num_tokens": 3638089.0,
"step": 223
},
{
"entropy": 0.5721202939748764,
"epoch": 0.8421052631578947,
"grad_norm": 0.026809731498360634,
"learning_rate": 0.0002,
"loss": 0.5748306512832642,
"mean_token_accuracy": 0.7655669301748276,
"num_tokens": 3654508.0,
"step": 224
},
{
"entropy": 0.5657171607017517,
"epoch": 0.8458646616541353,
"grad_norm": 0.02784072421491146,
"learning_rate": 0.0002,
"loss": 0.5645638704299927,
"mean_token_accuracy": 0.7713258415460587,
"num_tokens": 3670883.0,
"step": 225
},
{
"entropy": 0.5707942843437195,
"epoch": 0.849624060150376,
"grad_norm": 0.027495261281728745,
"learning_rate": 0.0002,
"loss": 0.5690877437591553,
"mean_token_accuracy": 0.7672522664070129,
"num_tokens": 3687138.0,
"step": 226
},
{
"entropy": 0.5599692463874817,
"epoch": 0.8533834586466166,
"grad_norm": 0.02714758738875389,
"learning_rate": 0.0002,
"loss": 0.558695912361145,
"mean_token_accuracy": 0.7728016823530197,
"num_tokens": 3703748.0,
"step": 227
},
{
"entropy": 0.5557542443275452,
"epoch": 0.8571428571428571,
"grad_norm": 0.027014488354325294,
"learning_rate": 0.0002,
"loss": 0.5528618097305298,
"mean_token_accuracy": 0.7744259238243103,
"num_tokens": 3720292.0,
"step": 228
},
{
"entropy": 0.5545012503862381,
"epoch": 0.8609022556390977,
"grad_norm": 0.030803967267274857,
"learning_rate": 0.0002,
"loss": 0.5548436045646667,
"mean_token_accuracy": 0.772901862859726,
"num_tokens": 3736719.0,
"step": 229
},
{
"entropy": 0.5630923807621002,
"epoch": 0.8646616541353384,
"grad_norm": 0.025556016713380814,
"learning_rate": 0.0002,
"loss": 0.5638667941093445,
"mean_token_accuracy": 0.7724170237779617,
"num_tokens": 3753111.0,
"step": 230
},
{
"entropy": 0.5482154339551926,
"epoch": 0.868421052631579,
"grad_norm": 0.026636675000190735,
"learning_rate": 0.0002,
"loss": 0.5516517758369446,
"mean_token_accuracy": 0.7738501876592636,
"num_tokens": 3769379.0,
"step": 231
},
{
"entropy": 0.5542188733816147,
"epoch": 0.8721804511278195,
"grad_norm": 0.030669352039694786,
"learning_rate": 0.0002,
"loss": 0.562447190284729,
"mean_token_accuracy": 0.7716392129659653,
"num_tokens": 3785882.0,
"step": 232
},
{
"entropy": 0.5528077483177185,
"epoch": 0.8759398496240601,
"grad_norm": 0.02840394526720047,
"learning_rate": 0.0002,
"loss": 0.5538339614868164,
"mean_token_accuracy": 0.7760019749403,
"num_tokens": 3802159.0,
"step": 233
},
{
"entropy": 0.5367541313171387,
"epoch": 0.8796992481203008,
"grad_norm": 0.027923524379730225,
"learning_rate": 0.0002,
"loss": 0.5381957292556763,
"mean_token_accuracy": 0.7805743962526321,
"num_tokens": 3818361.0,
"step": 234
},
{
"entropy": 0.5520175248384476,
"epoch": 0.8834586466165414,
"grad_norm": 0.03241734206676483,
"learning_rate": 0.0002,
"loss": 0.5536331534385681,
"mean_token_accuracy": 0.773536428809166,
"num_tokens": 3834731.0,
"step": 235
},
{
"entropy": 0.5460867285728455,
"epoch": 0.8872180451127819,
"grad_norm": 0.027079345658421516,
"learning_rate": 0.0002,
"loss": 0.5475375056266785,
"mean_token_accuracy": 0.7766189575195312,
"num_tokens": 3850982.0,
"step": 236
},
{
"entropy": 0.5568866729736328,
"epoch": 0.8909774436090225,
"grad_norm": 0.02961307018995285,
"learning_rate": 0.0002,
"loss": 0.5572586059570312,
"mean_token_accuracy": 0.7737904638051987,
"num_tokens": 3867054.0,
"step": 237
},
{
"entropy": 0.5462281703948975,
"epoch": 0.8947368421052632,
"grad_norm": 0.02547132968902588,
"learning_rate": 0.0002,
"loss": 0.5462326407432556,
"mean_token_accuracy": 0.779721811413765,
"num_tokens": 3883377.0,
"step": 238
},
{
"entropy": 0.5601012706756592,
"epoch": 0.8984962406015038,
"grad_norm": 0.027931643649935722,
"learning_rate": 0.0002,
"loss": 0.5673293471336365,
"mean_token_accuracy": 0.7699201852083206,
"num_tokens": 3899760.0,
"step": 239
},
{
"entropy": 0.558964416384697,
"epoch": 0.9022556390977443,
"grad_norm": 0.027888454496860504,
"learning_rate": 0.0002,
"loss": 0.5613861083984375,
"mean_token_accuracy": 0.7711526602506638,
"num_tokens": 3916259.0,
"step": 240
},
{
"entropy": 0.5591289699077606,
"epoch": 0.9060150375939849,
"grad_norm": 0.027367601171135902,
"learning_rate": 0.0002,
"loss": 0.5553447008132935,
"mean_token_accuracy": 0.7748121023178101,
"num_tokens": 3932764.0,
"step": 241
},
{
"entropy": 0.5419012606143951,
"epoch": 0.9097744360902256,
"grad_norm": 0.02720046602189541,
"learning_rate": 0.0002,
"loss": 0.5389461517333984,
"mean_token_accuracy": 0.7815262824296951,
"num_tokens": 3948767.0,
"step": 242
},
{
"entropy": 0.5506538301706314,
"epoch": 0.9135338345864662,
"grad_norm": 0.04870102182030678,
"learning_rate": 0.0002,
"loss": 0.5555541515350342,
"mean_token_accuracy": 0.7749286592006683,
"num_tokens": 3964899.0,
"step": 243
},
{
"entropy": 0.5377955883741379,
"epoch": 0.9172932330827067,
"grad_norm": 0.030033506453037262,
"learning_rate": 0.0002,
"loss": 0.5442740321159363,
"mean_token_accuracy": 0.7790930420160294,
"num_tokens": 3981257.0,
"step": 244
},
{
"entropy": 0.5506607741117477,
"epoch": 0.9210526315789473,
"grad_norm": 0.03199909254908562,
"learning_rate": 0.0002,
"loss": 0.5553537607192993,
"mean_token_accuracy": 0.7754099667072296,
"num_tokens": 3997442.0,
"step": 245
},
{
"entropy": 0.5611073523759842,
"epoch": 0.924812030075188,
"grad_norm": 0.027019886299967766,
"learning_rate": 0.0002,
"loss": 0.5553584098815918,
"mean_token_accuracy": 0.7750442922115326,
"num_tokens": 4013644.0,
"step": 246
},
{
"entropy": 0.5641084164381027,
"epoch": 0.9285714285714286,
"grad_norm": 0.028763286769390106,
"learning_rate": 0.0002,
"loss": 0.5639767050743103,
"mean_token_accuracy": 0.7705299705266953,
"num_tokens": 4029960.0,
"step": 247
},
{
"entropy": 0.5596693158149719,
"epoch": 0.9323308270676691,
"grad_norm": 0.029457937926054,
"learning_rate": 0.0002,
"loss": 0.5553030371665955,
"mean_token_accuracy": 0.7704959660768509,
"num_tokens": 4046137.0,
"step": 248
},
{
"entropy": 0.5426951497793198,
"epoch": 0.9360902255639098,
"grad_norm": 0.030174724757671356,
"learning_rate": 0.0002,
"loss": 0.5424360036849976,
"mean_token_accuracy": 0.7784756273031235,
"num_tokens": 4062488.0,
"step": 249
},
{
"entropy": 0.5482533425092697,
"epoch": 0.9398496240601504,
"grad_norm": 0.029116198420524597,
"learning_rate": 0.0002,
"loss": 0.548699676990509,
"mean_token_accuracy": 0.7772116810083389,
"num_tokens": 4079035.0,
"step": 250
},
{
"entropy": 0.5659994781017303,
"epoch": 0.943609022556391,
"grad_norm": 0.028919357806444168,
"learning_rate": 0.0002,
"loss": 0.5734626054763794,
"mean_token_accuracy": 0.7644091695547104,
"num_tokens": 4095496.0,
"step": 251
},
{
"entropy": 0.5390999913215637,
"epoch": 0.9473684210526315,
"grad_norm": 0.029156571254134178,
"learning_rate": 0.0002,
"loss": 0.542834460735321,
"mean_token_accuracy": 0.778347447514534,
"num_tokens": 4111786.0,
"step": 252
},
{
"entropy": 0.5335533022880554,
"epoch": 0.9511278195488722,
"grad_norm": 0.03090072236955166,
"learning_rate": 0.0002,
"loss": 0.5460265874862671,
"mean_token_accuracy": 0.777598574757576,
"num_tokens": 4127806.0,
"step": 253
},
{
"entropy": 0.5576867163181305,
"epoch": 0.9548872180451128,
"grad_norm": 0.0250933188945055,
"learning_rate": 0.0002,
"loss": 0.5579800605773926,
"mean_token_accuracy": 0.772262915968895,
"num_tokens": 4144255.0,
"step": 254
},
{
"entropy": 0.5680612325668335,
"epoch": 0.9586466165413534,
"grad_norm": 0.02682660147547722,
"learning_rate": 0.0002,
"loss": 0.5625680685043335,
"mean_token_accuracy": 0.7703745514154434,
"num_tokens": 4160554.0,
"step": 255
},
{
"entropy": 0.5646774917840958,
"epoch": 0.9624060150375939,
"grad_norm": 0.02460050955414772,
"learning_rate": 0.0002,
"loss": 0.5615121126174927,
"mean_token_accuracy": 0.7717017978429794,
"num_tokens": 4177058.0,
"step": 256
},
{
"entropy": 0.565275639295578,
"epoch": 0.9661654135338346,
"grad_norm": 0.028230059891939163,
"learning_rate": 0.0002,
"loss": 0.5602483153343201,
"mean_token_accuracy": 0.7725579738616943,
"num_tokens": 4193529.0,
"step": 257
},
{
"entropy": 0.5464546531438828,
"epoch": 0.9699248120300752,
"grad_norm": 0.028305059298872948,
"learning_rate": 0.0002,
"loss": 0.5506906509399414,
"mean_token_accuracy": 0.7744488716125488,
"num_tokens": 4209843.0,
"step": 258
},
{
"entropy": 0.5543451011180878,
"epoch": 0.9736842105263158,
"grad_norm": 0.026113279163837433,
"learning_rate": 0.0002,
"loss": 0.5566228628158569,
"mean_token_accuracy": 0.7761884778738022,
"num_tokens": 4226371.0,
"step": 259
},
{
"entropy": 0.5395558923482895,
"epoch": 0.9774436090225563,
"grad_norm": 0.027898062020540237,
"learning_rate": 0.0002,
"loss": 0.551036536693573,
"mean_token_accuracy": 0.7777495980262756,
"num_tokens": 4242588.0,
"step": 260
},
{
"entropy": 0.5481285452842712,
"epoch": 0.981203007518797,
"grad_norm": 0.027225090190768242,
"learning_rate": 0.0002,
"loss": 0.55158931016922,
"mean_token_accuracy": 0.7746086716651917,
"num_tokens": 4258895.0,
"step": 261
},
{
"entropy": 0.5476398766040802,
"epoch": 0.9849624060150376,
"grad_norm": 0.025991205126047134,
"learning_rate": 0.0002,
"loss": 0.550503671169281,
"mean_token_accuracy": 0.778662696480751,
"num_tokens": 4275233.0,
"step": 262
},
{
"entropy": 0.5611831694841385,
"epoch": 0.9887218045112782,
"grad_norm": 0.026602452620863914,
"learning_rate": 0.0002,
"loss": 0.5595046877861023,
"mean_token_accuracy": 0.7710649222135544,
"num_tokens": 4291628.0,
"step": 263
},
{
"entropy": 0.5607927143573761,
"epoch": 0.9924812030075187,
"grad_norm": 0.029126716777682304,
"learning_rate": 0.0002,
"loss": 0.55509352684021,
"mean_token_accuracy": 0.773261696100235,
"num_tokens": 4308266.0,
"step": 264
},
{
"entropy": 0.5344236195087433,
"epoch": 0.9962406015037594,
"grad_norm": 0.024904625490307808,
"learning_rate": 0.0002,
"loss": 0.5374810099601746,
"mean_token_accuracy": 0.7795998752117157,
"num_tokens": 4324647.0,
"step": 265
},
{
"entropy": 0.5802602022886276,
"epoch": 1.0,
"grad_norm": 0.02991756983101368,
"learning_rate": 0.0002,
"loss": 0.5802874565124512,
"mean_token_accuracy": 0.7651515454053879,
"num_tokens": 4341020.0,
"step": 266
},
{
"entropy": 0.5359837561845779,
"epoch": 1.0037593984962405,
"grad_norm": 0.028310680761933327,
"learning_rate": 0.0002,
"loss": 0.5382672548294067,
"mean_token_accuracy": 0.7797826081514359,
"num_tokens": 4356946.0,
"step": 267
},
{
"entropy": 0.547169104218483,
"epoch": 1.0075187969924813,
"grad_norm": 0.026942851021885872,
"learning_rate": 0.0002,
"loss": 0.5483385324478149,
"mean_token_accuracy": 0.7762030512094498,
"num_tokens": 4373376.0,
"step": 268
},
{
"entropy": 0.5396238714456558,
"epoch": 1.0112781954887218,
"grad_norm": 0.026464859023690224,
"learning_rate": 0.0002,
"loss": 0.5366930961608887,
"mean_token_accuracy": 0.7836534827947617,
"num_tokens": 4389434.0,
"step": 269
},
{
"entropy": 0.5377503633499146,
"epoch": 1.0150375939849625,
"grad_norm": 0.028936585411429405,
"learning_rate": 0.0002,
"loss": 0.5381658673286438,
"mean_token_accuracy": 0.7795982360839844,
"num_tokens": 4405773.0,
"step": 270
},
{
"entropy": 0.5378166139125824,
"epoch": 1.018796992481203,
"grad_norm": 0.026616571471095085,
"learning_rate": 0.0002,
"loss": 0.5366747975349426,
"mean_token_accuracy": 0.7815251797437668,
"num_tokens": 4422223.0,
"step": 271
},
{
"entropy": 0.5556348860263824,
"epoch": 1.0225563909774436,
"grad_norm": 0.03760155290365219,
"learning_rate": 0.0002,
"loss": 0.5643568634986877,
"mean_token_accuracy": 0.7716861069202423,
"num_tokens": 4438566.0,
"step": 272
},
{
"entropy": 0.5393058955669403,
"epoch": 1.0263157894736843,
"grad_norm": 0.028112079948186874,
"learning_rate": 0.0002,
"loss": 0.536059558391571,
"mean_token_accuracy": 0.7806826084852219,
"num_tokens": 4454882.0,
"step": 273
},
{
"entropy": 0.5509982258081436,
"epoch": 1.0300751879699248,
"grad_norm": 0.031216077506542206,
"learning_rate": 0.0002,
"loss": 0.545498251914978,
"mean_token_accuracy": 0.7785268127918243,
"num_tokens": 4471138.0,
"step": 274
},
{
"entropy": 0.562383309006691,
"epoch": 1.0338345864661653,
"grad_norm": 0.029023578390479088,
"learning_rate": 0.0002,
"loss": 0.5549452900886536,
"mean_token_accuracy": 0.7746210545301437,
"num_tokens": 4487599.0,
"step": 275
},
{
"entropy": 0.533460721373558,
"epoch": 1.037593984962406,
"grad_norm": 0.02839999832212925,
"learning_rate": 0.0002,
"loss": 0.5428166389465332,
"mean_token_accuracy": 0.7788663357496262,
"num_tokens": 4503718.0,
"step": 276
},
{
"entropy": 0.534645140171051,
"epoch": 1.0413533834586466,
"grad_norm": 0.03183748945593834,
"learning_rate": 0.0002,
"loss": 0.5435906052589417,
"mean_token_accuracy": 0.780232772231102,
"num_tokens": 4519836.0,
"step": 277
},
{
"entropy": 0.5403695106506348,
"epoch": 1.045112781954887,
"grad_norm": 0.03128998726606369,
"learning_rate": 0.0002,
"loss": 0.546108603477478,
"mean_token_accuracy": 0.7786454111337662,
"num_tokens": 4535945.0,
"step": 278
},
{
"entropy": 0.5610467493534088,
"epoch": 1.0488721804511278,
"grad_norm": 0.027818012982606888,
"learning_rate": 0.0002,
"loss": 0.560647189617157,
"mean_token_accuracy": 0.7709101587533951,
"num_tokens": 4552374.0,
"step": 279
},
{
"entropy": 0.5373391807079315,
"epoch": 1.0526315789473684,
"grad_norm": 0.03428777679800987,
"learning_rate": 0.0002,
"loss": 0.5469943284988403,
"mean_token_accuracy": 0.7768525630235672,
"num_tokens": 4568711.0,
"step": 280
},
{
"entropy": 0.5424034297466278,
"epoch": 1.056390977443609,
"grad_norm": 0.03859133645892143,
"learning_rate": 0.0002,
"loss": 0.5439317226409912,
"mean_token_accuracy": 0.7811300605535507,
"num_tokens": 4585017.0,
"step": 281
},
{
"entropy": 0.5506146401166916,
"epoch": 1.0601503759398496,
"grad_norm": 0.03055771067738533,
"learning_rate": 0.0002,
"loss": 0.546417236328125,
"mean_token_accuracy": 0.7766596227884293,
"num_tokens": 4601432.0,
"step": 282
},
{
"entropy": 0.5494361072778702,
"epoch": 1.0639097744360901,
"grad_norm": 0.0343659445643425,
"learning_rate": 0.0002,
"loss": 0.5465281009674072,
"mean_token_accuracy": 0.7783948630094528,
"num_tokens": 4617733.0,
"step": 283
},
{
"entropy": 0.5440582782030106,
"epoch": 1.0676691729323309,
"grad_norm": 0.026508856564760208,
"learning_rate": 0.0002,
"loss": 0.5454896092414856,
"mean_token_accuracy": 0.7768892496824265,
"num_tokens": 4634160.0,
"step": 284
},
{
"entropy": 0.5566096007823944,
"epoch": 1.0714285714285714,
"grad_norm": 0.03006400726735592,
"learning_rate": 0.0002,
"loss": 0.5534993410110474,
"mean_token_accuracy": 0.7748663425445557,
"num_tokens": 4650625.0,
"step": 285
},
{
"entropy": 0.5545021891593933,
"epoch": 1.0751879699248121,
"grad_norm": 0.03096926584839821,
"learning_rate": 0.0002,
"loss": 0.5561465620994568,
"mean_token_accuracy": 0.7750347554683685,
"num_tokens": 4667029.0,
"step": 286
},
{
"entropy": 0.5399864912033081,
"epoch": 1.0789473684210527,
"grad_norm": 0.030643943697214127,
"learning_rate": 0.0002,
"loss": 0.5460204482078552,
"mean_token_accuracy": 0.7770880162715912,
"num_tokens": 4683375.0,
"step": 287
},
{
"entropy": 0.5572090744972229,
"epoch": 1.0827067669172932,
"grad_norm": 0.026186607778072357,
"learning_rate": 0.0002,
"loss": 0.5585043430328369,
"mean_token_accuracy": 0.7719515711069107,
"num_tokens": 4699882.0,
"step": 288
},
{
"entropy": 0.5484725385904312,
"epoch": 1.086466165413534,
"grad_norm": 0.027757612988352776,
"learning_rate": 0.0002,
"loss": 0.5432863235473633,
"mean_token_accuracy": 0.7777998596429825,
"num_tokens": 4716268.0,
"step": 289
},
{
"entropy": 0.5435892194509506,
"epoch": 1.0902255639097744,
"grad_norm": 0.02975296974182129,
"learning_rate": 0.0002,
"loss": 0.5351642966270447,
"mean_token_accuracy": 0.7828023135662079,
"num_tokens": 4732434.0,
"step": 290
},
{
"entropy": 0.5531795173883438,
"epoch": 1.093984962406015,
"grad_norm": 0.028304405510425568,
"learning_rate": 0.0002,
"loss": 0.5516840815544128,
"mean_token_accuracy": 0.7772639095783234,
"num_tokens": 4748580.0,
"step": 291
},
{
"entropy": 0.5184081122279167,
"epoch": 1.0977443609022557,
"grad_norm": 0.03446349874138832,
"learning_rate": 0.0002,
"loss": 0.5299493670463562,
"mean_token_accuracy": 0.7840149402618408,
"num_tokens": 4764598.0,
"step": 292
},
{
"entropy": 0.5289477556943893,
"epoch": 1.1015037593984962,
"grad_norm": 0.036261677742004395,
"learning_rate": 0.0002,
"loss": 0.5453619956970215,
"mean_token_accuracy": 0.7767883092164993,
"num_tokens": 4780809.0,
"step": 293
},
{
"entropy": 0.5418924987316132,
"epoch": 1.1052631578947367,
"grad_norm": 0.029477933421730995,
"learning_rate": 0.0002,
"loss": 0.5471935272216797,
"mean_token_accuracy": 0.7789769917726517,
"num_tokens": 4797348.0,
"step": 294
},
{
"entropy": 0.5463252663612366,
"epoch": 1.1090225563909775,
"grad_norm": 0.031204085797071457,
"learning_rate": 0.0002,
"loss": 0.5424449443817139,
"mean_token_accuracy": 0.7788571715354919,
"num_tokens": 4813415.0,
"step": 295
},
{
"entropy": 0.5470333397388458,
"epoch": 1.112781954887218,
"grad_norm": 0.03411991521716118,
"learning_rate": 0.0002,
"loss": 0.5338444709777832,
"mean_token_accuracy": 0.7839784771203995,
"num_tokens": 4829572.0,
"step": 296
},
{
"entropy": 0.5626541525125504,
"epoch": 1.1165413533834587,
"grad_norm": 0.03397219255566597,
"learning_rate": 0.0002,
"loss": 0.5499536991119385,
"mean_token_accuracy": 0.7788331657648087,
"num_tokens": 4845785.0,
"step": 297
},
{
"entropy": 0.5299470722675323,
"epoch": 1.1203007518796992,
"grad_norm": 0.03497639298439026,
"learning_rate": 0.0002,
"loss": 0.5392253994941711,
"mean_token_accuracy": 0.7810451984405518,
"num_tokens": 4862012.0,
"step": 298
},
{
"entropy": 0.5335487574338913,
"epoch": 1.1240601503759398,
"grad_norm": 0.034831658005714417,
"learning_rate": 0.0002,
"loss": 0.5457339286804199,
"mean_token_accuracy": 0.779063493013382,
"num_tokens": 4878251.0,
"step": 299
},
{
"entropy": 0.528610497713089,
"epoch": 1.1278195488721805,
"grad_norm": 0.033591266721487045,
"learning_rate": 0.0002,
"loss": 0.542759358882904,
"mean_token_accuracy": 0.7827056795358658,
"num_tokens": 4894510.0,
"step": 300
},
{
"entropy": 0.5455980747938156,
"epoch": 1.131578947368421,
"grad_norm": 0.029848981648683548,
"learning_rate": 0.0002,
"loss": 0.5544407963752747,
"mean_token_accuracy": 0.7761986404657364,
"num_tokens": 4910941.0,
"step": 301
},
{
"entropy": 0.5403441041707993,
"epoch": 1.1353383458646618,
"grad_norm": 0.028331086039543152,
"learning_rate": 0.0002,
"loss": 0.5373193025588989,
"mean_token_accuracy": 0.7810037434101105,
"num_tokens": 4927224.0,
"step": 302
},
{
"entropy": 0.579601064324379,
"epoch": 1.1390977443609023,
"grad_norm": 0.034219082444906235,
"learning_rate": 0.0002,
"loss": 0.5681281685829163,
"mean_token_accuracy": 0.7684440910816193,
"num_tokens": 4943447.0,
"step": 303
},
{
"entropy": 0.5505090206861496,
"epoch": 1.1428571428571428,
"grad_norm": 0.0307406485080719,
"learning_rate": 0.0002,
"loss": 0.5461090803146362,
"mean_token_accuracy": 0.778554230928421,
"num_tokens": 4959489.0,
"step": 304
},
{
"entropy": 0.5576640069484711,
"epoch": 1.1466165413533835,
"grad_norm": 0.030323676764965057,
"learning_rate": 0.0002,
"loss": 0.5553523302078247,
"mean_token_accuracy": 0.773658037185669,
"num_tokens": 4975936.0,
"step": 305
},
{
"entropy": 0.5266588181257248,
"epoch": 1.150375939849624,
"grad_norm": 0.035491373389959335,
"learning_rate": 0.0002,
"loss": 0.5350923538208008,
"mean_token_accuracy": 0.7815313786268234,
"num_tokens": 4992537.0,
"step": 306
},
{
"entropy": 0.5482136011123657,
"epoch": 1.1541353383458646,
"grad_norm": 0.03442855179309845,
"learning_rate": 0.0002,
"loss": 0.5545141696929932,
"mean_token_accuracy": 0.7746158391237259,
"num_tokens": 5009023.0,
"step": 307
},
{
"entropy": 0.5559152960777283,
"epoch": 1.1578947368421053,
"grad_norm": 0.02727232687175274,
"learning_rate": 0.0002,
"loss": 0.5569304823875427,
"mean_token_accuracy": 0.7725173830986023,
"num_tokens": 5025411.0,
"step": 308
},
{
"entropy": 0.5630469471216202,
"epoch": 1.1616541353383458,
"grad_norm": 0.03064255230128765,
"learning_rate": 0.0002,
"loss": 0.5543197989463806,
"mean_token_accuracy": 0.774148479104042,
"num_tokens": 5041812.0,
"step": 309
},
{
"entropy": 0.5571756958961487,
"epoch": 1.1654135338345863,
"grad_norm": 0.03609425947070122,
"learning_rate": 0.0002,
"loss": 0.5525773763656616,
"mean_token_accuracy": 0.7752318233251572,
"num_tokens": 5058244.0,
"step": 310
},
{
"entropy": 0.5431416481733322,
"epoch": 1.169172932330827,
"grad_norm": 0.027324821799993515,
"learning_rate": 0.0002,
"loss": 0.5384103059768677,
"mean_token_accuracy": 0.7805906236171722,
"num_tokens": 5074488.0,
"step": 311
},
{
"entropy": 0.5343848988413811,
"epoch": 1.1729323308270676,
"grad_norm": 0.03805036470293999,
"learning_rate": 0.0002,
"loss": 0.5469476580619812,
"mean_token_accuracy": 0.779438316822052,
"num_tokens": 5090911.0,
"step": 312
},
{
"entropy": 0.536148265004158,
"epoch": 1.1766917293233083,
"grad_norm": 0.02961050719022751,
"learning_rate": 0.0002,
"loss": 0.5435563921928406,
"mean_token_accuracy": 0.7815048396587372,
"num_tokens": 5107152.0,
"step": 313
},
{
"entropy": 0.5418159067630768,
"epoch": 1.1804511278195489,
"grad_norm": 0.025910982862114906,
"learning_rate": 0.0002,
"loss": 0.540198028087616,
"mean_token_accuracy": 0.7800037860870361,
"num_tokens": 5123652.0,
"step": 314
},
{
"entropy": 0.5343509763479233,
"epoch": 1.1842105263157894,
"grad_norm": 0.03428869694471359,
"learning_rate": 0.0002,
"loss": 0.5369153618812561,
"mean_token_accuracy": 0.7804707884788513,
"num_tokens": 5139855.0,
"step": 315
},
{
"entropy": 0.5401560962200165,
"epoch": 1.1879699248120301,
"grad_norm": 0.027781767770648003,
"learning_rate": 0.0002,
"loss": 0.5393479466438293,
"mean_token_accuracy": 0.7805478721857071,
"num_tokens": 5156155.0,
"step": 316
},
{
"entropy": 0.5566094070672989,
"epoch": 1.1917293233082706,
"grad_norm": 0.026983041316270828,
"learning_rate": 0.0002,
"loss": 0.554964005947113,
"mean_token_accuracy": 0.7756882756948471,
"num_tokens": 5172489.0,
"step": 317
},
{
"entropy": 0.547125369310379,
"epoch": 1.1954887218045114,
"grad_norm": 0.03205394372344017,
"learning_rate": 0.0002,
"loss": 0.5493847727775574,
"mean_token_accuracy": 0.7793397605419159,
"num_tokens": 5189044.0,
"step": 318
},
{
"entropy": 0.534126952290535,
"epoch": 1.199248120300752,
"grad_norm": 0.027468601241707802,
"learning_rate": 0.0002,
"loss": 0.532336413860321,
"mean_token_accuracy": 0.7843205332756042,
"num_tokens": 5205622.0,
"step": 319
},
{
"entropy": 0.541590228676796,
"epoch": 1.2030075187969924,
"grad_norm": 0.02954232320189476,
"learning_rate": 0.0002,
"loss": 0.5532248020172119,
"mean_token_accuracy": 0.7745756506919861,
"num_tokens": 5222003.0,
"step": 320
},
{
"entropy": 0.5365501791238785,
"epoch": 1.2067669172932332,
"grad_norm": 0.03286029398441315,
"learning_rate": 0.0002,
"loss": 0.5431678891181946,
"mean_token_accuracy": 0.7808897346258163,
"num_tokens": 5238368.0,
"step": 321
},
{
"entropy": 0.5435497313737869,
"epoch": 1.2105263157894737,
"grad_norm": 0.03365312144160271,
"learning_rate": 0.0002,
"loss": 0.542516827583313,
"mean_token_accuracy": 0.7798768132925034,
"num_tokens": 5254690.0,
"step": 322
},
{
"entropy": 0.5485272854566574,
"epoch": 1.2142857142857142,
"grad_norm": 0.02945873513817787,
"learning_rate": 0.0002,
"loss": 0.5457643866539001,
"mean_token_accuracy": 0.779216393828392,
"num_tokens": 5270982.0,
"step": 323
},
{
"entropy": 0.5480885654687881,
"epoch": 1.218045112781955,
"grad_norm": 0.03765803202986717,
"learning_rate": 0.0002,
"loss": 0.544890284538269,
"mean_token_accuracy": 0.7774617224931717,
"num_tokens": 5287222.0,
"step": 324
},
{
"entropy": 0.5345787778496742,
"epoch": 1.2218045112781954,
"grad_norm": 0.029292147606611252,
"learning_rate": 0.0002,
"loss": 0.5371191501617432,
"mean_token_accuracy": 0.7809965461492538,
"num_tokens": 5303631.0,
"step": 325
},
{
"entropy": 0.5533891320228577,
"epoch": 1.225563909774436,
"grad_norm": 0.03491590917110443,
"learning_rate": 0.0002,
"loss": 0.5632805228233337,
"mean_token_accuracy": 0.7713405042886734,
"num_tokens": 5319707.0,
"step": 326
},
{
"entropy": 0.5442000329494476,
"epoch": 1.2293233082706767,
"grad_norm": 0.035631779581308365,
"learning_rate": 0.0002,
"loss": 0.5511363744735718,
"mean_token_accuracy": 0.77325139939785,
"num_tokens": 5336015.0,
"step": 327
},
{
"entropy": 0.550067774951458,
"epoch": 1.2330827067669172,
"grad_norm": 0.03429507836699486,
"learning_rate": 0.0002,
"loss": 0.5445730686187744,
"mean_token_accuracy": 0.7788997292518616,
"num_tokens": 5352567.0,
"step": 328
},
{
"entropy": 0.5536926835775375,
"epoch": 1.236842105263158,
"grad_norm": 0.02860317751765251,
"learning_rate": 0.0002,
"loss": 0.5513879656791687,
"mean_token_accuracy": 0.7763962298631668,
"num_tokens": 5368974.0,
"step": 329
},
{
"entropy": 0.5571767240762711,
"epoch": 1.2406015037593985,
"grad_norm": 0.03053511306643486,
"learning_rate": 0.0002,
"loss": 0.5535838007926941,
"mean_token_accuracy": 0.7756504565477371,
"num_tokens": 5385405.0,
"step": 330
},
{
"entropy": 0.5644853711128235,
"epoch": 1.244360902255639,
"grad_norm": 0.02813347429037094,
"learning_rate": 0.0002,
"loss": 0.5661532282829285,
"mean_token_accuracy": 0.7694092392921448,
"num_tokens": 5401733.0,
"step": 331
},
{
"entropy": 0.554289311170578,
"epoch": 1.2481203007518797,
"grad_norm": 0.030001962557435036,
"learning_rate": 0.0002,
"loss": 0.5581742525100708,
"mean_token_accuracy": 0.7724047005176544,
"num_tokens": 5418343.0,
"step": 332
},
{
"entropy": 0.5443666130304337,
"epoch": 1.2518796992481203,
"grad_norm": 0.030697215348482132,
"learning_rate": 0.0002,
"loss": 0.5461480021476746,
"mean_token_accuracy": 0.7806287556886673,
"num_tokens": 5434583.0,
"step": 333
},
{
"entropy": 0.5332125425338745,
"epoch": 1.255639097744361,
"grad_norm": 0.031576018780469894,
"learning_rate": 0.0002,
"loss": 0.535359799861908,
"mean_token_accuracy": 0.7810158431529999,
"num_tokens": 5450746.0,
"step": 334
},
{
"entropy": 0.555268332362175,
"epoch": 1.2593984962406015,
"grad_norm": 0.027363646775484085,
"learning_rate": 0.0002,
"loss": 0.5560035109519958,
"mean_token_accuracy": 0.7736663818359375,
"num_tokens": 5467188.0,
"step": 335
},
{
"entropy": 0.5493292659521103,
"epoch": 1.263157894736842,
"grad_norm": 0.031114885583519936,
"learning_rate": 0.0002,
"loss": 0.5509231090545654,
"mean_token_accuracy": 0.7764100879430771,
"num_tokens": 5483617.0,
"step": 336
},
{
"entropy": 0.5554828643798828,
"epoch": 1.2669172932330828,
"grad_norm": 0.027718449011445045,
"learning_rate": 0.0002,
"loss": 0.5540401339530945,
"mean_token_accuracy": 0.7730122804641724,
"num_tokens": 5499950.0,
"step": 337
},
{
"entropy": 0.5383172035217285,
"epoch": 1.2706766917293233,
"grad_norm": 0.029059337452054024,
"learning_rate": 0.0002,
"loss": 0.5407942533493042,
"mean_token_accuracy": 0.7809923589229584,
"num_tokens": 5516241.0,
"step": 338
},
{
"entropy": 0.5302157253026962,
"epoch": 1.274436090225564,
"grad_norm": 0.030479708686470985,
"learning_rate": 0.0002,
"loss": 0.530126690864563,
"mean_token_accuracy": 0.7863384485244751,
"num_tokens": 5532841.0,
"step": 339
},
{
"entropy": 0.5322539657354355,
"epoch": 1.2781954887218046,
"grad_norm": 0.031503573060035706,
"learning_rate": 0.0002,
"loss": 0.5389677286148071,
"mean_token_accuracy": 0.77957783639431,
"num_tokens": 5549325.0,
"step": 340
},
{
"entropy": 0.5437572598457336,
"epoch": 1.281954887218045,
"grad_norm": 0.027867093682289124,
"learning_rate": 0.0002,
"loss": 0.5459513664245605,
"mean_token_accuracy": 0.7789556235074997,
"num_tokens": 5565810.0,
"step": 341
},
{
"entropy": 0.5430660545825958,
"epoch": 1.2857142857142856,
"grad_norm": 0.03420820087194443,
"learning_rate": 0.0002,
"loss": 0.5441212058067322,
"mean_token_accuracy": 0.7775195837020874,
"num_tokens": 5581844.0,
"step": 342
},
{
"entropy": 0.5310375243425369,
"epoch": 1.2894736842105263,
"grad_norm": 0.03065858967602253,
"learning_rate": 0.0002,
"loss": 0.5356528162956238,
"mean_token_accuracy": 0.7801522761583328,
"num_tokens": 5598042.0,
"step": 343
},
{
"entropy": 0.5220501720905304,
"epoch": 1.2932330827067668,
"grad_norm": 0.029243886470794678,
"learning_rate": 0.0002,
"loss": 0.516523540019989,
"mean_token_accuracy": 0.7906120866537094,
"num_tokens": 5614111.0,
"step": 344
},
{
"entropy": 0.5659748762845993,
"epoch": 1.2969924812030076,
"grad_norm": 0.03555883839726448,
"learning_rate": 0.0002,
"loss": 0.5587096214294434,
"mean_token_accuracy": 0.771675169467926,
"num_tokens": 5630635.0,
"step": 345
},
{
"entropy": 0.5501575618982315,
"epoch": 1.300751879699248,
"grad_norm": 0.030357254669070244,
"learning_rate": 0.0002,
"loss": 0.5473156571388245,
"mean_token_accuracy": 0.7771240919828415,
"num_tokens": 5646994.0,
"step": 346
},
{
"entropy": 0.5270983800292015,
"epoch": 1.3045112781954886,
"grad_norm": 0.030822839587926865,
"learning_rate": 0.0002,
"loss": 0.5363721251487732,
"mean_token_accuracy": 0.7837044894695282,
"num_tokens": 5663472.0,
"step": 347
},
{
"entropy": 0.5483475178480148,
"epoch": 1.3082706766917294,
"grad_norm": 0.03400631621479988,
"learning_rate": 0.0002,
"loss": 0.5550627708435059,
"mean_token_accuracy": 0.7723206877708435,
"num_tokens": 5679878.0,
"step": 348
},
{
"entropy": 0.5459110736846924,
"epoch": 1.3120300751879699,
"grad_norm": 0.028672240674495697,
"learning_rate": 0.0002,
"loss": 0.5484554767608643,
"mean_token_accuracy": 0.7754105031490326,
"num_tokens": 5696124.0,
"step": 349
},
{
"entropy": 0.5513360351324081,
"epoch": 1.3157894736842106,
"grad_norm": 0.029986541718244553,
"learning_rate": 0.0002,
"loss": 0.548675000667572,
"mean_token_accuracy": 0.7767119109630585,
"num_tokens": 5712240.0,
"step": 350
},
{
"entropy": 0.5394999980926514,
"epoch": 1.3195488721804511,
"grad_norm": 0.027749765664339066,
"learning_rate": 0.0002,
"loss": 0.5411927700042725,
"mean_token_accuracy": 0.7794090211391449,
"num_tokens": 5728487.0,
"step": 351
},
{
"entropy": 0.5632177442312241,
"epoch": 1.3233082706766917,
"grad_norm": 0.03165826201438904,
"learning_rate": 0.0002,
"loss": 0.5644969344139099,
"mean_token_accuracy": 0.7739209532737732,
"num_tokens": 5744665.0,
"step": 352
},
{
"entropy": 0.5484495759010315,
"epoch": 1.3270676691729324,
"grad_norm": 0.02855236455798149,
"learning_rate": 0.0002,
"loss": 0.5507109761238098,
"mean_token_accuracy": 0.7781708836555481,
"num_tokens": 5761081.0,
"step": 353
},
{
"entropy": 0.5463808476924896,
"epoch": 1.330827067669173,
"grad_norm": 0.033144768327474594,
"learning_rate": 0.0002,
"loss": 0.5490323901176453,
"mean_token_accuracy": 0.7771764546632767,
"num_tokens": 5777230.0,
"step": 354
},
{
"entropy": 0.559476301074028,
"epoch": 1.3345864661654137,
"grad_norm": 0.030584782361984253,
"learning_rate": 0.0002,
"loss": 0.5653771162033081,
"mean_token_accuracy": 0.7701748311519623,
"num_tokens": 5793509.0,
"step": 355
},
{
"entropy": 0.5580354928970337,
"epoch": 1.3383458646616542,
"grad_norm": 0.029205013066530228,
"learning_rate": 0.0002,
"loss": 0.5602571964263916,
"mean_token_accuracy": 0.7710904181003571,
"num_tokens": 5809901.0,
"step": 356
},
{
"entropy": 0.5673199146986008,
"epoch": 1.3421052631578947,
"grad_norm": 0.03065381944179535,
"learning_rate": 0.0002,
"loss": 0.5655714273452759,
"mean_token_accuracy": 0.7691835165023804,
"num_tokens": 5826128.0,
"step": 357
},
{
"entropy": 0.5535888224840164,
"epoch": 1.3458646616541352,
"grad_norm": 0.028708767145872116,
"learning_rate": 0.0002,
"loss": 0.5483720302581787,
"mean_token_accuracy": 0.7754883170127869,
"num_tokens": 5842416.0,
"step": 358
},
{
"entropy": 0.5565765500068665,
"epoch": 1.349624060150376,
"grad_norm": 0.031074965372681618,
"learning_rate": 0.0002,
"loss": 0.5588751435279846,
"mean_token_accuracy": 0.7724489718675613,
"num_tokens": 5858778.0,
"step": 359
},
{
"entropy": 0.5447706580162048,
"epoch": 1.3533834586466165,
"grad_norm": 0.031974222511053085,
"learning_rate": 0.0002,
"loss": 0.5503548979759216,
"mean_token_accuracy": 0.7767511457204819,
"num_tokens": 5875340.0,
"step": 360
},
{
"entropy": 0.5325894355773926,
"epoch": 1.3571428571428572,
"grad_norm": 0.036680273711681366,
"learning_rate": 0.0002,
"loss": 0.5425075888633728,
"mean_token_accuracy": 0.7785896062850952,
"num_tokens": 5891618.0,
"step": 361
},
{
"entropy": 0.5401211231946945,
"epoch": 1.3609022556390977,
"grad_norm": 0.030604355037212372,
"learning_rate": 0.0002,
"loss": 0.543202817440033,
"mean_token_accuracy": 0.7824591100215912,
"num_tokens": 5907777.0,
"step": 362
},
{
"entropy": 0.548919603228569,
"epoch": 1.3646616541353382,
"grad_norm": 0.02865537256002426,
"learning_rate": 0.0002,
"loss": 0.5504399538040161,
"mean_token_accuracy": 0.7752194404602051,
"num_tokens": 5924266.0,
"step": 363
},
{
"entropy": 0.5391300171613693,
"epoch": 1.368421052631579,
"grad_norm": 0.030051855370402336,
"learning_rate": 0.0002,
"loss": 0.5288874506950378,
"mean_token_accuracy": 0.7848425358533859,
"num_tokens": 5940334.0,
"step": 364
},
{
"entropy": 0.5440739095211029,
"epoch": 1.3721804511278195,
"grad_norm": 0.02727932669222355,
"learning_rate": 0.0002,
"loss": 0.5456202626228333,
"mean_token_accuracy": 0.7774905413389206,
"num_tokens": 5956646.0,
"step": 365
},
{
"entropy": 0.5311928540468216,
"epoch": 1.3759398496240602,
"grad_norm": 0.029294485226273537,
"learning_rate": 0.0002,
"loss": 0.5352226495742798,
"mean_token_accuracy": 0.7806590050458908,
"num_tokens": 5972841.0,
"step": 366
},
{
"entropy": 0.5386375188827515,
"epoch": 1.3796992481203008,
"grad_norm": 0.034396879374980927,
"learning_rate": 0.0002,
"loss": 0.5386478304862976,
"mean_token_accuracy": 0.780673161149025,
"num_tokens": 5989110.0,
"step": 367
},
{
"entropy": 0.5205325111746788,
"epoch": 1.3834586466165413,
"grad_norm": 0.028440408408641815,
"learning_rate": 0.0002,
"loss": 0.524253249168396,
"mean_token_accuracy": 0.7875637263059616,
"num_tokens": 6005130.0,
"step": 368
},
{
"entropy": 0.5718593895435333,
"epoch": 1.387218045112782,
"grad_norm": 0.03535715863108635,
"learning_rate": 0.0002,
"loss": 0.5674105882644653,
"mean_token_accuracy": 0.7696711122989655,
"num_tokens": 6021765.0,
"step": 369
},
{
"entropy": 0.5570171922445297,
"epoch": 1.3909774436090225,
"grad_norm": 0.02890731766819954,
"learning_rate": 0.0002,
"loss": 0.5550771951675415,
"mean_token_accuracy": 0.7735273241996765,
"num_tokens": 6038195.0,
"step": 370
},
{
"entropy": 0.5555340945720673,
"epoch": 1.3947368421052633,
"grad_norm": 0.03310281038284302,
"learning_rate": 0.0002,
"loss": 0.5569556951522827,
"mean_token_accuracy": 0.7722765356302261,
"num_tokens": 6054869.0,
"step": 371
},
{
"entropy": 0.5339787155389786,
"epoch": 1.3984962406015038,
"grad_norm": 0.0280836783349514,
"learning_rate": 0.0002,
"loss": 0.5336146354675293,
"mean_token_accuracy": 0.7833946198225021,
"num_tokens": 6071026.0,
"step": 372
},
{
"entropy": 0.5382460206747055,
"epoch": 1.4022556390977443,
"grad_norm": 0.028865907341241837,
"learning_rate": 0.0002,
"loss": 0.5415489077568054,
"mean_token_accuracy": 0.7795161455869675,
"num_tokens": 6087218.0,
"step": 373
},
{
"entropy": 0.5312956869602203,
"epoch": 1.4060150375939848,
"grad_norm": 0.029321739450097084,
"learning_rate": 0.0002,
"loss": 0.5310655832290649,
"mean_token_accuracy": 0.7824108898639679,
"num_tokens": 6103644.0,
"step": 374
},
{
"entropy": 0.5470356345176697,
"epoch": 1.4097744360902256,
"grad_norm": 0.035155754536390305,
"learning_rate": 0.0002,
"loss": 0.5525869131088257,
"mean_token_accuracy": 0.7761145532131195,
"num_tokens": 6120051.0,
"step": 375
},
{
"entropy": 0.5374057814478874,
"epoch": 1.413533834586466,
"grad_norm": 0.029863376170396805,
"learning_rate": 0.0002,
"loss": 0.542983889579773,
"mean_token_accuracy": 0.7801049947738647,
"num_tokens": 6136168.0,
"step": 376
},
{
"entropy": 0.5664133429527283,
"epoch": 1.4172932330827068,
"grad_norm": 0.04531969875097275,
"learning_rate": 0.0002,
"loss": 0.5716960430145264,
"mean_token_accuracy": 0.7669987082481384,
"num_tokens": 6152503.0,
"step": 377
},
{
"entropy": 0.5445482283830643,
"epoch": 1.4210526315789473,
"grad_norm": 0.031349968165159225,
"learning_rate": 0.0002,
"loss": 0.5467873811721802,
"mean_token_accuracy": 0.7808011472225189,
"num_tokens": 6168685.0,
"step": 378
},
{
"entropy": 0.5332349240779877,
"epoch": 1.4248120300751879,
"grad_norm": 0.03072705864906311,
"learning_rate": 0.0002,
"loss": 0.5336711406707764,
"mean_token_accuracy": 0.785218134522438,
"num_tokens": 6185265.0,
"step": 379
},
{
"entropy": 0.5406992584466934,
"epoch": 1.4285714285714286,
"grad_norm": 0.03197013586759567,
"learning_rate": 0.0002,
"loss": 0.535304605960846,
"mean_token_accuracy": 0.781609907746315,
"num_tokens": 6201359.0,
"step": 380
},
{
"entropy": 0.5503518134355545,
"epoch": 1.4323308270676691,
"grad_norm": 0.02861807495355606,
"learning_rate": 0.0002,
"loss": 0.5474637746810913,
"mean_token_accuracy": 0.7788266986608505,
"num_tokens": 6217636.0,
"step": 381
},
{
"entropy": 0.5336224138736725,
"epoch": 1.4360902255639099,
"grad_norm": 0.03593042492866516,
"learning_rate": 0.0002,
"loss": 0.5366555452346802,
"mean_token_accuracy": 0.7802215367555618,
"num_tokens": 6234047.0,
"step": 382
},
{
"entropy": 0.5492585748434067,
"epoch": 1.4398496240601504,
"grad_norm": 0.02969398722052574,
"learning_rate": 0.0002,
"loss": 0.5519292950630188,
"mean_token_accuracy": 0.77450992166996,
"num_tokens": 6250372.0,
"step": 383
},
{
"entropy": 0.5435014069080353,
"epoch": 1.443609022556391,
"grad_norm": 0.03131045401096344,
"learning_rate": 0.0002,
"loss": 0.5428797602653503,
"mean_token_accuracy": 0.7789845615625381,
"num_tokens": 6266490.0,
"step": 384
},
{
"entropy": 0.5582468658685684,
"epoch": 1.4473684210526316,
"grad_norm": 0.0334627628326416,
"learning_rate": 0.0002,
"loss": 0.5606057047843933,
"mean_token_accuracy": 0.7737329006195068,
"num_tokens": 6282965.0,
"step": 385
},
{
"entropy": 0.5667697936296463,
"epoch": 1.4511278195488722,
"grad_norm": 0.031320203095674515,
"learning_rate": 0.0002,
"loss": 0.5704291462898254,
"mean_token_accuracy": 0.7688294649124146,
"num_tokens": 6299265.0,
"step": 386
},
{
"entropy": 0.5566418468952179,
"epoch": 1.454887218045113,
"grad_norm": 0.04116431251168251,
"learning_rate": 0.0002,
"loss": 0.5568630695343018,
"mean_token_accuracy": 0.774201288819313,
"num_tokens": 6315434.0,
"step": 387
},
{
"entropy": 0.5492933839559555,
"epoch": 1.4586466165413534,
"grad_norm": 0.02759244106709957,
"learning_rate": 0.0002,
"loss": 0.5531164407730103,
"mean_token_accuracy": 0.7763701528310776,
"num_tokens": 6331760.0,
"step": 388
},
{
"entropy": 0.5672035366296768,
"epoch": 1.462406015037594,
"grad_norm": 0.03223001956939697,
"learning_rate": 0.0002,
"loss": 0.56959068775177,
"mean_token_accuracy": 0.768874928355217,
"num_tokens": 6348346.0,
"step": 389
},
{
"entropy": 0.5533206462860107,
"epoch": 1.4661654135338344,
"grad_norm": 0.03371699899435043,
"learning_rate": 0.0002,
"loss": 0.5532012581825256,
"mean_token_accuracy": 0.7752765119075775,
"num_tokens": 6364905.0,
"step": 390
},
{
"entropy": 0.5474317967891693,
"epoch": 1.4699248120300752,
"grad_norm": 0.033150747418403625,
"learning_rate": 0.0002,
"loss": 0.5470337867736816,
"mean_token_accuracy": 0.776570737361908,
"num_tokens": 6381253.0,
"step": 391
},
{
"entropy": 0.5514713823795319,
"epoch": 1.4736842105263157,
"grad_norm": 0.03456156328320503,
"learning_rate": 0.0002,
"loss": 0.5495055317878723,
"mean_token_accuracy": 0.7780424803495407,
"num_tokens": 6397488.0,
"step": 392
},
{
"entropy": 0.524335652589798,
"epoch": 1.4774436090225564,
"grad_norm": 0.0276760496199131,
"learning_rate": 0.0002,
"loss": 0.5228588581085205,
"mean_token_accuracy": 0.7869584411382675,
"num_tokens": 6413858.0,
"step": 393
},
{
"entropy": 0.5439832955598831,
"epoch": 1.481203007518797,
"grad_norm": 0.030009951442480087,
"learning_rate": 0.0002,
"loss": 0.5459988117218018,
"mean_token_accuracy": 0.7772574722766876,
"num_tokens": 6430056.0,
"step": 394
},
{
"entropy": 0.558243066072464,
"epoch": 1.4849624060150375,
"grad_norm": 0.03417029604315758,
"learning_rate": 0.0002,
"loss": 0.551323652267456,
"mean_token_accuracy": 0.7783164083957672,
"num_tokens": 6446633.0,
"step": 395
},
{
"entropy": 0.5622076392173767,
"epoch": 1.4887218045112782,
"grad_norm": 0.030520809814333916,
"learning_rate": 0.0002,
"loss": 0.5651980638504028,
"mean_token_accuracy": 0.7693700790405273,
"num_tokens": 6463061.0,
"step": 396
},
{
"entropy": 0.5262496769428253,
"epoch": 1.4924812030075187,
"grad_norm": 0.03385322168469429,
"learning_rate": 0.0002,
"loss": 0.5383599400520325,
"mean_token_accuracy": 0.7795081436634064,
"num_tokens": 6479394.0,
"step": 397
},
{
"entropy": 0.5428214818239212,
"epoch": 1.4962406015037595,
"grad_norm": 0.0344393290579319,
"learning_rate": 0.0002,
"loss": 0.5506508350372314,
"mean_token_accuracy": 0.776181235909462,
"num_tokens": 6495837.0,
"step": 398
},
{
"entropy": 0.5589512288570404,
"epoch": 1.5,
"grad_norm": 0.031076369807124138,
"learning_rate": 0.0002,
"loss": 0.5615136027336121,
"mean_token_accuracy": 0.7719069272279739,
"num_tokens": 6512096.0,
"step": 399
},
{
"entropy": 0.560438871383667,
"epoch": 1.5037593984962405,
"grad_norm": 0.03327278420329094,
"learning_rate": 0.0002,
"loss": 0.5491290092468262,
"mean_token_accuracy": 0.7760379314422607,
"num_tokens": 6528380.0,
"step": 400
},
{
"entropy": 0.543613851070404,
"epoch": 1.5075187969924813,
"grad_norm": 0.03218228369951248,
"learning_rate": 0.0002,
"loss": 0.5404437780380249,
"mean_token_accuracy": 0.7790963053703308,
"num_tokens": 6544607.0,
"step": 401
},
{
"entropy": 0.5582986176013947,
"epoch": 1.5112781954887218,
"grad_norm": 0.031328245997428894,
"learning_rate": 0.0002,
"loss": 0.5539280772209167,
"mean_token_accuracy": 0.7730978429317474,
"num_tokens": 6561161.0,
"step": 402
},
{
"entropy": 0.5439886897802353,
"epoch": 1.5150375939849625,
"grad_norm": 0.0315370075404644,
"learning_rate": 0.0002,
"loss": 0.5494069457054138,
"mean_token_accuracy": 0.77658711373806,
"num_tokens": 6577494.0,
"step": 403
},
{
"entropy": 0.5441574305295944,
"epoch": 1.518796992481203,
"grad_norm": 0.029565030708909035,
"learning_rate": 0.0002,
"loss": 0.5542066097259521,
"mean_token_accuracy": 0.7728031128644943,
"num_tokens": 6593864.0,
"step": 404
},
{
"entropy": 0.5381332039833069,
"epoch": 1.5225563909774436,
"grad_norm": 0.030989129096269608,
"learning_rate": 0.0002,
"loss": 0.5439568758010864,
"mean_token_accuracy": 0.7784450650215149,
"num_tokens": 6610189.0,
"step": 405
},
{
"entropy": 0.5451879501342773,
"epoch": 1.526315789473684,
"grad_norm": 0.030062349513173103,
"learning_rate": 0.0002,
"loss": 0.5435837507247925,
"mean_token_accuracy": 0.7782586812973022,
"num_tokens": 6626574.0,
"step": 406
},
{
"entropy": 0.5333066508173943,
"epoch": 1.5300751879699248,
"grad_norm": 0.02931753545999527,
"learning_rate": 0.0002,
"loss": 0.52620530128479,
"mean_token_accuracy": 0.784236952662468,
"num_tokens": 6642855.0,
"step": 407
},
{
"entropy": 0.5590699911117554,
"epoch": 1.5338345864661656,
"grad_norm": 0.03177345171570778,
"learning_rate": 0.0002,
"loss": 0.5554062128067017,
"mean_token_accuracy": 0.7730756998062134,
"num_tokens": 6659323.0,
"step": 408
},
{
"entropy": 0.5350319743156433,
"epoch": 1.537593984962406,
"grad_norm": 0.033441949635744095,
"learning_rate": 0.0002,
"loss": 0.5428333282470703,
"mean_token_accuracy": 0.7798242121934891,
"num_tokens": 6675571.0,
"step": 409
},
{
"entropy": 0.5449950993061066,
"epoch": 1.5413533834586466,
"grad_norm": 0.03087989240884781,
"learning_rate": 0.0002,
"loss": 0.550757646560669,
"mean_token_accuracy": 0.7777638882398605,
"num_tokens": 6692022.0,
"step": 410
},
{
"entropy": 0.5534456223249435,
"epoch": 1.545112781954887,
"grad_norm": 0.030627673491835594,
"learning_rate": 0.0002,
"loss": 0.5566884875297546,
"mean_token_accuracy": 0.7747643887996674,
"num_tokens": 6708348.0,
"step": 411
},
{
"entropy": 0.5696779191493988,
"epoch": 1.5488721804511278,
"grad_norm": 0.029869280755519867,
"learning_rate": 0.0002,
"loss": 0.5629582405090332,
"mean_token_accuracy": 0.7705719769001007,
"num_tokens": 6725016.0,
"step": 412
},
{
"entropy": 0.5336505770683289,
"epoch": 1.5526315789473686,
"grad_norm": 0.02911611832678318,
"learning_rate": 0.0002,
"loss": 0.5279027223587036,
"mean_token_accuracy": 0.783367246389389,
"num_tokens": 6741327.0,
"step": 413
},
{
"entropy": 0.5392275899648666,
"epoch": 1.556390977443609,
"grad_norm": 0.02994578517973423,
"learning_rate": 0.0002,
"loss": 0.5416238307952881,
"mean_token_accuracy": 0.7807497531175613,
"num_tokens": 6757440.0,
"step": 414
},
{
"entropy": 0.5460323542356491,
"epoch": 1.5601503759398496,
"grad_norm": 0.03534119576215744,
"learning_rate": 0.0002,
"loss": 0.5568557977676392,
"mean_token_accuracy": 0.7705673724412918,
"num_tokens": 6773654.0,
"step": 415
},
{
"entropy": 0.5286229997873306,
"epoch": 1.5639097744360901,
"grad_norm": 0.029811112210154533,
"learning_rate": 0.0002,
"loss": 0.5318726301193237,
"mean_token_accuracy": 0.7832337915897369,
"num_tokens": 6789752.0,
"step": 416
},
{
"entropy": 0.5552769899368286,
"epoch": 1.5676691729323309,
"grad_norm": 0.030895395204424858,
"learning_rate": 0.0002,
"loss": 0.5534340739250183,
"mean_token_accuracy": 0.7729407846927643,
"num_tokens": 6805849.0,
"step": 417
},
{
"entropy": 0.5429228097200394,
"epoch": 1.5714285714285714,
"grad_norm": 0.02707672491669655,
"learning_rate": 0.0002,
"loss": 0.5381065607070923,
"mean_token_accuracy": 0.7819552570581436,
"num_tokens": 6822408.0,
"step": 418
},
{
"entropy": 0.5434612482786179,
"epoch": 1.5751879699248121,
"grad_norm": 0.031254079192876816,
"learning_rate": 0.0002,
"loss": 0.5391129851341248,
"mean_token_accuracy": 0.7833003848791122,
"num_tokens": 6838597.0,
"step": 419
},
{
"entropy": 0.5366530418395996,
"epoch": 1.5789473684210527,
"grad_norm": 0.03022637590765953,
"learning_rate": 0.0002,
"loss": 0.5400729179382324,
"mean_token_accuracy": 0.7778450101613998,
"num_tokens": 6854952.0,
"step": 420
},
{
"entropy": 0.5444828122854233,
"epoch": 1.5827067669172932,
"grad_norm": 0.031558163464069366,
"learning_rate": 0.0002,
"loss": 0.5507203936576843,
"mean_token_accuracy": 0.7739860564470291,
"num_tokens": 6871383.0,
"step": 421
},
{
"entropy": 0.5397373139858246,
"epoch": 1.5864661654135337,
"grad_norm": 0.03590668365359306,
"learning_rate": 0.0002,
"loss": 0.5495097041130066,
"mean_token_accuracy": 0.7745723277330399,
"num_tokens": 6887614.0,
"step": 422
},
{
"entropy": 0.5547508299350739,
"epoch": 1.5902255639097744,
"grad_norm": 0.03271407634019852,
"learning_rate": 0.0002,
"loss": 0.5595258474349976,
"mean_token_accuracy": 0.7740814536809921,
"num_tokens": 6903891.0,
"step": 423
},
{
"entropy": 0.5452055484056473,
"epoch": 1.5939849624060152,
"grad_norm": 0.034447524696588516,
"learning_rate": 0.0002,
"loss": 0.5422000288963318,
"mean_token_accuracy": 0.7810980677604675,
"num_tokens": 6920317.0,
"step": 424
},
{
"entropy": 0.5475759953260422,
"epoch": 1.5977443609022557,
"grad_norm": 0.027404673397541046,
"learning_rate": 0.0002,
"loss": 0.5450745820999146,
"mean_token_accuracy": 0.7764957696199417,
"num_tokens": 6936706.0,
"step": 425
},
{
"entropy": 0.5484007894992828,
"epoch": 1.6015037593984962,
"grad_norm": 0.031125633046030998,
"learning_rate": 0.0002,
"loss": 0.5480135083198547,
"mean_token_accuracy": 0.7771385014057159,
"num_tokens": 6952874.0,
"step": 426
},
{
"entropy": 0.5364782959222794,
"epoch": 1.6052631578947367,
"grad_norm": 0.029450541362166405,
"learning_rate": 0.0002,
"loss": 0.5340723395347595,
"mean_token_accuracy": 0.7846143394708633,
"num_tokens": 6969087.0,
"step": 427
},
{
"entropy": 0.5632024109363556,
"epoch": 1.6090225563909775,
"grad_norm": 0.03085445798933506,
"learning_rate": 0.0002,
"loss": 0.56367427110672,
"mean_token_accuracy": 0.7722935974597931,
"num_tokens": 6985519.0,
"step": 428
},
{
"entropy": 0.5589936077594757,
"epoch": 1.6127819548872182,
"grad_norm": 0.03428523615002632,
"learning_rate": 0.0002,
"loss": 0.5611156225204468,
"mean_token_accuracy": 0.7728175222873688,
"num_tokens": 7001978.0,
"step": 429
},
{
"entropy": 0.5625983476638794,
"epoch": 1.6165413533834587,
"grad_norm": 0.03059856966137886,
"learning_rate": 0.0002,
"loss": 0.5613099932670593,
"mean_token_accuracy": 0.7710365056991577,
"num_tokens": 7018277.0,
"step": 430
},
{
"entropy": 0.5519939213991165,
"epoch": 1.6203007518796992,
"grad_norm": 0.030437655746936798,
"learning_rate": 0.0002,
"loss": 0.545467734336853,
"mean_token_accuracy": 0.778165876865387,
"num_tokens": 7034622.0,
"step": 431
},
{
"entropy": 0.5278475731611252,
"epoch": 1.6240601503759398,
"grad_norm": 0.027164338156580925,
"learning_rate": 0.0002,
"loss": 0.5260958075523376,
"mean_token_accuracy": 0.7867996096611023,
"num_tokens": 7050833.0,
"step": 432
},
{
"entropy": 0.5364744961261749,
"epoch": 1.6278195488721805,
"grad_norm": 0.02916925586760044,
"learning_rate": 0.0002,
"loss": 0.5371173024177551,
"mean_token_accuracy": 0.7820777744054794,
"num_tokens": 7067201.0,
"step": 433
},
{
"entropy": 0.5432325303554535,
"epoch": 1.631578947368421,
"grad_norm": 0.02878529019653797,
"learning_rate": 0.0002,
"loss": 0.5453219413757324,
"mean_token_accuracy": 0.7784911543130875,
"num_tokens": 7083919.0,
"step": 434
},
{
"entropy": 0.5461350232362747,
"epoch": 1.6353383458646618,
"grad_norm": 0.030911264941096306,
"learning_rate": 0.0002,
"loss": 0.5520428419113159,
"mean_token_accuracy": 0.7748389393091202,
"num_tokens": 7100167.0,
"step": 435
},
{
"entropy": 0.5301318913698196,
"epoch": 1.6390977443609023,
"grad_norm": 0.0337194949388504,
"learning_rate": 0.0002,
"loss": 0.533911406993866,
"mean_token_accuracy": 0.781144917011261,
"num_tokens": 7115963.0,
"step": 436
},
{
"entropy": 0.554198831319809,
"epoch": 1.6428571428571428,
"grad_norm": 0.03273259475827217,
"learning_rate": 0.0002,
"loss": 0.5581203699111938,
"mean_token_accuracy": 0.7747189700603485,
"num_tokens": 7132343.0,
"step": 437
},
{
"entropy": 0.5451264977455139,
"epoch": 1.6466165413533833,
"grad_norm": 0.028795765712857246,
"learning_rate": 0.0002,
"loss": 0.5419780015945435,
"mean_token_accuracy": 0.7782856971025467,
"num_tokens": 7148711.0,
"step": 438
},
{
"entropy": 0.5696405470371246,
"epoch": 1.650375939849624,
"grad_norm": 0.02880324050784111,
"learning_rate": 0.0002,
"loss": 0.568999171257019,
"mean_token_accuracy": 0.7674362361431122,
"num_tokens": 7165000.0,
"step": 439
},
{
"entropy": 0.5544975996017456,
"epoch": 1.6541353383458648,
"grad_norm": 0.0319298580288887,
"learning_rate": 0.0002,
"loss": 0.5572612881660461,
"mean_token_accuracy": 0.7738819718360901,
"num_tokens": 7181178.0,
"step": 440
},
{
"entropy": 0.5648850053548813,
"epoch": 1.6578947368421053,
"grad_norm": 0.033446941524744034,
"learning_rate": 0.0002,
"loss": 0.5726531147956848,
"mean_token_accuracy": 0.767191156744957,
"num_tokens": 7197682.0,
"step": 441
},
{
"entropy": 0.5558575242757797,
"epoch": 1.6616541353383458,
"grad_norm": 0.02976951375603676,
"learning_rate": 0.0002,
"loss": 0.5575220584869385,
"mean_token_accuracy": 0.7738383561372757,
"num_tokens": 7214036.0,
"step": 442
},
{
"entropy": 0.5415066331624985,
"epoch": 1.6654135338345863,
"grad_norm": 0.03178182989358902,
"learning_rate": 0.0002,
"loss": 0.5425861477851868,
"mean_token_accuracy": 0.777436301112175,
"num_tokens": 7230232.0,
"step": 443
},
{
"entropy": 0.5568071007728577,
"epoch": 1.669172932330827,
"grad_norm": 0.029093647375702858,
"learning_rate": 0.0002,
"loss": 0.5502623319625854,
"mean_token_accuracy": 0.7746951729059219,
"num_tokens": 7246458.0,
"step": 444
},
{
"entropy": 0.5455858707427979,
"epoch": 1.6729323308270678,
"grad_norm": 0.03103097900748253,
"learning_rate": 0.0002,
"loss": 0.5415849685668945,
"mean_token_accuracy": 0.7773046642541885,
"num_tokens": 7262757.0,
"step": 445
},
{
"entropy": 0.5557373017072678,
"epoch": 1.6766917293233083,
"grad_norm": 0.034459494054317474,
"learning_rate": 0.0002,
"loss": 0.5588368773460388,
"mean_token_accuracy": 0.7731840759515762,
"num_tokens": 7279011.0,
"step": 446
},
{
"entropy": 0.536065399646759,
"epoch": 1.6804511278195489,
"grad_norm": 0.030954651534557343,
"learning_rate": 0.0002,
"loss": 0.5398183465003967,
"mean_token_accuracy": 0.778962567448616,
"num_tokens": 7295450.0,
"step": 447
},
{
"entropy": 0.5364357531070709,
"epoch": 1.6842105263157894,
"grad_norm": 0.03524971008300781,
"learning_rate": 0.0002,
"loss": 0.5447929501533508,
"mean_token_accuracy": 0.7776346057653427,
"num_tokens": 7311638.0,
"step": 448
},
{
"entropy": 0.5611797869205475,
"epoch": 1.6879699248120301,
"grad_norm": 0.02808379754424095,
"learning_rate": 0.0002,
"loss": 0.5557354688644409,
"mean_token_accuracy": 0.7739097476005554,
"num_tokens": 7327872.0,
"step": 449
},
{
"entropy": 0.5732033550739288,
"epoch": 1.6917293233082706,
"grad_norm": 0.03260007128119469,
"learning_rate": 0.0002,
"loss": 0.5591524839401245,
"mean_token_accuracy": 0.775033637881279,
"num_tokens": 7344324.0,
"step": 450
},
{
"entropy": 0.5342790335416794,
"epoch": 1.6954887218045114,
"grad_norm": 0.02984827756881714,
"learning_rate": 0.0002,
"loss": 0.5380273461341858,
"mean_token_accuracy": 0.782566487789154,
"num_tokens": 7360753.0,
"step": 451
},
{
"entropy": 0.5318778306245804,
"epoch": 1.699248120300752,
"grad_norm": 0.03279503807425499,
"learning_rate": 0.0002,
"loss": 0.544060468673706,
"mean_token_accuracy": 0.7762828469276428,
"num_tokens": 7377154.0,
"step": 452
},
{
"entropy": 0.5356487184762955,
"epoch": 1.7030075187969924,
"grad_norm": 0.03332759812474251,
"learning_rate": 0.0002,
"loss": 0.548007607460022,
"mean_token_accuracy": 0.7769170254468918,
"num_tokens": 7393621.0,
"step": 453
},
{
"entropy": 0.5513975322246552,
"epoch": 1.706766917293233,
"grad_norm": 0.03238146752119064,
"learning_rate": 0.0002,
"loss": 0.5592359900474548,
"mean_token_accuracy": 0.7740825116634369,
"num_tokens": 7409899.0,
"step": 454
},
{
"entropy": 0.5548000931739807,
"epoch": 1.7105263157894737,
"grad_norm": 0.02822866663336754,
"learning_rate": 0.0002,
"loss": 0.5497517585754395,
"mean_token_accuracy": 0.776210606098175,
"num_tokens": 7426237.0,
"step": 455
},
{
"entropy": 0.5756575465202332,
"epoch": 1.7142857142857144,
"grad_norm": 0.027675755321979523,
"learning_rate": 0.0002,
"loss": 0.5697333812713623,
"mean_token_accuracy": 0.7680118083953857,
"num_tokens": 7442768.0,
"step": 456
},
{
"entropy": 0.5417828410863876,
"epoch": 1.718045112781955,
"grad_norm": 0.033404842019081116,
"learning_rate": 0.0002,
"loss": 0.5454074740409851,
"mean_token_accuracy": 0.7808687537908554,
"num_tokens": 7459143.0,
"step": 457
},
{
"entropy": 0.5427983999252319,
"epoch": 1.7218045112781954,
"grad_norm": 0.03309955820441246,
"learning_rate": 0.0002,
"loss": 0.5416461825370789,
"mean_token_accuracy": 0.7808773517608643,
"num_tokens": 7475461.0,
"step": 458
},
{
"entropy": 0.5505435466766357,
"epoch": 1.725563909774436,
"grad_norm": 0.034179892390966415,
"learning_rate": 0.0002,
"loss": 0.5560557246208191,
"mean_token_accuracy": 0.7720683664083481,
"num_tokens": 7491762.0,
"step": 459
},
{
"entropy": 0.5398002862930298,
"epoch": 1.7293233082706767,
"grad_norm": 0.036437805742025375,
"learning_rate": 0.0002,
"loss": 0.5529733896255493,
"mean_token_accuracy": 0.7730463594198227,
"num_tokens": 7507801.0,
"step": 460
},
{
"entropy": 0.5538046360015869,
"epoch": 1.7330827067669174,
"grad_norm": 0.038074180483818054,
"learning_rate": 0.0002,
"loss": 0.5474164485931396,
"mean_token_accuracy": 0.7738546878099442,
"num_tokens": 7524195.0,
"step": 461
},
{
"entropy": 0.5446304082870483,
"epoch": 1.736842105263158,
"grad_norm": 0.028863312676548958,
"learning_rate": 0.0002,
"loss": 0.534104585647583,
"mean_token_accuracy": 0.7812709957361221,
"num_tokens": 7540346.0,
"step": 462
},
{
"entropy": 0.5635255128145218,
"epoch": 1.7406015037593985,
"grad_norm": 0.0377831794321537,
"learning_rate": 0.0002,
"loss": 0.5565074682235718,
"mean_token_accuracy": 0.7726516425609589,
"num_tokens": 7556361.0,
"step": 463
},
{
"entropy": 0.5520550906658173,
"epoch": 1.744360902255639,
"grad_norm": 0.027316391468048096,
"learning_rate": 0.0002,
"loss": 0.5496057868003845,
"mean_token_accuracy": 0.7767691016197205,
"num_tokens": 7572407.0,
"step": 464
},
{
"entropy": 0.5517378151416779,
"epoch": 1.7481203007518797,
"grad_norm": 0.03549322485923767,
"learning_rate": 0.0002,
"loss": 0.5542277097702026,
"mean_token_accuracy": 0.7771301567554474,
"num_tokens": 7588716.0,
"step": 465
},
{
"entropy": 0.5447746813297272,
"epoch": 1.7518796992481203,
"grad_norm": 0.03821020945906639,
"learning_rate": 0.0002,
"loss": 0.558238685131073,
"mean_token_accuracy": 0.7732566744089127,
"num_tokens": 7604921.0,
"step": 466
},
{
"entropy": 0.5422779768705368,
"epoch": 1.755639097744361,
"grad_norm": 0.03218455985188484,
"learning_rate": 0.0002,
"loss": 0.549083411693573,
"mean_token_accuracy": 0.7762202769517899,
"num_tokens": 7621109.0,
"step": 467
},
{
"entropy": 0.5479860007762909,
"epoch": 1.7593984962406015,
"grad_norm": 0.03186026215553284,
"learning_rate": 0.0002,
"loss": 0.5414553880691528,
"mean_token_accuracy": 0.7800420671701431,
"num_tokens": 7637434.0,
"step": 468
},
{
"entropy": 0.5488834828138351,
"epoch": 1.763157894736842,
"grad_norm": 0.030316263437271118,
"learning_rate": 0.0002,
"loss": 0.5371969938278198,
"mean_token_accuracy": 0.7800302803516388,
"num_tokens": 7653708.0,
"step": 469
},
{
"entropy": 0.5712478011846542,
"epoch": 1.7669172932330826,
"grad_norm": 0.0292644202709198,
"learning_rate": 0.0002,
"loss": 0.5641398429870605,
"mean_token_accuracy": 0.7701270431280136,
"num_tokens": 7670165.0,
"step": 470
},
{
"entropy": 0.5487608909606934,
"epoch": 1.7706766917293233,
"grad_norm": 0.029384015128016472,
"learning_rate": 0.0002,
"loss": 0.5528495907783508,
"mean_token_accuracy": 0.7725293934345245,
"num_tokens": 7686546.0,
"step": 471
},
{
"entropy": 0.5485792607069016,
"epoch": 1.774436090225564,
"grad_norm": 0.03848496824502945,
"learning_rate": 0.0002,
"loss": 0.557949960231781,
"mean_token_accuracy": 0.7736170142889023,
"num_tokens": 7703199.0,
"step": 472
},
{
"entropy": 0.5328742563724518,
"epoch": 1.7781954887218046,
"grad_norm": 0.029961325228214264,
"learning_rate": 0.0002,
"loss": 0.5426016449928284,
"mean_token_accuracy": 0.7784318327903748,
"num_tokens": 7719414.0,
"step": 473
},
{
"entropy": 0.5418206453323364,
"epoch": 1.781954887218045,
"grad_norm": 0.03003692626953125,
"learning_rate": 0.0002,
"loss": 0.543552815914154,
"mean_token_accuracy": 0.777516707777977,
"num_tokens": 7735591.0,
"step": 474
},
{
"entropy": 0.5588981062173843,
"epoch": 1.7857142857142856,
"grad_norm": 0.035983212292194366,
"learning_rate": 0.0002,
"loss": 0.5562595725059509,
"mean_token_accuracy": 0.7752551138401031,
"num_tokens": 7751978.0,
"step": 475
},
{
"entropy": 0.5337852984666824,
"epoch": 1.7894736842105263,
"grad_norm": 0.030708249658346176,
"learning_rate": 0.0002,
"loss": 0.5263274312019348,
"mean_token_accuracy": 0.7854783833026886,
"num_tokens": 7768537.0,
"step": 476
},
{
"entropy": 0.5388501137495041,
"epoch": 1.793233082706767,
"grad_norm": 0.034256935119628906,
"learning_rate": 0.0002,
"loss": 0.5432993173599243,
"mean_token_accuracy": 0.7769720703363419,
"num_tokens": 7784830.0,
"step": 477
},
{
"entropy": 0.5526683777570724,
"epoch": 1.7969924812030076,
"grad_norm": 0.030191054567694664,
"learning_rate": 0.0002,
"loss": 0.5529841184616089,
"mean_token_accuracy": 0.774674654006958,
"num_tokens": 7801305.0,
"step": 478
},
{
"entropy": 0.5205394625663757,
"epoch": 1.800751879699248,
"grad_norm": 0.03705041483044624,
"learning_rate": 0.0002,
"loss": 0.5320290327072144,
"mean_token_accuracy": 0.7844933271408081,
"num_tokens": 7817468.0,
"step": 479
},
{
"entropy": 0.5391060262918472,
"epoch": 1.8045112781954886,
"grad_norm": 0.03425837680697441,
"learning_rate": 0.0002,
"loss": 0.5482912659645081,
"mean_token_accuracy": 0.7772899568080902,
"num_tokens": 7833783.0,
"step": 480
},
{
"entropy": 0.5595878064632416,
"epoch": 1.8082706766917294,
"grad_norm": 0.03261560574173927,
"learning_rate": 0.0002,
"loss": 0.5595347881317139,
"mean_token_accuracy": 0.7739517390727997,
"num_tokens": 7850116.0,
"step": 481
},
{
"entropy": 0.5623766779899597,
"epoch": 1.8120300751879699,
"grad_norm": 0.030305257067084312,
"learning_rate": 0.0002,
"loss": 0.5494015216827393,
"mean_token_accuracy": 0.7756963670253754,
"num_tokens": 7866336.0,
"step": 482
},
{
"entropy": 0.5707903653383255,
"epoch": 1.8157894736842106,
"grad_norm": 0.030717138200998306,
"learning_rate": 0.0002,
"loss": 0.5605000257492065,
"mean_token_accuracy": 0.7702891528606415,
"num_tokens": 7882899.0,
"step": 483
},
{
"entropy": 0.5296159312129021,
"epoch": 1.8195488721804511,
"grad_norm": 0.03342661261558533,
"learning_rate": 0.0002,
"loss": 0.5307406783103943,
"mean_token_accuracy": 0.7850563228130341,
"num_tokens": 7899131.0,
"step": 484
},
{
"entropy": 0.545372724533081,
"epoch": 1.8233082706766917,
"grad_norm": 0.0327008031308651,
"learning_rate": 0.0002,
"loss": 0.5443350076675415,
"mean_token_accuracy": 0.7800664007663727,
"num_tokens": 7915449.0,
"step": 485
},
{
"entropy": 0.5288603901863098,
"epoch": 1.8270676691729322,
"grad_norm": 0.03246629983186722,
"learning_rate": 0.0002,
"loss": 0.5420411229133606,
"mean_token_accuracy": 0.779539629817009,
"num_tokens": 7931703.0,
"step": 486
},
{
"entropy": 0.5476890802383423,
"epoch": 1.830827067669173,
"grad_norm": 0.03365527465939522,
"learning_rate": 0.0002,
"loss": 0.5550553798675537,
"mean_token_accuracy": 0.7729549556970596,
"num_tokens": 7948074.0,
"step": 487
},
{
"entropy": 0.5389307886362076,
"epoch": 1.8345864661654137,
"grad_norm": 0.036491431295871735,
"learning_rate": 0.0002,
"loss": 0.5469198822975159,
"mean_token_accuracy": 0.7751343995332718,
"num_tokens": 7964150.0,
"step": 488
},
{
"entropy": 0.5449552834033966,
"epoch": 1.8383458646616542,
"grad_norm": 0.03082645684480667,
"learning_rate": 0.0002,
"loss": 0.5452861189842224,
"mean_token_accuracy": 0.7780899852514267,
"num_tokens": 7980409.0,
"step": 489
},
{
"entropy": 0.5490948259830475,
"epoch": 1.8421052631578947,
"grad_norm": 0.031109903007745743,
"learning_rate": 0.0002,
"loss": 0.5441408157348633,
"mean_token_accuracy": 0.778783529996872,
"num_tokens": 7996889.0,
"step": 490
},
{
"entropy": 0.5475451499223709,
"epoch": 1.8458646616541352,
"grad_norm": 0.030056826770305634,
"learning_rate": 0.0002,
"loss": 0.5430116653442383,
"mean_token_accuracy": 0.7810570746660233,
"num_tokens": 8013259.0,
"step": 491
},
{
"entropy": 0.559479296207428,
"epoch": 1.849624060150376,
"grad_norm": 0.035820432007312775,
"learning_rate": 0.0002,
"loss": 0.5568897128105164,
"mean_token_accuracy": 0.7710603177547455,
"num_tokens": 8029520.0,
"step": 492
},
{
"entropy": 0.5462630242109299,
"epoch": 1.8533834586466167,
"grad_norm": 0.031395427882671356,
"learning_rate": 0.0002,
"loss": 0.5490817427635193,
"mean_token_accuracy": 0.7747374475002289,
"num_tokens": 8045599.0,
"step": 493
},
{
"entropy": 0.5427971929311752,
"epoch": 1.8571428571428572,
"grad_norm": 0.032419510185718536,
"learning_rate": 0.0002,
"loss": 0.547596275806427,
"mean_token_accuracy": 0.7759164273738861,
"num_tokens": 8062030.0,
"step": 494
},
{
"entropy": 0.5488359779119492,
"epoch": 1.8609022556390977,
"grad_norm": 0.03382895514369011,
"learning_rate": 0.0002,
"loss": 0.5546596646308899,
"mean_token_accuracy": 0.7742781788110733,
"num_tokens": 8078279.0,
"step": 495
},
{
"entropy": 0.5563898682594299,
"epoch": 1.8646616541353382,
"grad_norm": 0.030559495091438293,
"learning_rate": 0.0002,
"loss": 0.5596904754638672,
"mean_token_accuracy": 0.7740778177976608,
"num_tokens": 8094627.0,
"step": 496
},
{
"entropy": 0.5448739975690842,
"epoch": 1.868421052631579,
"grad_norm": 0.029570002108812332,
"learning_rate": 0.0002,
"loss": 0.5441548824310303,
"mean_token_accuracy": 0.7791137993335724,
"num_tokens": 8111057.0,
"step": 497
},
{
"entropy": 0.5403100103139877,
"epoch": 1.8721804511278195,
"grad_norm": 0.028860216960310936,
"learning_rate": 0.0002,
"loss": 0.5392476916313171,
"mean_token_accuracy": 0.7823552191257477,
"num_tokens": 8127458.0,
"step": 498
},
{
"entropy": 0.547279953956604,
"epoch": 1.8759398496240602,
"grad_norm": 0.03563547134399414,
"learning_rate": 0.0002,
"loss": 0.5528260469436646,
"mean_token_accuracy": 0.7767119854688644,
"num_tokens": 8143862.0,
"step": 499
},
{
"entropy": 0.5525589138269424,
"epoch": 1.8796992481203008,
"grad_norm": 0.03100893273949623,
"learning_rate": 0.0002,
"loss": 0.5514292120933533,
"mean_token_accuracy": 0.7746975123882294,
"num_tokens": 8160155.0,
"step": 500
},
{
"entropy": 0.5513135939836502,
"epoch": 1.8834586466165413,
"grad_norm": 0.0315982848405838,
"learning_rate": 0.0002,
"loss": 0.5519658923149109,
"mean_token_accuracy": 0.7756119072437286,
"num_tokens": 8176700.0,
"step": 501
},
{
"entropy": 0.5485852658748627,
"epoch": 1.8872180451127818,
"grad_norm": 0.031329069286584854,
"learning_rate": 0.0002,
"loss": 0.5463511347770691,
"mean_token_accuracy": 0.779010608792305,
"num_tokens": 8193245.0,
"step": 502
},
{
"entropy": 0.5625745803117752,
"epoch": 1.8909774436090225,
"grad_norm": 0.029315905645489693,
"learning_rate": 0.0002,
"loss": 0.5607528686523438,
"mean_token_accuracy": 0.7741692066192627,
"num_tokens": 8209893.0,
"step": 503
},
{
"entropy": 0.5387315452098846,
"epoch": 1.8947368421052633,
"grad_norm": 0.03832435607910156,
"learning_rate": 0.0002,
"loss": 0.5399753451347351,
"mean_token_accuracy": 0.781536191701889,
"num_tokens": 8226239.0,
"step": 504
},
{
"entropy": 0.544891282916069,
"epoch": 1.8984962406015038,
"grad_norm": 0.03846210241317749,
"learning_rate": 0.0002,
"loss": 0.5546903610229492,
"mean_token_accuracy": 0.7764989882707596,
"num_tokens": 8242463.0,
"step": 505
},
{
"entropy": 0.5383649319410324,
"epoch": 1.9022556390977443,
"grad_norm": 0.029546573758125305,
"learning_rate": 0.0002,
"loss": 0.5443148016929626,
"mean_token_accuracy": 0.7801246345043182,
"num_tokens": 8258870.0,
"step": 506
},
{
"entropy": 0.5518875420093536,
"epoch": 1.9060150375939848,
"grad_norm": 0.03868366405367851,
"learning_rate": 0.0002,
"loss": 0.56158447265625,
"mean_token_accuracy": 0.7744181603193283,
"num_tokens": 8275059.0,
"step": 507
},
{
"entropy": 0.5304814428091049,
"epoch": 1.9097744360902256,
"grad_norm": 0.030545437708497047,
"learning_rate": 0.0002,
"loss": 0.5301219820976257,
"mean_token_accuracy": 0.7852053344249725,
"num_tokens": 8291105.0,
"step": 508
},
{
"entropy": 0.5690664052963257,
"epoch": 1.9135338345864663,
"grad_norm": 0.032348547130823135,
"learning_rate": 0.0002,
"loss": 0.5622092485427856,
"mean_token_accuracy": 0.769376203417778,
"num_tokens": 8307569.0,
"step": 509
},
{
"entropy": 0.5624774992465973,
"epoch": 1.9172932330827068,
"grad_norm": 0.02640698291361332,
"learning_rate": 0.0002,
"loss": 0.5545241236686707,
"mean_token_accuracy": 0.7744268774986267,
"num_tokens": 8323912.0,
"step": 510
},
{
"entropy": 0.5579835772514343,
"epoch": 1.9210526315789473,
"grad_norm": 0.031412333250045776,
"learning_rate": 0.0002,
"loss": 0.5539452433586121,
"mean_token_accuracy": 0.774582713842392,
"num_tokens": 8340119.0,
"step": 511
},
{
"entropy": 0.542325347661972,
"epoch": 1.9248120300751879,
"grad_norm": 0.030913738533854485,
"learning_rate": 0.0002,
"loss": 0.5458105802536011,
"mean_token_accuracy": 0.7775561809539795,
"num_tokens": 8356315.0,
"step": 512
},
{
"entropy": 0.529489278793335,
"epoch": 1.9285714285714286,
"grad_norm": 0.029877884313464165,
"learning_rate": 0.0002,
"loss": 0.531100332736969,
"mean_token_accuracy": 0.7838429808616638,
"num_tokens": 8372456.0,
"step": 513
},
{
"entropy": 0.5389499813318253,
"epoch": 1.9323308270676691,
"grad_norm": 0.030849065631628036,
"learning_rate": 0.0002,
"loss": 0.5465497374534607,
"mean_token_accuracy": 0.7783443629741669,
"num_tokens": 8388807.0,
"step": 514
},
{
"entropy": 0.5628852099180222,
"epoch": 1.9360902255639099,
"grad_norm": 0.03353369981050491,
"learning_rate": 0.0002,
"loss": 0.5644093751907349,
"mean_token_accuracy": 0.7698302865028381,
"num_tokens": 8405066.0,
"step": 515
},
{
"entropy": 0.5497677177190781,
"epoch": 1.9398496240601504,
"grad_norm": 0.028165243566036224,
"learning_rate": 0.0002,
"loss": 0.547763466835022,
"mean_token_accuracy": 0.7773574143648148,
"num_tokens": 8421460.0,
"step": 516
},
{
"entropy": 0.5606269836425781,
"epoch": 1.943609022556391,
"grad_norm": 0.0319550521671772,
"learning_rate": 0.0002,
"loss": 0.5551348924636841,
"mean_token_accuracy": 0.7739223390817642,
"num_tokens": 8437784.0,
"step": 517
},
{
"entropy": 0.5395714491605759,
"epoch": 1.9473684210526314,
"grad_norm": 0.031290777027606964,
"learning_rate": 0.0002,
"loss": 0.5381031036376953,
"mean_token_accuracy": 0.7825980633497238,
"num_tokens": 8453854.0,
"step": 518
},
{
"entropy": 0.5344501882791519,
"epoch": 1.9511278195488722,
"grad_norm": 0.03777296468615532,
"learning_rate": 0.0002,
"loss": 0.5455595850944519,
"mean_token_accuracy": 0.7795031368732452,
"num_tokens": 8470272.0,
"step": 519
},
{
"entropy": 0.5205538719892502,
"epoch": 1.954887218045113,
"grad_norm": 0.03487836569547653,
"learning_rate": 0.0002,
"loss": 0.5330216288566589,
"mean_token_accuracy": 0.7831091731786728,
"num_tokens": 8486562.0,
"step": 520
},
{
"entropy": 0.5428618490695953,
"epoch": 1.9586466165413534,
"grad_norm": 0.030902346596121788,
"learning_rate": 0.0002,
"loss": 0.5495193004608154,
"mean_token_accuracy": 0.7756944447755814,
"num_tokens": 8502887.0,
"step": 521
},
{
"entropy": 0.544492781162262,
"epoch": 1.962406015037594,
"grad_norm": 0.03169652447104454,
"learning_rate": 0.0002,
"loss": 0.5453743934631348,
"mean_token_accuracy": 0.7783046513795853,
"num_tokens": 8519068.0,
"step": 522
},
{
"entropy": 0.5636335015296936,
"epoch": 1.9661654135338344,
"grad_norm": 0.03021661750972271,
"learning_rate": 0.0002,
"loss": 0.5499917268753052,
"mean_token_accuracy": 0.7781661599874496,
"num_tokens": 8535634.0,
"step": 523
},
{
"entropy": 0.55694779753685,
"epoch": 1.9699248120300752,
"grad_norm": 0.03414059802889824,
"learning_rate": 0.0002,
"loss": 0.5477267503738403,
"mean_token_accuracy": 0.7789023220539093,
"num_tokens": 8552014.0,
"step": 524
},
{
"entropy": 0.5450517237186432,
"epoch": 1.973684210526316,
"grad_norm": 0.03232225775718689,
"learning_rate": 0.0002,
"loss": 0.5392122268676758,
"mean_token_accuracy": 0.777529314160347,
"num_tokens": 8568141.0,
"step": 525
},
{
"entropy": 0.5509356558322906,
"epoch": 1.9774436090225564,
"grad_norm": 0.03768094256520271,
"learning_rate": 0.0002,
"loss": 0.5595051050186157,
"mean_token_accuracy": 0.7724569737911224,
"num_tokens": 8584500.0,
"step": 526
},
{
"entropy": 0.5301109999418259,
"epoch": 1.981203007518797,
"grad_norm": 0.033885687589645386,
"learning_rate": 0.0002,
"loss": 0.5360104441642761,
"mean_token_accuracy": 0.7817398905754089,
"num_tokens": 8600622.0,
"step": 527
},
{
"entropy": 0.5417920649051666,
"epoch": 1.9849624060150375,
"grad_norm": 0.035579532384872437,
"learning_rate": 0.0002,
"loss": 0.5494239926338196,
"mean_token_accuracy": 0.7785082012414932,
"num_tokens": 8616969.0,
"step": 528
},
{
"entropy": 0.5376323908567429,
"epoch": 1.9887218045112782,
"grad_norm": 0.0296316035091877,
"learning_rate": 0.0002,
"loss": 0.5373918414115906,
"mean_token_accuracy": 0.7816532105207443,
"num_tokens": 8633437.0,
"step": 529
},
{
"entropy": 0.5412444472312927,
"epoch": 1.9924812030075187,
"grad_norm": 0.03037526085972786,
"learning_rate": 0.0002,
"loss": 0.539776086807251,
"mean_token_accuracy": 0.7808452993631363,
"num_tokens": 8649560.0,
"step": 530
},
{
"entropy": 0.554906353354454,
"epoch": 1.9962406015037595,
"grad_norm": 0.03048609383404255,
"learning_rate": 0.0002,
"loss": 0.5531030893325806,
"mean_token_accuracy": 0.7767332792282104,
"num_tokens": 8665828.0,
"step": 531
},
{
"entropy": 0.5544924587011337,
"epoch": 2.0,
"grad_norm": 0.03117205761373043,
"learning_rate": 0.0002,
"loss": 0.5525693893432617,
"mean_token_accuracy": 0.775643989443779,
"num_tokens": 8682083.0,
"step": 532
},
{
"entropy": 0.5393226593732834,
"epoch": 2.0037593984962405,
"grad_norm": 0.034238528460264206,
"learning_rate": 0.0002,
"loss": 0.527999222278595,
"mean_token_accuracy": 0.7866329997777939,
"num_tokens": 8698342.0,
"step": 533
},
{
"entropy": 0.5444916188716888,
"epoch": 2.007518796992481,
"grad_norm": 0.03761903941631317,
"learning_rate": 0.0002,
"loss": 0.5434718132019043,
"mean_token_accuracy": 0.7761547416448593,
"num_tokens": 8714741.0,
"step": 534
},
{
"entropy": 0.5060115680098534,
"epoch": 2.011278195488722,
"grad_norm": 0.036343637853860855,
"learning_rate": 0.0002,
"loss": 0.5168589353561401,
"mean_token_accuracy": 0.7898426353931427,
"num_tokens": 8731100.0,
"step": 535
},
{
"entropy": 0.5210407823324203,
"epoch": 2.0150375939849625,
"grad_norm": 0.04487035050988197,
"learning_rate": 0.0002,
"loss": 0.5338425040245056,
"mean_token_accuracy": 0.783848226070404,
"num_tokens": 8747374.0,
"step": 536
},
{
"entropy": 0.5411355942487717,
"epoch": 2.018796992481203,
"grad_norm": 0.030216895043849945,
"learning_rate": 0.0002,
"loss": 0.5343786478042603,
"mean_token_accuracy": 0.785404697060585,
"num_tokens": 8763878.0,
"step": 537
},
{
"entropy": 0.5372739881277084,
"epoch": 2.0225563909774436,
"grad_norm": 0.028337521478533745,
"learning_rate": 0.0002,
"loss": 0.5299405455589294,
"mean_token_accuracy": 0.7845199257135391,
"num_tokens": 8780220.0,
"step": 538
},
{
"entropy": 0.5464906841516495,
"epoch": 2.026315789473684,
"grad_norm": 0.036913856863975525,
"learning_rate": 0.0002,
"loss": 0.5415371656417847,
"mean_token_accuracy": 0.7804137766361237,
"num_tokens": 8796472.0,
"step": 539
},
{
"entropy": 0.5379135385155678,
"epoch": 2.030075187969925,
"grad_norm": 0.03262462466955185,
"learning_rate": 0.0002,
"loss": 0.5289930701255798,
"mean_token_accuracy": 0.7824063748121262,
"num_tokens": 8812711.0,
"step": 540
},
{
"entropy": 0.5565919727087021,
"epoch": 2.0338345864661656,
"grad_norm": 0.04293256625533104,
"learning_rate": 0.0002,
"loss": 0.5547116994857788,
"mean_token_accuracy": 0.7729399651288986,
"num_tokens": 8829053.0,
"step": 541
},
{
"entropy": 0.5241617634892464,
"epoch": 2.037593984962406,
"grad_norm": 0.038099389523267746,
"learning_rate": 0.0002,
"loss": 0.5281400084495544,
"mean_token_accuracy": 0.7854866534471512,
"num_tokens": 8845272.0,
"step": 542
},
{
"entropy": 0.5125209540128708,
"epoch": 2.0413533834586466,
"grad_norm": 0.0444987453520298,
"learning_rate": 0.0002,
"loss": 0.5245556235313416,
"mean_token_accuracy": 0.7865463197231293,
"num_tokens": 8861604.0,
"step": 543
},
{
"entropy": 0.5151898711919785,
"epoch": 2.045112781954887,
"grad_norm": 0.03733397275209427,
"learning_rate": 0.0002,
"loss": 0.5251218676567078,
"mean_token_accuracy": 0.7850091606378555,
"num_tokens": 8878258.0,
"step": 544
},
{
"entropy": 0.5284005552530289,
"epoch": 2.0488721804511276,
"grad_norm": 0.03852412849664688,
"learning_rate": 0.0002,
"loss": 0.5298153758049011,
"mean_token_accuracy": 0.7847720235586166,
"num_tokens": 8894539.0,
"step": 545
},
{
"entropy": 0.54307721555233,
"epoch": 2.0526315789473686,
"grad_norm": 0.033771906048059464,
"learning_rate": 0.0002,
"loss": 0.5370909571647644,
"mean_token_accuracy": 0.7825237512588501,
"num_tokens": 8910872.0,
"step": 546
},
{
"entropy": 0.5492400974035263,
"epoch": 2.056390977443609,
"grad_norm": 0.03574720397591591,
"learning_rate": 0.0002,
"loss": 0.5408341884613037,
"mean_token_accuracy": 0.778035119175911,
"num_tokens": 8927218.0,
"step": 547
},
{
"entropy": 0.5240911245346069,
"epoch": 2.0601503759398496,
"grad_norm": 0.02964242920279503,
"learning_rate": 0.0002,
"loss": 0.5206458568572998,
"mean_token_accuracy": 0.7880397886037827,
"num_tokens": 8943483.0,
"step": 548
},
{
"entropy": 0.5402092635631561,
"epoch": 2.06390977443609,
"grad_norm": 0.030025213956832886,
"learning_rate": 0.0002,
"loss": 0.5365015864372253,
"mean_token_accuracy": 0.7826483398675919,
"num_tokens": 8959806.0,
"step": 549
},
{
"entropy": 0.5332436561584473,
"epoch": 2.0676691729323307,
"grad_norm": 0.04115639254450798,
"learning_rate": 0.0002,
"loss": 0.5445111393928528,
"mean_token_accuracy": 0.7822862267494202,
"num_tokens": 8976089.0,
"step": 550
},
{
"entropy": 0.5036703869700432,
"epoch": 2.0714285714285716,
"grad_norm": 0.04966175556182861,
"learning_rate": 0.0002,
"loss": 0.5189836025238037,
"mean_token_accuracy": 0.7873758524656296,
"num_tokens": 8992377.0,
"step": 551
},
{
"entropy": 0.5350762009620667,
"epoch": 2.075187969924812,
"grad_norm": 0.03549731895327568,
"learning_rate": 0.0002,
"loss": 0.5327733755111694,
"mean_token_accuracy": 0.7879746407270432,
"num_tokens": 9008811.0,
"step": 552
},
{
"entropy": 0.5646320134401321,
"epoch": 2.0789473684210527,
"grad_norm": 0.03737547621130943,
"learning_rate": 0.0002,
"loss": 0.5554011464118958,
"mean_token_accuracy": 0.7747785001993179,
"num_tokens": 9025308.0,
"step": 553
},
{
"entropy": 0.5232708752155304,
"epoch": 2.082706766917293,
"grad_norm": 0.0358981154859066,
"learning_rate": 0.0002,
"loss": 0.5174283385276794,
"mean_token_accuracy": 0.790026530623436,
"num_tokens": 9041525.0,
"step": 554
},
{
"entropy": 0.5285665988922119,
"epoch": 2.0864661654135337,
"grad_norm": 0.03469764441251755,
"learning_rate": 0.0002,
"loss": 0.5286591649055481,
"mean_token_accuracy": 0.7858238369226456,
"num_tokens": 9058016.0,
"step": 555
},
{
"entropy": 0.5281644910573959,
"epoch": 2.090225563909774,
"grad_norm": 0.0453813299536705,
"learning_rate": 0.0002,
"loss": 0.5388556718826294,
"mean_token_accuracy": 0.7807898968458176,
"num_tokens": 9074200.0,
"step": 556
},
{
"entropy": 0.5271690487861633,
"epoch": 2.093984962406015,
"grad_norm": 0.032550517469644547,
"learning_rate": 0.0002,
"loss": 0.5312079787254333,
"mean_token_accuracy": 0.7843631505966187,
"num_tokens": 9090441.0,
"step": 557
},
{
"entropy": 0.5335165411233902,
"epoch": 2.0977443609022557,
"grad_norm": 0.045913904905319214,
"learning_rate": 0.0002,
"loss": 0.5417532324790955,
"mean_token_accuracy": 0.7792288213968277,
"num_tokens": 9106701.0,
"step": 558
},
{
"entropy": 0.5311940237879753,
"epoch": 2.101503759398496,
"grad_norm": 0.03551177680492401,
"learning_rate": 0.0002,
"loss": 0.5270295143127441,
"mean_token_accuracy": 0.7884976118803024,
"num_tokens": 9122828.0,
"step": 559
},
{
"entropy": 0.5543871223926544,
"epoch": 2.1052631578947367,
"grad_norm": 0.04049575328826904,
"learning_rate": 0.0002,
"loss": 0.5416486859321594,
"mean_token_accuracy": 0.7811383605003357,
"num_tokens": 9139283.0,
"step": 560
},
{
"entropy": 0.5340919494628906,
"epoch": 2.1090225563909772,
"grad_norm": 0.039224181324243546,
"learning_rate": 0.0002,
"loss": 0.5327409505844116,
"mean_token_accuracy": 0.7838027775287628,
"num_tokens": 9155474.0,
"step": 561
},
{
"entropy": 0.5298718512058258,
"epoch": 2.112781954887218,
"grad_norm": 0.05099140852689743,
"learning_rate": 0.0002,
"loss": 0.5340836644172668,
"mean_token_accuracy": 0.783194437623024,
"num_tokens": 9171817.0,
"step": 562
},
{
"entropy": 0.5186150521039963,
"epoch": 2.1165413533834587,
"grad_norm": 0.03965724632143974,
"learning_rate": 0.0002,
"loss": 0.5235821604728699,
"mean_token_accuracy": 0.7888422161340714,
"num_tokens": 9188257.0,
"step": 563
},
{
"entropy": 0.5331820994615555,
"epoch": 2.1203007518796992,
"grad_norm": 0.04237478971481323,
"learning_rate": 0.0002,
"loss": 0.5393993258476257,
"mean_token_accuracy": 0.7827252298593521,
"num_tokens": 9204541.0,
"step": 564
},
{
"entropy": 0.540572926402092,
"epoch": 2.1240601503759398,
"grad_norm": 0.04164816811680794,
"learning_rate": 0.0002,
"loss": 0.5408675670623779,
"mean_token_accuracy": 0.7807533591985703,
"num_tokens": 9220820.0,
"step": 565
},
{
"entropy": 0.5385376363992691,
"epoch": 2.1278195488721803,
"grad_norm": 0.036260150372982025,
"learning_rate": 0.0002,
"loss": 0.5364916324615479,
"mean_token_accuracy": 0.7820783704519272,
"num_tokens": 9237023.0,
"step": 566
},
{
"entropy": 0.5336015373468399,
"epoch": 2.1315789473684212,
"grad_norm": 0.037857089191675186,
"learning_rate": 0.0002,
"loss": 0.5315621495246887,
"mean_token_accuracy": 0.785429060459137,
"num_tokens": 9253551.0,
"step": 567
},
{
"entropy": 0.5323529243469238,
"epoch": 2.1353383458646618,
"grad_norm": 0.037011366337537766,
"learning_rate": 0.0002,
"loss": 0.5320927500724792,
"mean_token_accuracy": 0.7860363125801086,
"num_tokens": 9270061.0,
"step": 568
},
{
"entropy": 0.5342943072319031,
"epoch": 2.1390977443609023,
"grad_norm": 0.04501970484852791,
"learning_rate": 0.0002,
"loss": 0.541400134563446,
"mean_token_accuracy": 0.7824247628450394,
"num_tokens": 9286644.0,
"step": 569
},
{
"entropy": 0.5125101208686829,
"epoch": 2.142857142857143,
"grad_norm": 0.03982450067996979,
"learning_rate": 0.0002,
"loss": 0.5186954736709595,
"mean_token_accuracy": 0.7895647883415222,
"num_tokens": 9302779.0,
"step": 570
},
{
"entropy": 0.5302434861660004,
"epoch": 2.1466165413533833,
"grad_norm": 0.04483801871538162,
"learning_rate": 0.0002,
"loss": 0.5331039428710938,
"mean_token_accuracy": 0.7822313755750656,
"num_tokens": 9318908.0,
"step": 571
},
{
"entropy": 0.541576087474823,
"epoch": 2.1503759398496243,
"grad_norm": 0.04227382317185402,
"learning_rate": 0.0002,
"loss": 0.5322229862213135,
"mean_token_accuracy": 0.7839206904172897,
"num_tokens": 9335280.0,
"step": 572
},
{
"entropy": 0.5349045842885971,
"epoch": 2.154135338345865,
"grad_norm": 0.039713822305202484,
"learning_rate": 0.0002,
"loss": 0.5306118726730347,
"mean_token_accuracy": 0.7863682806491852,
"num_tokens": 9351717.0,
"step": 573
},
{
"entropy": 0.538109079003334,
"epoch": 2.1578947368421053,
"grad_norm": 0.043392788618803024,
"learning_rate": 0.0002,
"loss": 0.5441777110099792,
"mean_token_accuracy": 0.7800941169261932,
"num_tokens": 9367925.0,
"step": 574
},
{
"entropy": 0.543743833899498,
"epoch": 2.161654135338346,
"grad_norm": 0.036299366503953934,
"learning_rate": 0.0002,
"loss": 0.5443440675735474,
"mean_token_accuracy": 0.7788920700550079,
"num_tokens": 9384356.0,
"step": 575
},
{
"entropy": 0.5299166440963745,
"epoch": 2.1654135338345863,
"grad_norm": 0.04222200810909271,
"learning_rate": 0.0002,
"loss": 0.5267676711082458,
"mean_token_accuracy": 0.7834489941596985,
"num_tokens": 9400653.0,
"step": 576
},
{
"entropy": 0.5201265513896942,
"epoch": 2.169172932330827,
"grad_norm": 0.034343086183071136,
"learning_rate": 0.0002,
"loss": 0.5234291553497314,
"mean_token_accuracy": 0.7866221219301224,
"num_tokens": 9416889.0,
"step": 577
},
{
"entropy": 0.5227823704481125,
"epoch": 2.172932330827068,
"grad_norm": 0.05559639260172844,
"learning_rate": 0.0002,
"loss": 0.5304789543151855,
"mean_token_accuracy": 0.7860793620347977,
"num_tokens": 9433083.0,
"step": 578
},
{
"entropy": 0.5409391671419144,
"epoch": 2.1766917293233083,
"grad_norm": 0.03534764051437378,
"learning_rate": 0.0002,
"loss": 0.5437344908714294,
"mean_token_accuracy": 0.7797643393278122,
"num_tokens": 9449666.0,
"step": 579
},
{
"entropy": 0.5353062897920609,
"epoch": 2.180451127819549,
"grad_norm": 0.0366806834936142,
"learning_rate": 0.0002,
"loss": 0.5361766815185547,
"mean_token_accuracy": 0.7838302254676819,
"num_tokens": 9465971.0,
"step": 580
},
{
"entropy": 0.5455628782510757,
"epoch": 2.1842105263157894,
"grad_norm": 0.04078822582960129,
"learning_rate": 0.0002,
"loss": 0.5446187257766724,
"mean_token_accuracy": 0.7786186188459396,
"num_tokens": 9482331.0,
"step": 581
},
{
"entropy": 0.5441193133592606,
"epoch": 2.18796992481203,
"grad_norm": 0.03562629595398903,
"learning_rate": 0.0002,
"loss": 0.538811981678009,
"mean_token_accuracy": 0.7832597941160202,
"num_tokens": 9498498.0,
"step": 582
},
{
"entropy": 0.519161731004715,
"epoch": 2.191729323308271,
"grad_norm": 0.04350278899073601,
"learning_rate": 0.0002,
"loss": 0.5223026275634766,
"mean_token_accuracy": 0.7909857630729675,
"num_tokens": 9514937.0,
"step": 583
},
{
"entropy": 0.5520303696393967,
"epoch": 2.1954887218045114,
"grad_norm": 0.04176495969295502,
"learning_rate": 0.0002,
"loss": 0.5509821772575378,
"mean_token_accuracy": 0.7763593196868896,
"num_tokens": 9531256.0,
"step": 584
},
{
"entropy": 0.5262609422206879,
"epoch": 2.199248120300752,
"grad_norm": 0.07633325457572937,
"learning_rate": 0.0002,
"loss": 0.5259430408477783,
"mean_token_accuracy": 0.7863292992115021,
"num_tokens": 9547509.0,
"step": 585
},
{
"entropy": 0.53122878074646,
"epoch": 2.2030075187969924,
"grad_norm": 0.04210652410984039,
"learning_rate": 0.0002,
"loss": 0.531125545501709,
"mean_token_accuracy": 0.7854439616203308,
"num_tokens": 9563675.0,
"step": 586
},
{
"entropy": 0.5309283137321472,
"epoch": 2.206766917293233,
"grad_norm": 0.042596347630023956,
"learning_rate": 0.0002,
"loss": 0.5361312627792358,
"mean_token_accuracy": 0.7840573638677597,
"num_tokens": 9580247.0,
"step": 587
},
{
"entropy": 0.523199625313282,
"epoch": 2.2105263157894735,
"grad_norm": 0.06264178454875946,
"learning_rate": 0.0002,
"loss": 0.5371831655502319,
"mean_token_accuracy": 0.7846156805753708,
"num_tokens": 9596084.0,
"step": 588
},
{
"entropy": 0.5497414767742157,
"epoch": 2.2142857142857144,
"grad_norm": 0.049970485270023346,
"learning_rate": 0.0002,
"loss": 0.5482587218284607,
"mean_token_accuracy": 0.7772606760263443,
"num_tokens": 9612439.0,
"step": 589
},
{
"entropy": 0.5475651770830154,
"epoch": 2.218045112781955,
"grad_norm": 0.047052860260009766,
"learning_rate": 0.0002,
"loss": 0.5382542610168457,
"mean_token_accuracy": 0.7837767452001572,
"num_tokens": 9628574.0,
"step": 590
},
{
"entropy": 0.5442479848861694,
"epoch": 2.2218045112781954,
"grad_norm": 0.03252498432993889,
"learning_rate": 0.0002,
"loss": 0.5315850973129272,
"mean_token_accuracy": 0.7825820297002792,
"num_tokens": 9644837.0,
"step": 591
},
{
"entropy": 0.5471898764371872,
"epoch": 2.225563909774436,
"grad_norm": 0.048182275146245956,
"learning_rate": 0.0002,
"loss": 0.5472801923751831,
"mean_token_accuracy": 0.776175931096077,
"num_tokens": 9661070.0,
"step": 592
},
{
"entropy": 0.5267005264759064,
"epoch": 2.2293233082706765,
"grad_norm": 0.04179242253303528,
"learning_rate": 0.0002,
"loss": 0.5309768319129944,
"mean_token_accuracy": 0.7826364785432816,
"num_tokens": 9677378.0,
"step": 593
},
{
"entropy": 0.5416758507490158,
"epoch": 2.2330827067669174,
"grad_norm": 0.04981589689850807,
"learning_rate": 0.0002,
"loss": 0.549900472164154,
"mean_token_accuracy": 0.7765727639198303,
"num_tokens": 9693819.0,
"step": 594
},
{
"entropy": 0.5369458198547363,
"epoch": 2.236842105263158,
"grad_norm": 0.051439523696899414,
"learning_rate": 0.0002,
"loss": 0.5440854430198669,
"mean_token_accuracy": 0.7789760231971741,
"num_tokens": 9710189.0,
"step": 595
},
{
"entropy": 0.5342868715524673,
"epoch": 2.2406015037593985,
"grad_norm": 0.04235680773854256,
"learning_rate": 0.0002,
"loss": 0.5430835485458374,
"mean_token_accuracy": 0.7785050868988037,
"num_tokens": 9726526.0,
"step": 596
},
{
"entropy": 0.5481905192136765,
"epoch": 2.244360902255639,
"grad_norm": 0.044252388179302216,
"learning_rate": 0.0002,
"loss": 0.5456714034080505,
"mean_token_accuracy": 0.7800015658140182,
"num_tokens": 9742892.0,
"step": 597
},
{
"entropy": 0.5490403324365616,
"epoch": 2.2481203007518795,
"grad_norm": 0.036522816866636276,
"learning_rate": 0.0002,
"loss": 0.5348387956619263,
"mean_token_accuracy": 0.7838009893894196,
"num_tokens": 9759316.0,
"step": 598
},
{
"entropy": 0.5373188108205795,
"epoch": 2.2518796992481205,
"grad_norm": 0.0484786219894886,
"learning_rate": 0.0002,
"loss": 0.5393818616867065,
"mean_token_accuracy": 0.7799521684646606,
"num_tokens": 9775422.0,
"step": 599
},
{
"entropy": 0.5350137799978256,
"epoch": 2.255639097744361,
"grad_norm": 0.03971916437149048,
"learning_rate": 0.0002,
"loss": 0.5390014052391052,
"mean_token_accuracy": 0.7825258076190948,
"num_tokens": 9791645.0,
"step": 600
},
{
"entropy": 0.529654249548912,
"epoch": 2.2593984962406015,
"grad_norm": 0.03677717223763466,
"learning_rate": 0.0002,
"loss": 0.5347926020622253,
"mean_token_accuracy": 0.7820286452770233,
"num_tokens": 9807863.0,
"step": 601
},
{
"entropy": 0.5160931199789047,
"epoch": 2.263157894736842,
"grad_norm": 0.04103193059563637,
"learning_rate": 0.0002,
"loss": 0.5219160914421082,
"mean_token_accuracy": 0.7898968160152435,
"num_tokens": 9823834.0,
"step": 602
},
{
"entropy": 0.547026053071022,
"epoch": 2.2669172932330826,
"grad_norm": 0.035431135445833206,
"learning_rate": 0.0002,
"loss": 0.5403215289115906,
"mean_token_accuracy": 0.7804599404335022,
"num_tokens": 9840527.0,
"step": 603
},
{
"entropy": 0.5330915451049805,
"epoch": 2.2706766917293235,
"grad_norm": 0.03688134625554085,
"learning_rate": 0.0002,
"loss": 0.5308654308319092,
"mean_token_accuracy": 0.7851675152778625,
"num_tokens": 9856677.0,
"step": 604
},
{
"entropy": 0.5384332090616226,
"epoch": 2.274436090225564,
"grad_norm": 0.04168199747800827,
"learning_rate": 0.0002,
"loss": 0.5318323373794556,
"mean_token_accuracy": 0.7833025008440018,
"num_tokens": 9872958.0,
"step": 605
},
{
"entropy": 0.5483455657958984,
"epoch": 2.2781954887218046,
"grad_norm": 0.0458533950150013,
"learning_rate": 0.0002,
"loss": 0.5497722625732422,
"mean_token_accuracy": 0.7783730030059814,
"num_tokens": 9889301.0,
"step": 606
},
{
"entropy": 0.5242274850606918,
"epoch": 2.281954887218045,
"grad_norm": 0.03992198407649994,
"learning_rate": 0.0002,
"loss": 0.5323127508163452,
"mean_token_accuracy": 0.7856701463460922,
"num_tokens": 9905738.0,
"step": 607
},
{
"entropy": 0.5306910574436188,
"epoch": 2.2857142857142856,
"grad_norm": 0.03714906424283981,
"learning_rate": 0.0002,
"loss": 0.5334057807922363,
"mean_token_accuracy": 0.7845153957605362,
"num_tokens": 9922153.0,
"step": 608
},
{
"entropy": 0.5255761742591858,
"epoch": 2.2894736842105265,
"grad_norm": 0.037783432751894,
"learning_rate": 0.0002,
"loss": 0.5267370343208313,
"mean_token_accuracy": 0.7860815078020096,
"num_tokens": 9938520.0,
"step": 609
},
{
"entropy": 0.528737261891365,
"epoch": 2.293233082706767,
"grad_norm": 0.03467050567269325,
"learning_rate": 0.0002,
"loss": 0.5269864797592163,
"mean_token_accuracy": 0.789274126291275,
"num_tokens": 9954806.0,
"step": 610
},
{
"entropy": 0.5392419397830963,
"epoch": 2.2969924812030076,
"grad_norm": 0.03630411997437477,
"learning_rate": 0.0002,
"loss": 0.5344975590705872,
"mean_token_accuracy": 0.7834292352199554,
"num_tokens": 9971123.0,
"step": 611
},
{
"entropy": 0.5148891359567642,
"epoch": 2.300751879699248,
"grad_norm": 0.03637854382395744,
"learning_rate": 0.0002,
"loss": 0.5145090222358704,
"mean_token_accuracy": 0.7894360274076462,
"num_tokens": 9987229.0,
"step": 612
},
{
"entropy": 0.538021132349968,
"epoch": 2.3045112781954886,
"grad_norm": 0.03751857578754425,
"learning_rate": 0.0002,
"loss": 0.541398286819458,
"mean_token_accuracy": 0.7807863056659698,
"num_tokens": 10003519.0,
"step": 613
},
{
"entropy": 0.5272123515605927,
"epoch": 2.308270676691729,
"grad_norm": 0.04051438719034195,
"learning_rate": 0.0002,
"loss": 0.5344090461730957,
"mean_token_accuracy": 0.7857641130685806,
"num_tokens": 10019993.0,
"step": 614
},
{
"entropy": 0.5179824233055115,
"epoch": 2.31203007518797,
"grad_norm": 0.04479973390698433,
"learning_rate": 0.0002,
"loss": 0.5279502272605896,
"mean_token_accuracy": 0.7859090268611908,
"num_tokens": 10036196.0,
"step": 615
},
{
"entropy": 0.5467290729284286,
"epoch": 2.3157894736842106,
"grad_norm": 0.03927797079086304,
"learning_rate": 0.0002,
"loss": 0.5486882328987122,
"mean_token_accuracy": 0.7768010795116425,
"num_tokens": 10052474.0,
"step": 616
},
{
"entropy": 0.5408567190170288,
"epoch": 2.319548872180451,
"grad_norm": 0.03986404091119766,
"learning_rate": 0.0002,
"loss": 0.5317103862762451,
"mean_token_accuracy": 0.7851662039756775,
"num_tokens": 10068775.0,
"step": 617
},
{
"entropy": 0.5392286479473114,
"epoch": 2.3233082706766917,
"grad_norm": 0.03838985413312912,
"learning_rate": 0.0002,
"loss": 0.530458927154541,
"mean_token_accuracy": 0.7848429083824158,
"num_tokens": 10084946.0,
"step": 618
},
{
"entropy": 0.5223991498351097,
"epoch": 2.327067669172932,
"grad_norm": 0.03357016295194626,
"learning_rate": 0.0002,
"loss": 0.5164550542831421,
"mean_token_accuracy": 0.7903633117675781,
"num_tokens": 10101221.0,
"step": 619
},
{
"entropy": 0.5287820845842361,
"epoch": 2.3308270676691727,
"grad_norm": 0.041184201836586,
"learning_rate": 0.0002,
"loss": 0.5312986373901367,
"mean_token_accuracy": 0.7844579666852951,
"num_tokens": 10117440.0,
"step": 620
},
{
"entropy": 0.5136409252882004,
"epoch": 2.3345864661654137,
"grad_norm": 0.044375885277986526,
"learning_rate": 0.0002,
"loss": 0.5256669521331787,
"mean_token_accuracy": 0.7870495319366455,
"num_tokens": 10133537.0,
"step": 621
},
{
"entropy": 0.5296864807605743,
"epoch": 2.338345864661654,
"grad_norm": 0.043142594397068024,
"learning_rate": 0.0002,
"loss": 0.5372653007507324,
"mean_token_accuracy": 0.7797198593616486,
"num_tokens": 10149832.0,
"step": 622
},
{
"entropy": 0.5296363830566406,
"epoch": 2.3421052631578947,
"grad_norm": 0.04168247431516647,
"learning_rate": 0.0002,
"loss": 0.5342837572097778,
"mean_token_accuracy": 0.7827459424734116,
"num_tokens": 10166206.0,
"step": 623
},
{
"entropy": 0.5279521271586418,
"epoch": 2.345864661654135,
"grad_norm": 0.03668156638741493,
"learning_rate": 0.0002,
"loss": 0.5243417024612427,
"mean_token_accuracy": 0.7867815494537354,
"num_tokens": 10182574.0,
"step": 624
},
{
"entropy": 0.5396132320165634,
"epoch": 2.3496240601503757,
"grad_norm": 0.040590520948171616,
"learning_rate": 0.0002,
"loss": 0.534129798412323,
"mean_token_accuracy": 0.7840494364500046,
"num_tokens": 10198963.0,
"step": 625
},
{
"entropy": 0.5384691059589386,
"epoch": 2.3533834586466167,
"grad_norm": 0.03799832612276077,
"learning_rate": 0.0002,
"loss": 0.5275224447250366,
"mean_token_accuracy": 0.788055807352066,
"num_tokens": 10215363.0,
"step": 626
},
{
"entropy": 0.5355971157550812,
"epoch": 2.357142857142857,
"grad_norm": 0.03812744468450546,
"learning_rate": 0.0002,
"loss": 0.5373313426971436,
"mean_token_accuracy": 0.7830821126699448,
"num_tokens": 10231721.0,
"step": 627
},
{
"entropy": 0.5379942953586578,
"epoch": 2.3609022556390977,
"grad_norm": 0.04219618812203407,
"learning_rate": 0.0002,
"loss": 0.5430394411087036,
"mean_token_accuracy": 0.779607817530632,
"num_tokens": 10248150.0,
"step": 628
},
{
"entropy": 0.5369090437889099,
"epoch": 2.3646616541353382,
"grad_norm": 0.04251544550061226,
"learning_rate": 0.0002,
"loss": 0.5445953011512756,
"mean_token_accuracy": 0.778522789478302,
"num_tokens": 10264414.0,
"step": 629
},
{
"entropy": 0.5455975085496902,
"epoch": 2.3684210526315788,
"grad_norm": 0.04128441959619522,
"learning_rate": 0.0002,
"loss": 0.5464663505554199,
"mean_token_accuracy": 0.7782220393419266,
"num_tokens": 10280655.0,
"step": 630
},
{
"entropy": 0.5499599725008011,
"epoch": 2.3721804511278197,
"grad_norm": 0.0386635959148407,
"learning_rate": 0.0002,
"loss": 0.542563259601593,
"mean_token_accuracy": 0.7798319011926651,
"num_tokens": 10297357.0,
"step": 631
},
{
"entropy": 0.5534010380506516,
"epoch": 2.3759398496240602,
"grad_norm": 0.040974393486976624,
"learning_rate": 0.0002,
"loss": 0.5562258362770081,
"mean_token_accuracy": 0.7761926651000977,
"num_tokens": 10313788.0,
"step": 632
},
{
"entropy": 0.5357997566461563,
"epoch": 2.3796992481203008,
"grad_norm": 0.03751135990023613,
"learning_rate": 0.0002,
"loss": 0.5311724543571472,
"mean_token_accuracy": 0.7860594242811203,
"num_tokens": 10330164.0,
"step": 633
},
{
"entropy": 0.5399480760097504,
"epoch": 2.3834586466165413,
"grad_norm": 0.0392535962164402,
"learning_rate": 0.0002,
"loss": 0.5405341982841492,
"mean_token_accuracy": 0.782960519194603,
"num_tokens": 10346587.0,
"step": 634
},
{
"entropy": 0.5351511463522911,
"epoch": 2.387218045112782,
"grad_norm": 0.04137985408306122,
"learning_rate": 0.0002,
"loss": 0.5435580611228943,
"mean_token_accuracy": 0.7791251242160797,
"num_tokens": 10362964.0,
"step": 635
},
{
"entropy": 0.5337197929620743,
"epoch": 2.3909774436090228,
"grad_norm": 0.04529615119099617,
"learning_rate": 0.0002,
"loss": 0.54475998878479,
"mean_token_accuracy": 0.7794527411460876,
"num_tokens": 10379194.0,
"step": 636
},
{
"entropy": 0.5295632779598236,
"epoch": 2.3947368421052633,
"grad_norm": 0.03818366676568985,
"learning_rate": 0.0002,
"loss": 0.53121417760849,
"mean_token_accuracy": 0.7843088060617447,
"num_tokens": 10395289.0,
"step": 637
},
{
"entropy": 0.5338181853294373,
"epoch": 2.398496240601504,
"grad_norm": 0.04155934602022171,
"learning_rate": 0.0002,
"loss": 0.5273146033287048,
"mean_token_accuracy": 0.7871305495500565,
"num_tokens": 10411478.0,
"step": 638
},
{
"entropy": 0.5275490283966064,
"epoch": 2.4022556390977443,
"grad_norm": 0.03884044289588928,
"learning_rate": 0.0002,
"loss": 0.5259033441543579,
"mean_token_accuracy": 0.7865510582923889,
"num_tokens": 10428000.0,
"step": 639
},
{
"entropy": 0.5296481549739838,
"epoch": 2.406015037593985,
"grad_norm": 0.03892350569367409,
"learning_rate": 0.0002,
"loss": 0.5338611602783203,
"mean_token_accuracy": 0.7841958701610565,
"num_tokens": 10444531.0,
"step": 640
},
{
"entropy": 0.5326656997203827,
"epoch": 2.409774436090226,
"grad_norm": 0.04130466282367706,
"learning_rate": 0.0002,
"loss": 0.5334239602088928,
"mean_token_accuracy": 0.7844693660736084,
"num_tokens": 10460884.0,
"step": 641
},
{
"entropy": 0.5167141184210777,
"epoch": 2.4135338345864663,
"grad_norm": 0.04298912361264229,
"learning_rate": 0.0002,
"loss": 0.5224160552024841,
"mean_token_accuracy": 0.790846198797226,
"num_tokens": 10476946.0,
"step": 642
},
{
"entropy": 0.5394491106271744,
"epoch": 2.417293233082707,
"grad_norm": 0.0389692522585392,
"learning_rate": 0.0002,
"loss": 0.5456172823905945,
"mean_token_accuracy": 0.7784712016582489,
"num_tokens": 10493157.0,
"step": 643
},
{
"entropy": 0.5317131578922272,
"epoch": 2.4210526315789473,
"grad_norm": 0.03282848745584488,
"learning_rate": 0.0002,
"loss": 0.5272088050842285,
"mean_token_accuracy": 0.7835191786289215,
"num_tokens": 10509339.0,
"step": 644
},
{
"entropy": 0.5249821543693542,
"epoch": 2.424812030075188,
"grad_norm": 0.03486508131027222,
"learning_rate": 0.0002,
"loss": 0.5219942927360535,
"mean_token_accuracy": 0.787269338965416,
"num_tokens": 10525556.0,
"step": 645
},
{
"entropy": 0.5392860472202301,
"epoch": 2.4285714285714284,
"grad_norm": 0.03448896110057831,
"learning_rate": 0.0002,
"loss": 0.5338496565818787,
"mean_token_accuracy": 0.7829862833023071,
"num_tokens": 10541761.0,
"step": 646
},
{
"entropy": 0.5386904329061508,
"epoch": 2.4323308270676693,
"grad_norm": 0.037768758833408356,
"learning_rate": 0.0002,
"loss": 0.5425961017608643,
"mean_token_accuracy": 0.7781831622123718,
"num_tokens": 10558311.0,
"step": 647
},
{
"entropy": 0.5251231044530869,
"epoch": 2.43609022556391,
"grad_norm": 0.03807547688484192,
"learning_rate": 0.0002,
"loss": 0.5291208624839783,
"mean_token_accuracy": 0.783474326133728,
"num_tokens": 10574696.0,
"step": 648
},
{
"entropy": 0.5356583297252655,
"epoch": 2.4398496240601504,
"grad_norm": 0.03421357646584511,
"learning_rate": 0.0002,
"loss": 0.5309426188468933,
"mean_token_accuracy": 0.7826003879308701,
"num_tokens": 10591225.0,
"step": 649
},
{
"entropy": 0.5321584492921829,
"epoch": 2.443609022556391,
"grad_norm": 0.04219021648168564,
"learning_rate": 0.0002,
"loss": 0.5343624353408813,
"mean_token_accuracy": 0.7819913923740387,
"num_tokens": 10607648.0,
"step": 650
},
{
"entropy": 0.5409150719642639,
"epoch": 2.4473684210526314,
"grad_norm": 0.039848409593105316,
"learning_rate": 0.0002,
"loss": 0.5406517386436462,
"mean_token_accuracy": 0.7809206694364548,
"num_tokens": 10623965.0,
"step": 651
},
{
"entropy": 0.5184071511030197,
"epoch": 2.451127819548872,
"grad_norm": 0.04401297867298126,
"learning_rate": 0.0002,
"loss": 0.5264937877655029,
"mean_token_accuracy": 0.7875054776668549,
"num_tokens": 10640111.0,
"step": 652
},
{
"entropy": 0.5153327658772469,
"epoch": 2.454887218045113,
"grad_norm": 0.037109002470970154,
"learning_rate": 0.0002,
"loss": 0.5220255255699158,
"mean_token_accuracy": 0.7878341674804688,
"num_tokens": 10656391.0,
"step": 653
},
{
"entropy": 0.534611888229847,
"epoch": 2.4586466165413534,
"grad_norm": 0.047087740153074265,
"learning_rate": 0.0002,
"loss": 0.5327281951904297,
"mean_token_accuracy": 0.7858874797821045,
"num_tokens": 10672550.0,
"step": 654
},
{
"entropy": 0.5468750447034836,
"epoch": 2.462406015037594,
"grad_norm": 0.03793250396847725,
"learning_rate": 0.0002,
"loss": 0.5467609167098999,
"mean_token_accuracy": 0.7752472460269928,
"num_tokens": 10688678.0,
"step": 655
},
{
"entropy": 0.5618661195039749,
"epoch": 2.4661654135338344,
"grad_norm": 0.043232064694166183,
"learning_rate": 0.0002,
"loss": 0.557094395160675,
"mean_token_accuracy": 0.7767215073108673,
"num_tokens": 10705231.0,
"step": 656
},
{
"entropy": 0.5481238514184952,
"epoch": 2.469924812030075,
"grad_norm": 0.04276246577501297,
"learning_rate": 0.0002,
"loss": 0.5488662719726562,
"mean_token_accuracy": 0.780038595199585,
"num_tokens": 10721712.0,
"step": 657
},
{
"entropy": 0.5505738407373428,
"epoch": 2.473684210526316,
"grad_norm": 0.040987517684698105,
"learning_rate": 0.0002,
"loss": 0.5510429739952087,
"mean_token_accuracy": 0.7774406224489212,
"num_tokens": 10737970.0,
"step": 658
},
{
"entropy": 0.5473013371229172,
"epoch": 2.4774436090225564,
"grad_norm": 0.051042236387729645,
"learning_rate": 0.0002,
"loss": 0.5507328510284424,
"mean_token_accuracy": 0.7794748395681381,
"num_tokens": 10754101.0,
"step": 659
},
{
"entropy": 0.5286405235528946,
"epoch": 2.481203007518797,
"grad_norm": 0.04263005033135414,
"learning_rate": 0.0002,
"loss": 0.5302000045776367,
"mean_token_accuracy": 0.7844719737768173,
"num_tokens": 10770357.0,
"step": 660
},
{
"entropy": 0.5383267849683762,
"epoch": 2.4849624060150375,
"grad_norm": 0.03854911029338837,
"learning_rate": 0.0002,
"loss": 0.54207444190979,
"mean_token_accuracy": 0.7791945487260818,
"num_tokens": 10786804.0,
"step": 661
},
{
"entropy": 0.5230704694986343,
"epoch": 2.488721804511278,
"grad_norm": 0.04200039431452751,
"learning_rate": 0.0002,
"loss": 0.5254136919975281,
"mean_token_accuracy": 0.7850333154201508,
"num_tokens": 10802992.0,
"step": 662
},
{
"entropy": 0.5294183790683746,
"epoch": 2.492481203007519,
"grad_norm": 0.04227717965841293,
"learning_rate": 0.0002,
"loss": 0.5372048616409302,
"mean_token_accuracy": 0.7844373136758804,
"num_tokens": 10819187.0,
"step": 663
},
{
"entropy": 0.5186149403452873,
"epoch": 2.4962406015037595,
"grad_norm": 0.03944484889507294,
"learning_rate": 0.0002,
"loss": 0.5234470367431641,
"mean_token_accuracy": 0.7857441008090973,
"num_tokens": 10835170.0,
"step": 664
},
{
"entropy": 0.5416997969150543,
"epoch": 2.5,
"grad_norm": 0.043196793645620346,
"learning_rate": 0.0002,
"loss": 0.5474759936332703,
"mean_token_accuracy": 0.7749510407447815,
"num_tokens": 10851563.0,
"step": 665
},
{
"entropy": 0.5275483727455139,
"epoch": 2.5037593984962405,
"grad_norm": 0.03911745548248291,
"learning_rate": 0.0002,
"loss": 0.5205013155937195,
"mean_token_accuracy": 0.7898803949356079,
"num_tokens": 10867571.0,
"step": 666
},
{
"entropy": 0.5302275121212006,
"epoch": 2.507518796992481,
"grad_norm": 0.03766452148556709,
"learning_rate": 0.0002,
"loss": 0.5310875773429871,
"mean_token_accuracy": 0.7819045037031174,
"num_tokens": 10883849.0,
"step": 667
},
{
"entropy": 0.5416832715272903,
"epoch": 2.511278195488722,
"grad_norm": 0.03993174061179161,
"learning_rate": 0.0002,
"loss": 0.5426294207572937,
"mean_token_accuracy": 0.7777436971664429,
"num_tokens": 10900103.0,
"step": 668
},
{
"entropy": 0.5554288029670715,
"epoch": 2.5150375939849625,
"grad_norm": 0.046043481677770615,
"learning_rate": 0.0002,
"loss": 0.5500344634056091,
"mean_token_accuracy": 0.7746063023805618,
"num_tokens": 10916472.0,
"step": 669
},
{
"entropy": 0.5500206649303436,
"epoch": 2.518796992481203,
"grad_norm": 0.04341411218047142,
"learning_rate": 0.0002,
"loss": 0.5484751462936401,
"mean_token_accuracy": 0.7778518944978714,
"num_tokens": 10932960.0,
"step": 670
},
{
"entropy": 0.5585402101278305,
"epoch": 2.5225563909774436,
"grad_norm": 0.04927565157413483,
"learning_rate": 0.0002,
"loss": 0.5563656091690063,
"mean_token_accuracy": 0.7734353542327881,
"num_tokens": 10949340.0,
"step": 671
},
{
"entropy": 0.5314253345131874,
"epoch": 2.526315789473684,
"grad_norm": 0.04110320657491684,
"learning_rate": 0.0002,
"loss": 0.5281319618225098,
"mean_token_accuracy": 0.7881615608930588,
"num_tokens": 10965640.0,
"step": 672
},
{
"entropy": 0.519628070294857,
"epoch": 2.530075187969925,
"grad_norm": 0.03798144683241844,
"learning_rate": 0.0002,
"loss": 0.5186299085617065,
"mean_token_accuracy": 0.7885057926177979,
"num_tokens": 10982162.0,
"step": 673
},
{
"entropy": 0.5199308693408966,
"epoch": 2.5338345864661656,
"grad_norm": 0.04168830066919327,
"learning_rate": 0.0002,
"loss": 0.5289560556411743,
"mean_token_accuracy": 0.7860239744186401,
"num_tokens": 10998283.0,
"step": 674
},
{
"entropy": 0.5352334305644035,
"epoch": 2.537593984962406,
"grad_norm": 0.04851493611931801,
"learning_rate": 0.0002,
"loss": 0.5395171642303467,
"mean_token_accuracy": 0.781098335981369,
"num_tokens": 11014541.0,
"step": 675
},
{
"entropy": 0.5220839083194733,
"epoch": 2.5413533834586466,
"grad_norm": 0.03901033103466034,
"learning_rate": 0.0002,
"loss": 0.5202946662902832,
"mean_token_accuracy": 0.7897375226020813,
"num_tokens": 11030626.0,
"step": 676
},
{
"entropy": 0.5660356432199478,
"epoch": 2.545112781954887,
"grad_norm": 0.040614161640405655,
"learning_rate": 0.0002,
"loss": 0.5683348774909973,
"mean_token_accuracy": 0.7686392664909363,
"num_tokens": 11047170.0,
"step": 677
},
{
"entropy": 0.5248497724533081,
"epoch": 2.548872180451128,
"grad_norm": 0.050087373703718185,
"learning_rate": 0.0002,
"loss": 0.5326120257377625,
"mean_token_accuracy": 0.7856886386871338,
"num_tokens": 11063651.0,
"step": 678
},
{
"entropy": 0.5423640608787537,
"epoch": 2.5526315789473686,
"grad_norm": 0.05331513658165932,
"learning_rate": 0.0002,
"loss": 0.5449936389923096,
"mean_token_accuracy": 0.778554379940033,
"num_tokens": 11080048.0,
"step": 679
},
{
"entropy": 0.5384076982736588,
"epoch": 2.556390977443609,
"grad_norm": 0.04410131275653839,
"learning_rate": 0.0002,
"loss": 0.5350104570388794,
"mean_token_accuracy": 0.7837571948766708,
"num_tokens": 11096391.0,
"step": 680
},
{
"entropy": 0.529449462890625,
"epoch": 2.5601503759398496,
"grad_norm": 0.03738116845488548,
"learning_rate": 0.0002,
"loss": 0.5299030542373657,
"mean_token_accuracy": 0.7870044708251953,
"num_tokens": 11112709.0,
"step": 681
},
{
"entropy": 0.5311971455812454,
"epoch": 2.56390977443609,
"grad_norm": 0.04492153227329254,
"learning_rate": 0.0002,
"loss": 0.5362582206726074,
"mean_token_accuracy": 0.780634418129921,
"num_tokens": 11129093.0,
"step": 682
},
{
"entropy": 0.5400303602218628,
"epoch": 2.567669172932331,
"grad_norm": 0.036020781844854355,
"learning_rate": 0.0002,
"loss": 0.5404684543609619,
"mean_token_accuracy": 0.7825169265270233,
"num_tokens": 11145314.0,
"step": 683
},
{
"entropy": 0.5410858988761902,
"epoch": 2.571428571428571,
"grad_norm": 0.04276980832219124,
"learning_rate": 0.0002,
"loss": 0.5423122048377991,
"mean_token_accuracy": 0.7814541161060333,
"num_tokens": 11161581.0,
"step": 684
},
{
"entropy": 0.5380300432443619,
"epoch": 2.575187969924812,
"grad_norm": 0.03481379151344299,
"learning_rate": 0.0002,
"loss": 0.5358370542526245,
"mean_token_accuracy": 0.7818766683340073,
"num_tokens": 11177989.0,
"step": 685
},
{
"entropy": 0.5248596295714378,
"epoch": 2.5789473684210527,
"grad_norm": 0.036602359265089035,
"learning_rate": 0.0002,
"loss": 0.5253828763961792,
"mean_token_accuracy": 0.7854669690132141,
"num_tokens": 11194032.0,
"step": 686
},
{
"entropy": 0.5219234973192215,
"epoch": 2.582706766917293,
"grad_norm": 0.040489669889211655,
"learning_rate": 0.0002,
"loss": 0.5243583917617798,
"mean_token_accuracy": 0.786599799990654,
"num_tokens": 11210092.0,
"step": 687
},
{
"entropy": 0.5334769785404205,
"epoch": 2.5864661654135337,
"grad_norm": 0.03958981856703758,
"learning_rate": 0.0002,
"loss": 0.5376310348510742,
"mean_token_accuracy": 0.7825024574995041,
"num_tokens": 11226462.0,
"step": 688
},
{
"entropy": 0.5297794789075851,
"epoch": 2.590225563909774,
"grad_norm": 0.039997756481170654,
"learning_rate": 0.0002,
"loss": 0.5335977077484131,
"mean_token_accuracy": 0.7828920185565948,
"num_tokens": 11242781.0,
"step": 689
},
{
"entropy": 0.535497397184372,
"epoch": 2.593984962406015,
"grad_norm": 0.03865867853164673,
"learning_rate": 0.0002,
"loss": 0.5379775762557983,
"mean_token_accuracy": 0.7825619131326675,
"num_tokens": 11259131.0,
"step": 690
},
{
"entropy": 0.5340843796730042,
"epoch": 2.5977443609022557,
"grad_norm": 0.037679754197597504,
"learning_rate": 0.0002,
"loss": 0.5335901975631714,
"mean_token_accuracy": 0.7848968952894211,
"num_tokens": 11275370.0,
"step": 691
},
{
"entropy": 0.5506868213415146,
"epoch": 2.601503759398496,
"grad_norm": 0.04139415919780731,
"learning_rate": 0.0002,
"loss": 0.5515389442443848,
"mean_token_accuracy": 0.7779832780361176,
"num_tokens": 11291675.0,
"step": 692
},
{
"entropy": 0.5458535552024841,
"epoch": 2.6052631578947367,
"grad_norm": 0.03914312273263931,
"learning_rate": 0.0002,
"loss": 0.5428761839866638,
"mean_token_accuracy": 0.7802267819643021,
"num_tokens": 11308082.0,
"step": 693
},
{
"entropy": 0.5242106392979622,
"epoch": 2.6090225563909772,
"grad_norm": 0.03517727553844452,
"learning_rate": 0.0002,
"loss": 0.5183535218238831,
"mean_token_accuracy": 0.7899799644947052,
"num_tokens": 11324349.0,
"step": 694
},
{
"entropy": 0.527122899889946,
"epoch": 2.612781954887218,
"grad_norm": 0.03646351397037506,
"learning_rate": 0.0002,
"loss": 0.5237759351730347,
"mean_token_accuracy": 0.7876067459583282,
"num_tokens": 11340804.0,
"step": 695
},
{
"entropy": 0.5334932953119278,
"epoch": 2.6165413533834587,
"grad_norm": 0.03501564636826515,
"learning_rate": 0.0002,
"loss": 0.5345377326011658,
"mean_token_accuracy": 0.7828026562929153,
"num_tokens": 11357207.0,
"step": 696
},
{
"entropy": 0.5264469981193542,
"epoch": 2.6203007518796992,
"grad_norm": 0.042768895626068115,
"learning_rate": 0.0002,
"loss": 0.5306587219238281,
"mean_token_accuracy": 0.7863332629203796,
"num_tokens": 11373543.0,
"step": 697
},
{
"entropy": 0.5400331318378448,
"epoch": 2.6240601503759398,
"grad_norm": 0.03265206515789032,
"learning_rate": 0.0002,
"loss": 0.5402212142944336,
"mean_token_accuracy": 0.7809455096721649,
"num_tokens": 11390155.0,
"step": 698
},
{
"entropy": 0.5565398335456848,
"epoch": 2.6278195488721803,
"grad_norm": 0.04417556896805763,
"learning_rate": 0.0002,
"loss": 0.5573287010192871,
"mean_token_accuracy": 0.7738644480705261,
"num_tokens": 11406739.0,
"step": 699
},
{
"entropy": 0.5443829298019409,
"epoch": 2.6315789473684212,
"grad_norm": 0.03721097856760025,
"learning_rate": 0.0002,
"loss": 0.5420445799827576,
"mean_token_accuracy": 0.7787856310606003,
"num_tokens": 11423213.0,
"step": 700
},
{
"entropy": 0.5284033268690109,
"epoch": 2.6353383458646618,
"grad_norm": 0.041038673371076584,
"learning_rate": 0.0002,
"loss": 0.5301244258880615,
"mean_token_accuracy": 0.7856591492891312,
"num_tokens": 11439231.0,
"step": 701
},
{
"entropy": 0.5442045629024506,
"epoch": 2.6390977443609023,
"grad_norm": 0.03640377148985863,
"learning_rate": 0.0002,
"loss": 0.5464366674423218,
"mean_token_accuracy": 0.7776281535625458,
"num_tokens": 11455738.0,
"step": 702
},
{
"entropy": 0.5383570641279221,
"epoch": 2.642857142857143,
"grad_norm": 0.04412476718425751,
"learning_rate": 0.0002,
"loss": 0.544456422328949,
"mean_token_accuracy": 0.7783865183591843,
"num_tokens": 11471797.0,
"step": 703
},
{
"entropy": 0.5191052407026291,
"epoch": 2.6466165413533833,
"grad_norm": 0.035958074033260345,
"learning_rate": 0.0002,
"loss": 0.5193113088607788,
"mean_token_accuracy": 0.7863477617502213,
"num_tokens": 11487876.0,
"step": 704
},
{
"entropy": 0.5466601550579071,
"epoch": 2.6503759398496243,
"grad_norm": 0.048238396644592285,
"learning_rate": 0.0002,
"loss": 0.5443681478500366,
"mean_token_accuracy": 0.7801824659109116,
"num_tokens": 11504122.0,
"step": 705
},
{
"entropy": 0.5602389425039291,
"epoch": 2.654135338345865,
"grad_norm": 0.0392533615231514,
"learning_rate": 0.0002,
"loss": 0.5607460141181946,
"mean_token_accuracy": 0.7710349410772324,
"num_tokens": 11520493.0,
"step": 706
},
{
"entropy": 0.5393271297216415,
"epoch": 2.6578947368421053,
"grad_norm": 0.046152085065841675,
"learning_rate": 0.0002,
"loss": 0.5473223924636841,
"mean_token_accuracy": 0.7810050994157791,
"num_tokens": 11536519.0,
"step": 707
},
{
"entropy": 0.5321537107229233,
"epoch": 2.661654135338346,
"grad_norm": 0.038532763719558716,
"learning_rate": 0.0002,
"loss": 0.5388097763061523,
"mean_token_accuracy": 0.7796639204025269,
"num_tokens": 11552787.0,
"step": 708
},
{
"entropy": 0.5336644947528839,
"epoch": 2.6654135338345863,
"grad_norm": 0.043611474335193634,
"learning_rate": 0.0002,
"loss": 0.5328789949417114,
"mean_token_accuracy": 0.7849068492650986,
"num_tokens": 11569073.0,
"step": 709
},
{
"entropy": 0.5428521186113358,
"epoch": 2.6691729323308273,
"grad_norm": 0.03883448615670204,
"learning_rate": 0.0002,
"loss": 0.5391871333122253,
"mean_token_accuracy": 0.781522735953331,
"num_tokens": 11585504.0,
"step": 710
},
{
"entropy": 0.5335109233856201,
"epoch": 2.672932330827068,
"grad_norm": 0.03785593435168266,
"learning_rate": 0.0002,
"loss": 0.5298542976379395,
"mean_token_accuracy": 0.7834679186344147,
"num_tokens": 11601813.0,
"step": 711
},
{
"entropy": 0.527670718729496,
"epoch": 2.6766917293233083,
"grad_norm": 0.036839164793491364,
"learning_rate": 0.0002,
"loss": 0.5316509008407593,
"mean_token_accuracy": 0.7826409935951233,
"num_tokens": 11618283.0,
"step": 712
},
{
"entropy": 0.5326329097151756,
"epoch": 2.680451127819549,
"grad_norm": 0.04807848483324051,
"learning_rate": 0.0002,
"loss": 0.5426601767539978,
"mean_token_accuracy": 0.7812999188899994,
"num_tokens": 11634632.0,
"step": 713
},
{
"entropy": 0.5393012017011642,
"epoch": 2.6842105263157894,
"grad_norm": 0.038986288011074066,
"learning_rate": 0.0002,
"loss": 0.5428729057312012,
"mean_token_accuracy": 0.7807578444480896,
"num_tokens": 11650999.0,
"step": 714
},
{
"entropy": 0.5483723729848862,
"epoch": 2.6879699248120303,
"grad_norm": 0.03780362382531166,
"learning_rate": 0.0002,
"loss": 0.5442914366722107,
"mean_token_accuracy": 0.7784056067466736,
"num_tokens": 11667151.0,
"step": 715
},
{
"entropy": 0.547231912612915,
"epoch": 2.6917293233082704,
"grad_norm": 0.045203741639852524,
"learning_rate": 0.0002,
"loss": 0.5431523323059082,
"mean_token_accuracy": 0.7817295789718628,
"num_tokens": 11683514.0,
"step": 716
},
{
"entropy": 0.5371780097484589,
"epoch": 2.6954887218045114,
"grad_norm": 0.03749014437198639,
"learning_rate": 0.0002,
"loss": 0.5376321077346802,
"mean_token_accuracy": 0.7811625152826309,
"num_tokens": 11699727.0,
"step": 717
},
{
"entropy": 0.5319441854953766,
"epoch": 2.699248120300752,
"grad_norm": 0.04130973294377327,
"learning_rate": 0.0002,
"loss": 0.5348937511444092,
"mean_token_accuracy": 0.784428283572197,
"num_tokens": 11716234.0,
"step": 718
},
{
"entropy": 0.5342800319194794,
"epoch": 2.7030075187969924,
"grad_norm": 0.04313354194164276,
"learning_rate": 0.0002,
"loss": 0.5452970266342163,
"mean_token_accuracy": 0.7770380526781082,
"num_tokens": 11732506.0,
"step": 719
},
{
"entropy": 0.5398904979228973,
"epoch": 2.706766917293233,
"grad_norm": 0.04417818412184715,
"learning_rate": 0.0002,
"loss": 0.5421609878540039,
"mean_token_accuracy": 0.7809232920408249,
"num_tokens": 11748768.0,
"step": 720
},
{
"entropy": 0.5440465807914734,
"epoch": 2.7105263157894735,
"grad_norm": 0.036389391869306564,
"learning_rate": 0.0002,
"loss": 0.5376783609390259,
"mean_token_accuracy": 0.7818926721811295,
"num_tokens": 11765164.0,
"step": 721
},
{
"entropy": 0.5312932878732681,
"epoch": 2.7142857142857144,
"grad_norm": 0.037032727152109146,
"learning_rate": 0.0002,
"loss": 0.5279201865196228,
"mean_token_accuracy": 0.7845446914434433,
"num_tokens": 11781577.0,
"step": 722
},
{
"entropy": 0.5704400539398193,
"epoch": 2.718045112781955,
"grad_norm": 0.03669275715947151,
"learning_rate": 0.0002,
"loss": 0.5670531988143921,
"mean_token_accuracy": 0.7707259953022003,
"num_tokens": 11798120.0,
"step": 723
},
{
"entropy": 0.5271944850683212,
"epoch": 2.7218045112781954,
"grad_norm": 0.04460054636001587,
"learning_rate": 0.0002,
"loss": 0.531152606010437,
"mean_token_accuracy": 0.7819943279027939,
"num_tokens": 11814241.0,
"step": 724
},
{
"entropy": 0.5407906174659729,
"epoch": 2.725563909774436,
"grad_norm": 0.04240792244672775,
"learning_rate": 0.0002,
"loss": 0.5359742045402527,
"mean_token_accuracy": 0.7843276411294937,
"num_tokens": 11830762.0,
"step": 725
},
{
"entropy": 0.538364827632904,
"epoch": 2.7293233082706765,
"grad_norm": 0.04200772941112518,
"learning_rate": 0.0002,
"loss": 0.5396072864532471,
"mean_token_accuracy": 0.7798211723566055,
"num_tokens": 11847252.0,
"step": 726
},
{
"entropy": 0.5308995842933655,
"epoch": 2.7330827067669174,
"grad_norm": 0.03762137144804001,
"learning_rate": 0.0002,
"loss": 0.5341114401817322,
"mean_token_accuracy": 0.7839807718992233,
"num_tokens": 11863535.0,
"step": 727
},
{
"entropy": 0.5268086791038513,
"epoch": 2.736842105263158,
"grad_norm": 0.03609534725546837,
"learning_rate": 0.0002,
"loss": 0.5221338868141174,
"mean_token_accuracy": 0.789483904838562,
"num_tokens": 11879928.0,
"step": 728
},
{
"entropy": 0.5412466526031494,
"epoch": 2.7406015037593985,
"grad_norm": 0.040453530848026276,
"learning_rate": 0.0002,
"loss": 0.5429666042327881,
"mean_token_accuracy": 0.7812945246696472,
"num_tokens": 11896142.0,
"step": 729
},
{
"entropy": 0.5352004170417786,
"epoch": 2.744360902255639,
"grad_norm": 0.044242773205041885,
"learning_rate": 0.0002,
"loss": 0.536725640296936,
"mean_token_accuracy": 0.7831927388906479,
"num_tokens": 11912241.0,
"step": 730
},
{
"entropy": 0.5453604012727737,
"epoch": 2.7481203007518795,
"grad_norm": 0.0423831045627594,
"learning_rate": 0.0002,
"loss": 0.5527924299240112,
"mean_token_accuracy": 0.7745030075311661,
"num_tokens": 11928611.0,
"step": 731
},
{
"entropy": 0.5306564420461655,
"epoch": 2.7518796992481205,
"grad_norm": 0.0449826754629612,
"learning_rate": 0.0002,
"loss": 0.5404161214828491,
"mean_token_accuracy": 0.7825066149234772,
"num_tokens": 11944963.0,
"step": 732
},
{
"entropy": 0.5378609150648117,
"epoch": 2.755639097744361,
"grad_norm": 0.04047499597072601,
"learning_rate": 0.0002,
"loss": 0.5455936193466187,
"mean_token_accuracy": 0.7781111598014832,
"num_tokens": 11961304.0,
"step": 733
},
{
"entropy": 0.5367683172225952,
"epoch": 2.7593984962406015,
"grad_norm": 0.04174184799194336,
"learning_rate": 0.0002,
"loss": 0.5363747477531433,
"mean_token_accuracy": 0.7800599485635757,
"num_tokens": 11977719.0,
"step": 734
},
{
"entropy": 0.5561744570732117,
"epoch": 2.763157894736842,
"grad_norm": 0.04008743166923523,
"learning_rate": 0.0002,
"loss": 0.552983283996582,
"mean_token_accuracy": 0.7766020447015762,
"num_tokens": 11993844.0,
"step": 735
},
{
"entropy": 0.5463001132011414,
"epoch": 2.7669172932330826,
"grad_norm": 0.03661397472023964,
"learning_rate": 0.0002,
"loss": 0.5395646691322327,
"mean_token_accuracy": 0.7784713059663773,
"num_tokens": 12010281.0,
"step": 736
},
{
"entropy": 0.5210074186325073,
"epoch": 2.7706766917293235,
"grad_norm": 0.03591572865843773,
"learning_rate": 0.0002,
"loss": 0.5220502018928528,
"mean_token_accuracy": 0.7874239087104797,
"num_tokens": 12026530.0,
"step": 737
},
{
"entropy": 0.5433954000473022,
"epoch": 2.774436090225564,
"grad_norm": 0.04104798287153244,
"learning_rate": 0.0002,
"loss": 0.5510661005973816,
"mean_token_accuracy": 0.7753429859876633,
"num_tokens": 12042889.0,
"step": 738
},
{
"entropy": 0.5119400694966316,
"epoch": 2.7781954887218046,
"grad_norm": 0.039529718458652496,
"learning_rate": 0.0002,
"loss": 0.5171459317207336,
"mean_token_accuracy": 0.7895881831645966,
"num_tokens": 12059138.0,
"step": 739
},
{
"entropy": 0.5456018000841141,
"epoch": 2.781954887218045,
"grad_norm": 0.03834446892142296,
"learning_rate": 0.0002,
"loss": 0.5516197681427002,
"mean_token_accuracy": 0.7791079431772232,
"num_tokens": 12075629.0,
"step": 740
},
{
"entropy": 0.5416502356529236,
"epoch": 2.7857142857142856,
"grad_norm": 0.03950374945998192,
"learning_rate": 0.0002,
"loss": 0.541545033454895,
"mean_token_accuracy": 0.7776272892951965,
"num_tokens": 12091966.0,
"step": 741
},
{
"entropy": 0.5439035892486572,
"epoch": 2.7894736842105265,
"grad_norm": 0.03714444488286972,
"learning_rate": 0.0002,
"loss": 0.5373456478118896,
"mean_token_accuracy": 0.7819632142782211,
"num_tokens": 12108429.0,
"step": 742
},
{
"entropy": 0.5513075590133667,
"epoch": 2.793233082706767,
"grad_norm": 0.03567977994680405,
"learning_rate": 0.0002,
"loss": 0.5416471362113953,
"mean_token_accuracy": 0.7816196233034134,
"num_tokens": 12124997.0,
"step": 743
},
{
"entropy": 0.5525044798851013,
"epoch": 2.7969924812030076,
"grad_norm": 0.036792755126953125,
"learning_rate": 0.0002,
"loss": 0.5522248148918152,
"mean_token_accuracy": 0.7766036689281464,
"num_tokens": 12141338.0,
"step": 744
},
{
"entropy": 0.522551566362381,
"epoch": 2.800751879699248,
"grad_norm": 0.03983981907367706,
"learning_rate": 0.0002,
"loss": 0.5232869982719421,
"mean_token_accuracy": 0.7857565432786942,
"num_tokens": 12157683.0,
"step": 745
},
{
"entropy": 0.5314129739999771,
"epoch": 2.8045112781954886,
"grad_norm": 0.03918331488966942,
"learning_rate": 0.0002,
"loss": 0.5321224927902222,
"mean_token_accuracy": 0.7834707945585251,
"num_tokens": 12174145.0,
"step": 746
},
{
"entropy": 0.5208713561296463,
"epoch": 2.8082706766917296,
"grad_norm": 0.03813806548714638,
"learning_rate": 0.0002,
"loss": 0.5278118848800659,
"mean_token_accuracy": 0.7842634320259094,
"num_tokens": 12190434.0,
"step": 747
},
{
"entropy": 0.5349813252687454,
"epoch": 2.8120300751879697,
"grad_norm": 0.04137561097741127,
"learning_rate": 0.0002,
"loss": 0.5378336906433105,
"mean_token_accuracy": 0.7831988483667374,
"num_tokens": 12206552.0,
"step": 748
},
{
"entropy": 0.529716819524765,
"epoch": 2.8157894736842106,
"grad_norm": 0.037089038640260696,
"learning_rate": 0.0002,
"loss": 0.530727744102478,
"mean_token_accuracy": 0.787126213312149,
"num_tokens": 12222985.0,
"step": 749
},
{
"entropy": 0.5329919755458832,
"epoch": 2.819548872180451,
"grad_norm": 0.03868598863482475,
"learning_rate": 0.0002,
"loss": 0.535510241985321,
"mean_token_accuracy": 0.7821749895811081,
"num_tokens": 12239387.0,
"step": 750
},
{
"entropy": 0.5512770563364029,
"epoch": 2.8233082706766917,
"grad_norm": 0.03504098951816559,
"learning_rate": 0.0002,
"loss": 0.5498230457305908,
"mean_token_accuracy": 0.77789406478405,
"num_tokens": 12255678.0,
"step": 751
},
{
"entropy": 0.5387983024120331,
"epoch": 2.827067669172932,
"grad_norm": 0.04012952372431755,
"learning_rate": 0.0002,
"loss": 0.5449475049972534,
"mean_token_accuracy": 0.7773616015911102,
"num_tokens": 12271735.0,
"step": 752
},
{
"entropy": 0.5438449382781982,
"epoch": 2.8308270676691727,
"grad_norm": 0.04448486492037773,
"learning_rate": 0.0002,
"loss": 0.5473355650901794,
"mean_token_accuracy": 0.7765258699655533,
"num_tokens": 12288034.0,
"step": 753
},
{
"entropy": 0.5242600291967392,
"epoch": 2.8345864661654137,
"grad_norm": 0.03874325752258301,
"learning_rate": 0.0002,
"loss": 0.5232968330383301,
"mean_token_accuracy": 0.7877610623836517,
"num_tokens": 12304188.0,
"step": 754
},
{
"entropy": 0.5431344211101532,
"epoch": 2.838345864661654,
"grad_norm": 0.04510108754038811,
"learning_rate": 0.0002,
"loss": 0.5374618768692017,
"mean_token_accuracy": 0.783510684967041,
"num_tokens": 12320210.0,
"step": 755
},
{
"entropy": 0.566683366894722,
"epoch": 2.8421052631578947,
"grad_norm": 0.038339611142873764,
"learning_rate": 0.0002,
"loss": 0.5602604746818542,
"mean_token_accuracy": 0.7746738642454147,
"num_tokens": 12336736.0,
"step": 756
},
{
"entropy": 0.5256731361150742,
"epoch": 2.845864661654135,
"grad_norm": 0.04725516587495804,
"learning_rate": 0.0002,
"loss": 0.5308937430381775,
"mean_token_accuracy": 0.7819661647081375,
"num_tokens": 12353304.0,
"step": 757
},
{
"entropy": 0.5368983596563339,
"epoch": 2.8496240601503757,
"grad_norm": 0.04469098895788193,
"learning_rate": 0.0002,
"loss": 0.5494676828384399,
"mean_token_accuracy": 0.7781397998332977,
"num_tokens": 12369897.0,
"step": 758
},
{
"entropy": 0.5407442450523376,
"epoch": 2.8533834586466167,
"grad_norm": 0.04544219374656677,
"learning_rate": 0.0002,
"loss": 0.5484528541564941,
"mean_token_accuracy": 0.7776692062616348,
"num_tokens": 12385920.0,
"step": 759
},
{
"entropy": 0.5232048332691193,
"epoch": 2.857142857142857,
"grad_norm": 0.03687431663274765,
"learning_rate": 0.0002,
"loss": 0.5165009498596191,
"mean_token_accuracy": 0.789492592215538,
"num_tokens": 12402444.0,
"step": 760
},
{
"entropy": 0.5273272693157196,
"epoch": 2.8609022556390977,
"grad_norm": 0.037794262170791626,
"learning_rate": 0.0002,
"loss": 0.5232701301574707,
"mean_token_accuracy": 0.788696900010109,
"num_tokens": 12418988.0,
"step": 761
},
{
"entropy": 0.5304031819105148,
"epoch": 2.8646616541353382,
"grad_norm": 0.038420420140028,
"learning_rate": 0.0002,
"loss": 0.5247512459754944,
"mean_token_accuracy": 0.7857597023248672,
"num_tokens": 12435536.0,
"step": 762
},
{
"entropy": 0.5269620269536972,
"epoch": 2.8684210526315788,
"grad_norm": 0.04084121063351631,
"learning_rate": 0.0002,
"loss": 0.5284534692764282,
"mean_token_accuracy": 0.7831205129623413,
"num_tokens": 12451737.0,
"step": 763
},
{
"entropy": 0.5162742882966995,
"epoch": 2.8721804511278197,
"grad_norm": 0.04410441219806671,
"learning_rate": 0.0002,
"loss": 0.5282053351402283,
"mean_token_accuracy": 0.7836557477712631,
"num_tokens": 12467925.0,
"step": 764
},
{
"entropy": 0.5351501703262329,
"epoch": 2.8759398496240602,
"grad_norm": 0.04215250536799431,
"learning_rate": 0.0002,
"loss": 0.5436667799949646,
"mean_token_accuracy": 0.7797116935253143,
"num_tokens": 12484385.0,
"step": 765
},
{
"entropy": 0.5445809066295624,
"epoch": 2.8796992481203008,
"grad_norm": 0.039003774523735046,
"learning_rate": 0.0002,
"loss": 0.5466570854187012,
"mean_token_accuracy": 0.7810900658369064,
"num_tokens": 12500782.0,
"step": 766
},
{
"entropy": 0.5677538812160492,
"epoch": 2.8834586466165413,
"grad_norm": 0.038001179695129395,
"learning_rate": 0.0002,
"loss": 0.5561648011207581,
"mean_token_accuracy": 0.7711465805768967,
"num_tokens": 12517241.0,
"step": 767
},
{
"entropy": 0.5477330982685089,
"epoch": 2.887218045112782,
"grad_norm": 0.03719984367489815,
"learning_rate": 0.0002,
"loss": 0.5399020910263062,
"mean_token_accuracy": 0.7845228165388107,
"num_tokens": 12533645.0,
"step": 768
},
{
"entropy": 0.5322476327419281,
"epoch": 2.8909774436090228,
"grad_norm": 0.04132302105426788,
"learning_rate": 0.0002,
"loss": 0.5327161550521851,
"mean_token_accuracy": 0.7837435156106949,
"num_tokens": 12550190.0,
"step": 769
},
{
"entropy": 0.5217838287353516,
"epoch": 2.8947368421052633,
"grad_norm": 0.041548822075128555,
"learning_rate": 0.0002,
"loss": 0.5239148139953613,
"mean_token_accuracy": 0.7885714769363403,
"num_tokens": 12566418.0,
"step": 770
},
{
"entropy": 0.5343627035617828,
"epoch": 2.898496240601504,
"grad_norm": 0.04029269516468048,
"learning_rate": 0.0002,
"loss": 0.5422418117523193,
"mean_token_accuracy": 0.7791919559240341,
"num_tokens": 12582647.0,
"step": 771
},
{
"entropy": 0.5284289866685867,
"epoch": 2.9022556390977443,
"grad_norm": 0.04448118433356285,
"learning_rate": 0.0002,
"loss": 0.5392597913742065,
"mean_token_accuracy": 0.7816968858242035,
"num_tokens": 12598795.0,
"step": 772
},
{
"entropy": 0.5162788778543472,
"epoch": 2.906015037593985,
"grad_norm": 0.04028403386473656,
"learning_rate": 0.0002,
"loss": 0.521114706993103,
"mean_token_accuracy": 0.7890318781137466,
"num_tokens": 12615105.0,
"step": 773
},
{
"entropy": 0.5632917135953903,
"epoch": 2.909774436090226,
"grad_norm": 0.04001300409436226,
"learning_rate": 0.0002,
"loss": 0.5603697299957275,
"mean_token_accuracy": 0.7751758396625519,
"num_tokens": 12631390.0,
"step": 774
},
{
"entropy": 0.5503305643796921,
"epoch": 2.9135338345864663,
"grad_norm": 0.03347298875451088,
"learning_rate": 0.0002,
"loss": 0.5459069609642029,
"mean_token_accuracy": 0.7786167114973068,
"num_tokens": 12647885.0,
"step": 775
},
{
"entropy": 0.5473008453845978,
"epoch": 2.917293233082707,
"grad_norm": 0.03752491995692253,
"learning_rate": 0.0002,
"loss": 0.5333649516105652,
"mean_token_accuracy": 0.7828412652015686,
"num_tokens": 12664120.0,
"step": 776
},
{
"entropy": 0.5354459285736084,
"epoch": 2.9210526315789473,
"grad_norm": 0.04058157652616501,
"learning_rate": 0.0002,
"loss": 0.5341867208480835,
"mean_token_accuracy": 0.7867896258831024,
"num_tokens": 12680500.0,
"step": 777
},
{
"entropy": 0.5142473876476288,
"epoch": 2.924812030075188,
"grad_norm": 0.04209408536553383,
"learning_rate": 0.0002,
"loss": 0.5206042528152466,
"mean_token_accuracy": 0.7850682884454727,
"num_tokens": 12696593.0,
"step": 778
},
{
"entropy": 0.5365364253520966,
"epoch": 2.928571428571429,
"grad_norm": 0.04453515261411667,
"learning_rate": 0.0002,
"loss": 0.545800507068634,
"mean_token_accuracy": 0.7796301394701004,
"num_tokens": 12712691.0,
"step": 779
},
{
"entropy": 0.542564183473587,
"epoch": 2.932330827067669,
"grad_norm": 0.03840424865484238,
"learning_rate": 0.0002,
"loss": 0.5449208617210388,
"mean_token_accuracy": 0.778635174036026,
"num_tokens": 12729062.0,
"step": 780
},
{
"entropy": 0.5423157215118408,
"epoch": 2.93609022556391,
"grad_norm": 0.0474003404378891,
"learning_rate": 0.0002,
"loss": 0.5478240251541138,
"mean_token_accuracy": 0.7766861170530319,
"num_tokens": 12745381.0,
"step": 781
},
{
"entropy": 0.5361933559179306,
"epoch": 2.9398496240601504,
"grad_norm": 0.037907540798187256,
"learning_rate": 0.0002,
"loss": 0.5324196815490723,
"mean_token_accuracy": 0.7846821397542953,
"num_tokens": 12761688.0,
"step": 782
},
{
"entropy": 0.5589640736579895,
"epoch": 2.943609022556391,
"grad_norm": 0.04339439421892166,
"learning_rate": 0.0002,
"loss": 0.5444428324699402,
"mean_token_accuracy": 0.7806793451309204,
"num_tokens": 12778289.0,
"step": 783
},
{
"entropy": 0.5389928370714188,
"epoch": 2.9473684210526314,
"grad_norm": 0.03586737811565399,
"learning_rate": 0.0002,
"loss": 0.5383816957473755,
"mean_token_accuracy": 0.7810381203889847,
"num_tokens": 12794954.0,
"step": 784
},
{
"entropy": 0.5266241282224655,
"epoch": 2.951127819548872,
"grad_norm": 0.03784513846039772,
"learning_rate": 0.0002,
"loss": 0.5282174348831177,
"mean_token_accuracy": 0.7867349982261658,
"num_tokens": 12811150.0,
"step": 785
},
{
"entropy": 0.5349175482988358,
"epoch": 2.954887218045113,
"grad_norm": 0.04314623400568962,
"learning_rate": 0.0002,
"loss": 0.5450260043144226,
"mean_token_accuracy": 0.7768904566764832,
"num_tokens": 12827293.0,
"step": 786
},
{
"entropy": 0.5137490779161453,
"epoch": 2.9586466165413534,
"grad_norm": 0.04252813383936882,
"learning_rate": 0.0002,
"loss": 0.5246796011924744,
"mean_token_accuracy": 0.7863982170820236,
"num_tokens": 12843307.0,
"step": 787
},
{
"entropy": 0.5352135896682739,
"epoch": 2.962406015037594,
"grad_norm": 0.045887961983680725,
"learning_rate": 0.0002,
"loss": 0.5371412634849548,
"mean_token_accuracy": 0.7804872691631317,
"num_tokens": 12859595.0,
"step": 788
},
{
"entropy": 0.5446542203426361,
"epoch": 2.9661654135338344,
"grad_norm": 0.04673901945352554,
"learning_rate": 0.0002,
"loss": 0.5501778721809387,
"mean_token_accuracy": 0.7773697823286057,
"num_tokens": 12875931.0,
"step": 789
},
{
"entropy": 0.5408057272434235,
"epoch": 2.969924812030075,
"grad_norm": 0.0367148295044899,
"learning_rate": 0.0002,
"loss": 0.5386841297149658,
"mean_token_accuracy": 0.779689833521843,
"num_tokens": 12892289.0,
"step": 790
},
{
"entropy": 0.538294106721878,
"epoch": 2.973684210526316,
"grad_norm": 0.035284459590911865,
"learning_rate": 0.0002,
"loss": 0.5302733778953552,
"mean_token_accuracy": 0.7843924909830093,
"num_tokens": 12908646.0,
"step": 791
},
{
"entropy": 0.5408864170312881,
"epoch": 2.9774436090225564,
"grad_norm": 0.03952067717909813,
"learning_rate": 0.0002,
"loss": 0.5328561663627625,
"mean_token_accuracy": 0.7823582589626312,
"num_tokens": 12924940.0,
"step": 792
},
{
"entropy": 0.5341958701610565,
"epoch": 2.981203007518797,
"grad_norm": 0.03711646795272827,
"learning_rate": 0.0002,
"loss": 0.5313258767127991,
"mean_token_accuracy": 0.7841775417327881,
"num_tokens": 12941104.0,
"step": 793
},
{
"entropy": 0.5351585075259209,
"epoch": 2.9849624060150375,
"grad_norm": 0.04043775424361229,
"learning_rate": 0.0002,
"loss": 0.5411684513092041,
"mean_token_accuracy": 0.7801253944635391,
"num_tokens": 12957327.0,
"step": 794
},
{
"entropy": 0.5278606861829758,
"epoch": 2.988721804511278,
"grad_norm": 0.04125319793820381,
"learning_rate": 0.0002,
"loss": 0.5394368171691895,
"mean_token_accuracy": 0.7814257442951202,
"num_tokens": 12973968.0,
"step": 795
},
{
"entropy": 0.5424105674028397,
"epoch": 2.992481203007519,
"grad_norm": 0.04019284248352051,
"learning_rate": 0.0002,
"loss": 0.5428224802017212,
"mean_token_accuracy": 0.7811149209737778,
"num_tokens": 12990151.0,
"step": 796
},
{
"entropy": 0.526485025882721,
"epoch": 2.9962406015037595,
"grad_norm": 0.04355369135737419,
"learning_rate": 0.0002,
"loss": 0.524267315864563,
"mean_token_accuracy": 0.7883585393428802,
"num_tokens": 13006619.0,
"step": 797
},
{
"entropy": 0.5499685406684875,
"epoch": 3.0,
"grad_norm": 0.04084917902946472,
"learning_rate": 0.0002,
"loss": 0.5499616265296936,
"mean_token_accuracy": 0.7766987532377243,
"num_tokens": 13023154.0,
"step": 798
}
],
"logging_steps": 1,
"max_steps": 798,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2137387169173996e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}