foam-gpt-oss-120B / trainer_state.json
finalform's picture
Upload 8 files
8a86a22 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 1385,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.5288876295089722,
"epoch": 0.0036199095022624436,
"grad_norm": 7.694402694702148,
"learning_rate": 0.0,
"loss": 1.3638800382614136,
"mean_token_accuracy": 0.8390376716852188,
"num_tokens": 9306.0,
"step": 1
},
{
"entropy": 1.4891301393508911,
"epoch": 0.007239819004524887,
"grad_norm": 10.345961570739746,
"learning_rate": 2.9850746268656716e-06,
"loss": 1.404976725578308,
"mean_token_accuracy": 0.8416248112916946,
"num_tokens": 18426.0,
"step": 2
},
{
"entropy": 1.5589642226696014,
"epoch": 0.01085972850678733,
"grad_norm": 7.015953063964844,
"learning_rate": 5.970149253731343e-06,
"loss": 1.5137574672698975,
"mean_token_accuracy": 0.844389408826828,
"num_tokens": 27017.0,
"step": 3
},
{
"entropy": 1.5128743052482605,
"epoch": 0.014479638009049774,
"grad_norm": 8.688935279846191,
"learning_rate": 8.955223880597016e-06,
"loss": 1.429337501525879,
"mean_token_accuracy": 0.8436507284641266,
"num_tokens": 36186.0,
"step": 4
},
{
"entropy": 1.5162458419799805,
"epoch": 0.01809954751131222,
"grad_norm": 18.12025260925293,
"learning_rate": 1.1940298507462686e-05,
"loss": 1.5170090198516846,
"mean_token_accuracy": 0.8127347379922867,
"num_tokens": 45259.0,
"step": 5
},
{
"entropy": 1.607073962688446,
"epoch": 0.02171945701357466,
"grad_norm": 11.967021942138672,
"learning_rate": 1.4925373134328357e-05,
"loss": 1.7809343338012695,
"mean_token_accuracy": 0.7834140509366989,
"num_tokens": 53870.0,
"step": 6
},
{
"entropy": 1.5723404288291931,
"epoch": 0.025339366515837104,
"grad_norm": 7.197022438049316,
"learning_rate": 1.791044776119403e-05,
"loss": 1.355630874633789,
"mean_token_accuracy": 0.8707718253135681,
"num_tokens": 62422.0,
"step": 7
},
{
"entropy": 1.6577945351600647,
"epoch": 0.02895927601809955,
"grad_norm": 9.124281883239746,
"learning_rate": 2.0895522388059702e-05,
"loss": 1.5860857963562012,
"mean_token_accuracy": 0.8311486840248108,
"num_tokens": 70836.0,
"step": 8
},
{
"entropy": 1.5083436369895935,
"epoch": 0.03257918552036199,
"grad_norm": 9.471440315246582,
"learning_rate": 2.3880597014925373e-05,
"loss": 1.4798086881637573,
"mean_token_accuracy": 0.8188015669584274,
"num_tokens": 79489.0,
"step": 9
},
{
"entropy": 1.5417097806930542,
"epoch": 0.03619909502262444,
"grad_norm": 6.9740309715271,
"learning_rate": 2.6865671641791047e-05,
"loss": 1.4000660181045532,
"mean_token_accuracy": 0.8296933174133301,
"num_tokens": 88400.0,
"step": 10
},
{
"entropy": 1.6839916408061981,
"epoch": 0.039819004524886875,
"grad_norm": 8.314177513122559,
"learning_rate": 2.9850746268656714e-05,
"loss": 1.3732950687408447,
"mean_token_accuracy": 0.8450545966625214,
"num_tokens": 97018.0,
"step": 11
},
{
"entropy": 1.7210606038570404,
"epoch": 0.04343891402714932,
"grad_norm": 6.364591598510742,
"learning_rate": 3.283582089552239e-05,
"loss": 1.2142231464385986,
"mean_token_accuracy": 0.8527437746524811,
"num_tokens": 105866.0,
"step": 12
},
{
"entropy": 1.6527923345565796,
"epoch": 0.047058823529411764,
"grad_norm": 4.993825912475586,
"learning_rate": 3.582089552238806e-05,
"loss": 0.9318434000015259,
"mean_token_accuracy": 0.8724203705787659,
"num_tokens": 114999.0,
"step": 13
},
{
"entropy": 1.7282630801200867,
"epoch": 0.05067873303167421,
"grad_norm": 4.304642677307129,
"learning_rate": 3.8805970149253736e-05,
"loss": 0.9089325070381165,
"mean_token_accuracy": 0.8798395097255707,
"num_tokens": 124184.0,
"step": 14
},
{
"entropy": 1.8163867890834808,
"epoch": 0.05429864253393665,
"grad_norm": 3.7051150798797607,
"learning_rate": 4.1791044776119404e-05,
"loss": 0.7500128746032715,
"mean_token_accuracy": 0.8938957899808884,
"num_tokens": 132985.0,
"step": 15
},
{
"entropy": 2.0243027210235596,
"epoch": 0.0579185520361991,
"grad_norm": 4.971452236175537,
"learning_rate": 4.477611940298508e-05,
"loss": 1.0864768028259277,
"mean_token_accuracy": 0.8368343859910965,
"num_tokens": 141599.0,
"step": 16
},
{
"entropy": 2.125125467777252,
"epoch": 0.06153846153846154,
"grad_norm": 4.845816612243652,
"learning_rate": 4.7761194029850745e-05,
"loss": 0.7839944958686829,
"mean_token_accuracy": 0.8883605301380157,
"num_tokens": 149961.0,
"step": 17
},
{
"entropy": 2.167099416255951,
"epoch": 0.06515837104072399,
"grad_norm": 4.479213237762451,
"learning_rate": 5.074626865671642e-05,
"loss": 0.6522338390350342,
"mean_token_accuracy": 0.8985295295715332,
"num_tokens": 158394.0,
"step": 18
},
{
"entropy": 2.3476614952087402,
"epoch": 0.06877828054298643,
"grad_norm": 4.596512794494629,
"learning_rate": 5.373134328358209e-05,
"loss": 0.5884965062141418,
"mean_token_accuracy": 0.8780558109283447,
"num_tokens": 167295.0,
"step": 19
},
{
"entropy": 2.620903968811035,
"epoch": 0.07239819004524888,
"grad_norm": 3.99661922454834,
"learning_rate": 5.671641791044776e-05,
"loss": 0.6179074645042419,
"mean_token_accuracy": 0.875989705324173,
"num_tokens": 176255.0,
"step": 20
},
{
"entropy": 2.663840651512146,
"epoch": 0.0760180995475113,
"grad_norm": 2.395817518234253,
"learning_rate": 5.970149253731343e-05,
"loss": 0.5167301893234253,
"mean_token_accuracy": 0.8798592388629913,
"num_tokens": 185601.0,
"step": 21
},
{
"entropy": 3.007373869419098,
"epoch": 0.07963800904977375,
"grad_norm": 1.9023845195770264,
"learning_rate": 6.268656716417911e-05,
"loss": 0.4969954788684845,
"mean_token_accuracy": 0.8838344216346741,
"num_tokens": 194036.0,
"step": 22
},
{
"entropy": 3.014187455177307,
"epoch": 0.0832579185520362,
"grad_norm": 1.0483063459396362,
"learning_rate": 6.567164179104478e-05,
"loss": 0.4313647150993347,
"mean_token_accuracy": 0.8821887522935867,
"num_tokens": 203167.0,
"step": 23
},
{
"entropy": 3.5317789912223816,
"epoch": 0.08687782805429864,
"grad_norm": 1.9082902669906616,
"learning_rate": 6.865671641791044e-05,
"loss": 0.6452760100364685,
"mean_token_accuracy": 0.8487882167100906,
"num_tokens": 211791.0,
"step": 24
},
{
"entropy": 3.470491588115692,
"epoch": 0.09049773755656108,
"grad_norm": 1.3330037593841553,
"learning_rate": 7.164179104477612e-05,
"loss": 0.609681248664856,
"mean_token_accuracy": 0.8559600114822388,
"num_tokens": 220855.0,
"step": 25
},
{
"entropy": 3.8751351833343506,
"epoch": 0.09411764705882353,
"grad_norm": 1.864700436592102,
"learning_rate": 7.46268656716418e-05,
"loss": 0.5049571394920349,
"mean_token_accuracy": 0.8841463923454285,
"num_tokens": 229389.0,
"step": 26
},
{
"entropy": 3.706156551837921,
"epoch": 0.09773755656108597,
"grad_norm": 1.7854461669921875,
"learning_rate": 7.761194029850747e-05,
"loss": 0.39277932047843933,
"mean_token_accuracy": 0.9083494395017624,
"num_tokens": 238523.0,
"step": 27
},
{
"entropy": 3.8404606580734253,
"epoch": 0.10135746606334842,
"grad_norm": 2.3090603351593018,
"learning_rate": 8.059701492537314e-05,
"loss": 0.35173487663269043,
"mean_token_accuracy": 0.9240767657756805,
"num_tokens": 247228.0,
"step": 28
},
{
"entropy": 4.031954348087311,
"epoch": 0.10497737556561086,
"grad_norm": 1.6039751768112183,
"learning_rate": 8.358208955223881e-05,
"loss": 0.5736312866210938,
"mean_token_accuracy": 0.8665709495544434,
"num_tokens": 255766.0,
"step": 29
},
{
"entropy": 4.0528751611709595,
"epoch": 0.1085972850678733,
"grad_norm": 1.5278459787368774,
"learning_rate": 8.656716417910447e-05,
"loss": 0.46521249413490295,
"mean_token_accuracy": 0.8837475925683975,
"num_tokens": 264515.0,
"step": 30
},
{
"entropy": 3.7500529289245605,
"epoch": 0.11221719457013575,
"grad_norm": 1.2679299116134644,
"learning_rate": 8.955223880597016e-05,
"loss": 0.6649113297462463,
"mean_token_accuracy": 0.8520247489213943,
"num_tokens": 273421.0,
"step": 31
},
{
"entropy": 3.870599329471588,
"epoch": 0.1158371040723982,
"grad_norm": 1.4248121976852417,
"learning_rate": 9.253731343283582e-05,
"loss": 0.2973020672798157,
"mean_token_accuracy": 0.9219554513692856,
"num_tokens": 281883.0,
"step": 32
},
{
"entropy": 3.5264278650283813,
"epoch": 0.11945701357466064,
"grad_norm": 0.7345032691955566,
"learning_rate": 9.552238805970149e-05,
"loss": 0.504949688911438,
"mean_token_accuracy": 0.9012798517942429,
"num_tokens": 290801.0,
"step": 33
},
{
"entropy": 3.6929972171783447,
"epoch": 0.12307692307692308,
"grad_norm": 1.0366969108581543,
"learning_rate": 9.850746268656717e-05,
"loss": 0.35403263568878174,
"mean_token_accuracy": 0.9262522161006927,
"num_tokens": 299165.0,
"step": 34
},
{
"entropy": 3.458252251148224,
"epoch": 0.12669683257918551,
"grad_norm": 0.8196644186973572,
"learning_rate": 0.00010149253731343284,
"loss": 0.46754205226898193,
"mean_token_accuracy": 0.8908957839012146,
"num_tokens": 307618.0,
"step": 35
},
{
"entropy": 3.502661347389221,
"epoch": 0.13031674208144797,
"grad_norm": 0.8469979166984558,
"learning_rate": 0.0001044776119402985,
"loss": 0.377693772315979,
"mean_token_accuracy": 0.9080320745706558,
"num_tokens": 316466.0,
"step": 36
},
{
"entropy": 3.3003416061401367,
"epoch": 0.1339366515837104,
"grad_norm": 0.7982582449913025,
"learning_rate": 0.00010746268656716419,
"loss": 0.3522840142250061,
"mean_token_accuracy": 0.9149231612682343,
"num_tokens": 325197.0,
"step": 37
},
{
"entropy": 3.265174388885498,
"epoch": 0.13755656108597286,
"grad_norm": 0.7812036275863647,
"learning_rate": 0.00011044776119402987,
"loss": 0.33715564012527466,
"mean_token_accuracy": 0.9169812202453613,
"num_tokens": 334246.0,
"step": 38
},
{
"entropy": 3.2711930871009827,
"epoch": 0.1411764705882353,
"grad_norm": 0.6879589557647705,
"learning_rate": 0.00011343283582089552,
"loss": 0.22535352408885956,
"mean_token_accuracy": 0.9371069073677063,
"num_tokens": 342753.0,
"step": 39
},
{
"entropy": 3.039187252521515,
"epoch": 0.14479638009049775,
"grad_norm": 0.7044833898544312,
"learning_rate": 0.0001164179104477612,
"loss": 0.269231379032135,
"mean_token_accuracy": 0.9326638281345367,
"num_tokens": 351703.0,
"step": 40
},
{
"entropy": 3.1341193318367004,
"epoch": 0.14841628959276018,
"grad_norm": 0.7598081231117249,
"learning_rate": 0.00011940298507462686,
"loss": 0.23056557774543762,
"mean_token_accuracy": 0.9387440532445908,
"num_tokens": 360201.0,
"step": 41
},
{
"entropy": 3.1592912673950195,
"epoch": 0.1520361990950226,
"grad_norm": 0.9636098146438599,
"learning_rate": 0.00012238805970149255,
"loss": 0.4810163974761963,
"mean_token_accuracy": 0.8872013241052628,
"num_tokens": 368978.0,
"step": 42
},
{
"entropy": 2.778893828392029,
"epoch": 0.15565610859728507,
"grad_norm": 0.5792455077171326,
"learning_rate": 0.00012537313432835822,
"loss": 0.21380706131458282,
"mean_token_accuracy": 0.9375593662261963,
"num_tokens": 378445.0,
"step": 43
},
{
"entropy": 2.7576374411582947,
"epoch": 0.1592760180995475,
"grad_norm": 0.9693785905838013,
"learning_rate": 0.00012835820895522389,
"loss": 0.2909581959247589,
"mean_token_accuracy": 0.9316168874502182,
"num_tokens": 387672.0,
"step": 44
},
{
"entropy": 2.5791444182395935,
"epoch": 0.16289592760180996,
"grad_norm": 0.6936656832695007,
"learning_rate": 0.00013134328358208955,
"loss": 0.32536542415618896,
"mean_token_accuracy": 0.9312366247177124,
"num_tokens": 396735.0,
"step": 45
},
{
"entropy": 2.483812391757965,
"epoch": 0.1665158371040724,
"grad_norm": 0.8131234049797058,
"learning_rate": 0.00013432835820895525,
"loss": 0.4375811219215393,
"mean_token_accuracy": 0.8777281790971756,
"num_tokens": 405449.0,
"step": 46
},
{
"entropy": 1.9735961556434631,
"epoch": 0.17013574660633485,
"grad_norm": 0.7072490453720093,
"learning_rate": 0.0001373134328358209,
"loss": 0.3115054965019226,
"mean_token_accuracy": 0.9156161844730377,
"num_tokens": 414791.0,
"step": 47
},
{
"entropy": 1.7938538491725922,
"epoch": 0.17375565610859728,
"grad_norm": 0.8033711314201355,
"learning_rate": 0.00014029850746268658,
"loss": 0.4930950999259949,
"mean_token_accuracy": 0.8819727599620819,
"num_tokens": 423449.0,
"step": 48
},
{
"entropy": 1.6752981543540955,
"epoch": 0.17737556561085974,
"grad_norm": 0.4999159574508667,
"learning_rate": 0.00014328358208955225,
"loss": 0.2053433656692505,
"mean_token_accuracy": 0.952075183391571,
"num_tokens": 432347.0,
"step": 49
},
{
"entropy": 1.6547318398952484,
"epoch": 0.18099547511312217,
"grad_norm": 5.030092716217041,
"learning_rate": 0.00014626865671641792,
"loss": 0.24254727363586426,
"mean_token_accuracy": 0.9326749891042709,
"num_tokens": 441197.0,
"step": 50
},
{
"entropy": 1.6684256792068481,
"epoch": 0.18461538461538463,
"grad_norm": 0.5943530201911926,
"learning_rate": 0.0001492537313432836,
"loss": 0.3166338801383972,
"mean_token_accuracy": 0.9284558445215225,
"num_tokens": 450481.0,
"step": 51
},
{
"entropy": 1.6687548756599426,
"epoch": 0.18823529411764706,
"grad_norm": 0.6136987805366516,
"learning_rate": 0.00015223880597014925,
"loss": 0.31150421500205994,
"mean_token_accuracy": 0.9149410724639893,
"num_tokens": 459376.0,
"step": 52
},
{
"entropy": 1.6935299038887024,
"epoch": 0.19185520361990951,
"grad_norm": 0.6838834285736084,
"learning_rate": 0.00015522388059701495,
"loss": 0.39871394634246826,
"mean_token_accuracy": 0.9073809385299683,
"num_tokens": 468010.0,
"step": 53
},
{
"entropy": 1.5816974937915802,
"epoch": 0.19547511312217195,
"grad_norm": 0.5880348086357117,
"learning_rate": 0.00015820895522388059,
"loss": 0.25584831833839417,
"mean_token_accuracy": 0.9261536300182343,
"num_tokens": 476753.0,
"step": 54
},
{
"entropy": 1.5977029204368591,
"epoch": 0.19909502262443438,
"grad_norm": 0.5524119734764099,
"learning_rate": 0.00016119402985074628,
"loss": 0.27022480964660645,
"mean_token_accuracy": 0.9257165640592575,
"num_tokens": 485781.0,
"step": 55
},
{
"entropy": 1.6373226642608643,
"epoch": 0.20271493212669683,
"grad_norm": 0.6135741472244263,
"learning_rate": 0.00016417910447761195,
"loss": 0.263553261756897,
"mean_token_accuracy": 0.93193618953228,
"num_tokens": 494688.0,
"step": 56
},
{
"entropy": 1.5185258090496063,
"epoch": 0.20633484162895926,
"grad_norm": 0.5645662546157837,
"learning_rate": 0.00016716417910447761,
"loss": 0.297269344329834,
"mean_token_accuracy": 0.9199682921171188,
"num_tokens": 503958.0,
"step": 57
},
{
"entropy": 1.530813604593277,
"epoch": 0.20995475113122172,
"grad_norm": 0.5128429532051086,
"learning_rate": 0.00017014925373134328,
"loss": 0.3574504554271698,
"mean_token_accuracy": 0.9292190074920654,
"num_tokens": 513417.0,
"step": 58
},
{
"entropy": 1.5204592645168304,
"epoch": 0.21357466063348415,
"grad_norm": 0.5484449863433838,
"learning_rate": 0.00017313432835820895,
"loss": 0.2696574032306671,
"mean_token_accuracy": 0.9359241276979446,
"num_tokens": 522380.0,
"step": 59
},
{
"entropy": 1.5577877461910248,
"epoch": 0.2171945701357466,
"grad_norm": 0.6176109313964844,
"learning_rate": 0.00017611940298507464,
"loss": 0.3171631097793579,
"mean_token_accuracy": 0.9242918938398361,
"num_tokens": 531241.0,
"step": 60
},
{
"entropy": 1.5508787035942078,
"epoch": 0.22081447963800904,
"grad_norm": 0.5896923542022705,
"learning_rate": 0.0001791044776119403,
"loss": 0.18115177750587463,
"mean_token_accuracy": 0.9481829404830933,
"num_tokens": 540149.0,
"step": 61
},
{
"entropy": 1.5782250761985779,
"epoch": 0.2244343891402715,
"grad_norm": 0.7554022669792175,
"learning_rate": 0.00018208955223880598,
"loss": 0.3728199303150177,
"mean_token_accuracy": 0.9055676311254501,
"num_tokens": 548927.0,
"step": 62
},
{
"entropy": 1.6092694103717804,
"epoch": 0.22805429864253393,
"grad_norm": 0.696146547794342,
"learning_rate": 0.00018507462686567165,
"loss": 0.49058157205581665,
"mean_token_accuracy": 0.8916458785533905,
"num_tokens": 557895.0,
"step": 63
},
{
"entropy": 1.5317207276821136,
"epoch": 0.2316742081447964,
"grad_norm": 0.6891468167304993,
"learning_rate": 0.00018805970149253734,
"loss": 0.36224162578582764,
"mean_token_accuracy": 0.9079867750406265,
"num_tokens": 567080.0,
"step": 64
},
{
"entropy": 1.5777660310268402,
"epoch": 0.23529411764705882,
"grad_norm": 0.6747457981109619,
"learning_rate": 0.00019104477611940298,
"loss": 0.34208086133003235,
"mean_token_accuracy": 0.9191138446331024,
"num_tokens": 576156.0,
"step": 65
},
{
"entropy": 1.579883337020874,
"epoch": 0.23891402714932128,
"grad_norm": 0.6088735461235046,
"learning_rate": 0.00019402985074626867,
"loss": 0.2968827188014984,
"mean_token_accuracy": 0.9238106161355972,
"num_tokens": 585109.0,
"step": 66
},
{
"entropy": 1.6180895566940308,
"epoch": 0.2425339366515837,
"grad_norm": 0.5592202544212341,
"learning_rate": 0.00019701492537313434,
"loss": 0.17197401821613312,
"mean_token_accuracy": 0.954246997833252,
"num_tokens": 593744.0,
"step": 67
},
{
"entropy": 1.5819954574108124,
"epoch": 0.24615384615384617,
"grad_norm": 0.6158381700515747,
"learning_rate": 0.0002,
"loss": 0.32391372323036194,
"mean_token_accuracy": 0.9132590889930725,
"num_tokens": 602817.0,
"step": 68
},
{
"entropy": 1.6124244332313538,
"epoch": 0.2497737556561086,
"grad_norm": 0.6058560013771057,
"learning_rate": 0.00019999990383005872,
"loss": 0.398790568113327,
"mean_token_accuracy": 0.9038740396499634,
"num_tokens": 611500.0,
"step": 69
},
{
"entropy": 1.5589013695716858,
"epoch": 0.25339366515837103,
"grad_norm": 0.5041698813438416,
"learning_rate": 0.00019999961532044045,
"loss": 0.24661695957183838,
"mean_token_accuracy": 0.9379201531410217,
"num_tokens": 620497.0,
"step": 70
},
{
"entropy": 1.539864867925644,
"epoch": 0.25701357466063346,
"grad_norm": 0.5611797571182251,
"learning_rate": 0.00019999913447176174,
"loss": 0.2731279730796814,
"mean_token_accuracy": 0.9369927495718002,
"num_tokens": 629538.0,
"step": 71
},
{
"entropy": 1.5526191294193268,
"epoch": 0.26063348416289595,
"grad_norm": 0.49492526054382324,
"learning_rate": 0.00019999846128505015,
"loss": 0.22361817955970764,
"mean_token_accuracy": 0.9443113952875137,
"num_tokens": 638389.0,
"step": 72
},
{
"entropy": 1.6069320440292358,
"epoch": 0.2642533936651584,
"grad_norm": 0.6465152502059937,
"learning_rate": 0.00019999759576174448,
"loss": 0.2783147692680359,
"mean_token_accuracy": 0.9260384887456894,
"num_tokens": 647129.0,
"step": 73
},
{
"entropy": 1.5199538469314575,
"epoch": 0.2678733031674208,
"grad_norm": 0.541709840297699,
"learning_rate": 0.00019999653790369438,
"loss": 0.23969395458698273,
"mean_token_accuracy": 0.9355704188346863,
"num_tokens": 655779.0,
"step": 74
},
{
"entropy": 1.4944458305835724,
"epoch": 0.27149321266968324,
"grad_norm": 0.5421354174613953,
"learning_rate": 0.00019999528771316057,
"loss": 0.23178081214427948,
"mean_token_accuracy": 0.9418981224298477,
"num_tokens": 664953.0,
"step": 75
},
{
"entropy": 1.4615215063095093,
"epoch": 0.2751131221719457,
"grad_norm": 0.5120413899421692,
"learning_rate": 0.00019999384519281494,
"loss": 0.21126341819763184,
"mean_token_accuracy": 0.9391632974147797,
"num_tokens": 674258.0,
"step": 76
},
{
"entropy": 1.5696524381637573,
"epoch": 0.27873303167420815,
"grad_norm": 0.6865569949150085,
"learning_rate": 0.00019999221034574028,
"loss": 0.31426694989204407,
"mean_token_accuracy": 0.9186421036720276,
"num_tokens": 682844.0,
"step": 77
},
{
"entropy": 1.4924792051315308,
"epoch": 0.2823529411764706,
"grad_norm": 0.6203171610832214,
"learning_rate": 0.00019999038317543036,
"loss": 0.22723156213760376,
"mean_token_accuracy": 0.9455482661724091,
"num_tokens": 692032.0,
"step": 78
},
{
"entropy": 1.5375191569328308,
"epoch": 0.285972850678733,
"grad_norm": 0.5844593048095703,
"learning_rate": 0.00019998836368579013,
"loss": 0.3107585310935974,
"mean_token_accuracy": 0.9372987002134323,
"num_tokens": 700940.0,
"step": 79
},
{
"entropy": 1.542263776063919,
"epoch": 0.2895927601809955,
"grad_norm": 0.6159414649009705,
"learning_rate": 0.00019998615188113547,
"loss": 0.20322281122207642,
"mean_token_accuracy": 0.9484172016382217,
"num_tokens": 709374.0,
"step": 80
},
{
"entropy": 1.566244751214981,
"epoch": 0.29321266968325793,
"grad_norm": 0.7167544960975647,
"learning_rate": 0.00019998374776619316,
"loss": 0.289408415555954,
"mean_token_accuracy": 0.9302106499671936,
"num_tokens": 718254.0,
"step": 81
},
{
"entropy": 1.5220647156238556,
"epoch": 0.29683257918552036,
"grad_norm": 0.5164801478385925,
"learning_rate": 0.0001999811513461012,
"loss": 0.16729353368282318,
"mean_token_accuracy": 0.961905837059021,
"num_tokens": 726892.0,
"step": 82
},
{
"entropy": 1.4655065834522247,
"epoch": 0.3004524886877828,
"grad_norm": 0.7785144448280334,
"learning_rate": 0.00019997836262640825,
"loss": 0.2753179669380188,
"mean_token_accuracy": 0.9345895648002625,
"num_tokens": 736144.0,
"step": 83
},
{
"entropy": 1.5420578122138977,
"epoch": 0.3040723981900452,
"grad_norm": 0.7086989879608154,
"learning_rate": 0.00019997538161307425,
"loss": 0.19912396371364594,
"mean_token_accuracy": 0.948458805680275,
"num_tokens": 744947.0,
"step": 84
},
{
"entropy": 1.5529279112815857,
"epoch": 0.3076923076923077,
"grad_norm": 0.6447221040725708,
"learning_rate": 0.00019997220831246987,
"loss": 0.19798173010349274,
"mean_token_accuracy": 0.9535399079322815,
"num_tokens": 754040.0,
"step": 85
},
{
"entropy": 1.5264079570770264,
"epoch": 0.31131221719457014,
"grad_norm": 0.6189771890640259,
"learning_rate": 0.00019996884273137686,
"loss": 0.20467980206012726,
"mean_token_accuracy": 0.9415531605482101,
"num_tokens": 763365.0,
"step": 86
},
{
"entropy": 1.751158595085144,
"epoch": 0.31493212669683257,
"grad_norm": 0.7291711568832397,
"learning_rate": 0.0001999652848769878,
"loss": 0.24249613285064697,
"mean_token_accuracy": 0.9340829253196716,
"num_tokens": 771408.0,
"step": 87
},
{
"entropy": 1.6374999582767487,
"epoch": 0.318552036199095,
"grad_norm": 0.6003106236457825,
"learning_rate": 0.00019996153475690623,
"loss": 0.3658824861049652,
"mean_token_accuracy": 0.918518453836441,
"num_tokens": 780637.0,
"step": 88
},
{
"entropy": 1.5862501561641693,
"epoch": 0.3221719457013575,
"grad_norm": 0.6139584183692932,
"learning_rate": 0.00019995759237914656,
"loss": 0.29145702719688416,
"mean_token_accuracy": 0.9236557334661484,
"num_tokens": 789741.0,
"step": 89
},
{
"entropy": 1.67588272690773,
"epoch": 0.3257918552036199,
"grad_norm": 0.5298960208892822,
"learning_rate": 0.0001999534577521341,
"loss": 0.19036075472831726,
"mean_token_accuracy": 0.9556817710399628,
"num_tokens": 798373.0,
"step": 90
},
{
"entropy": 1.6315190196037292,
"epoch": 0.32941176470588235,
"grad_norm": 0.5074024796485901,
"learning_rate": 0.00019994913088470498,
"loss": 0.22543872892856598,
"mean_token_accuracy": 0.9336265623569489,
"num_tokens": 807267.0,
"step": 91
},
{
"entropy": 1.5913198590278625,
"epoch": 0.3330316742081448,
"grad_norm": 0.7284563779830933,
"learning_rate": 0.00019994461178610617,
"loss": 0.24927280843257904,
"mean_token_accuracy": 0.9343689233064651,
"num_tokens": 816046.0,
"step": 92
},
{
"entropy": 1.6069969236850739,
"epoch": 0.33665158371040727,
"grad_norm": 0.7105880379676819,
"learning_rate": 0.00019993990046599555,
"loss": 0.295354962348938,
"mean_token_accuracy": 0.9300834238529205,
"num_tokens": 824773.0,
"step": 93
},
{
"entropy": 1.6023050248622894,
"epoch": 0.3402714932126697,
"grad_norm": 0.5128775238990784,
"learning_rate": 0.00019993499693444168,
"loss": 0.13714508712291718,
"mean_token_accuracy": 0.9659775942564011,
"num_tokens": 833307.0,
"step": 94
},
{
"entropy": 1.47949880361557,
"epoch": 0.3438914027149321,
"grad_norm": 0.4838848114013672,
"learning_rate": 0.00019992990120192393,
"loss": 0.1908133178949356,
"mean_token_accuracy": 0.9466118365526199,
"num_tokens": 842176.0,
"step": 95
},
{
"entropy": 1.5162540972232819,
"epoch": 0.34751131221719456,
"grad_norm": 0.4756297767162323,
"learning_rate": 0.00019992461327933252,
"loss": 0.14188416302204132,
"mean_token_accuracy": 0.9609939008951187,
"num_tokens": 851079.0,
"step": 96
},
{
"entropy": 1.5233789086341858,
"epoch": 0.351131221719457,
"grad_norm": 0.5846927762031555,
"learning_rate": 0.00019991913317796825,
"loss": 0.35901975631713867,
"mean_token_accuracy": 0.9176534414291382,
"num_tokens": 860190.0,
"step": 97
},
{
"entropy": 1.488583117723465,
"epoch": 0.3547511312217195,
"grad_norm": 0.5157149434089661,
"learning_rate": 0.00019991346090954268,
"loss": 0.1544593721628189,
"mean_token_accuracy": 0.9559026509523392,
"num_tokens": 869107.0,
"step": 98
},
{
"entropy": 1.4912174940109253,
"epoch": 0.3583710407239819,
"grad_norm": 0.6246358156204224,
"learning_rate": 0.00019990759648617814,
"loss": 0.2815958857536316,
"mean_token_accuracy": 0.9374883323907852,
"num_tokens": 877732.0,
"step": 99
},
{
"entropy": 1.4868848025798798,
"epoch": 0.36199095022624433,
"grad_norm": 0.502863883972168,
"learning_rate": 0.0001999015399204075,
"loss": 0.15397809445858002,
"mean_token_accuracy": 0.9577479511499405,
"num_tokens": 886494.0,
"step": 100
},
{
"entropy": 1.4532659351825714,
"epoch": 0.36561085972850677,
"grad_norm": 0.4709261655807495,
"learning_rate": 0.0001998952912251743,
"loss": 0.16087600588798523,
"mean_token_accuracy": 0.9519130736589432,
"num_tokens": 895455.0,
"step": 101
},
{
"entropy": 1.491580843925476,
"epoch": 0.36923076923076925,
"grad_norm": 0.5536823868751526,
"learning_rate": 0.0001998888504138327,
"loss": 0.16084736585617065,
"mean_token_accuracy": 0.960470125079155,
"num_tokens": 904392.0,
"step": 102
},
{
"entropy": 1.4420756101608276,
"epoch": 0.3728506787330317,
"grad_norm": 0.5427432060241699,
"learning_rate": 0.00019988221750014747,
"loss": 0.2450316995382309,
"mean_token_accuracy": 0.9366131573915482,
"num_tokens": 914065.0,
"step": 103
},
{
"entropy": 1.5350265502929688,
"epoch": 0.3764705882352941,
"grad_norm": 0.4882596731185913,
"learning_rate": 0.00019987539249829381,
"loss": 0.21313555538654327,
"mean_token_accuracy": 0.9590773284435272,
"num_tokens": 922687.0,
"step": 104
},
{
"entropy": 1.5769560635089874,
"epoch": 0.38009049773755654,
"grad_norm": 0.7525632977485657,
"learning_rate": 0.0001998683754228575,
"loss": 0.2643919289112091,
"mean_token_accuracy": 0.9193268120288849,
"num_tokens": 930932.0,
"step": 105
},
{
"entropy": 1.5345399081707,
"epoch": 0.38371040723981903,
"grad_norm": 0.6206967234611511,
"learning_rate": 0.00019986116628883485,
"loss": 0.2925248146057129,
"mean_token_accuracy": 0.931127279996872,
"num_tokens": 939846.0,
"step": 106
},
{
"entropy": 1.5490939021110535,
"epoch": 0.38733031674208146,
"grad_norm": 0.5089172124862671,
"learning_rate": 0.00019985376511163255,
"loss": 0.14840808510780334,
"mean_token_accuracy": 0.9644817113876343,
"num_tokens": 948368.0,
"step": 107
},
{
"entropy": 1.5281821191310883,
"epoch": 0.3909502262443439,
"grad_norm": 0.4952821135520935,
"learning_rate": 0.00019984617190706768,
"loss": 0.21237969398498535,
"mean_token_accuracy": 0.9380226731300354,
"num_tokens": 957250.0,
"step": 108
},
{
"entropy": 1.5303342044353485,
"epoch": 0.3945701357466063,
"grad_norm": 0.5427054166793823,
"learning_rate": 0.00019983838669136782,
"loss": 0.16821178793907166,
"mean_token_accuracy": 0.9463026076555252,
"num_tokens": 965749.0,
"step": 109
},
{
"entropy": 1.5286442935466766,
"epoch": 0.39819004524886875,
"grad_norm": 0.4814460873603821,
"learning_rate": 0.00019983040948117078,
"loss": 0.21669423580169678,
"mean_token_accuracy": 0.9485635608434677,
"num_tokens": 973995.0,
"step": 110
},
{
"entropy": 1.5193851590156555,
"epoch": 0.40180995475113124,
"grad_norm": 0.4849177300930023,
"learning_rate": 0.00019982224029352477,
"loss": 0.1973811537027359,
"mean_token_accuracy": 0.9545295536518097,
"num_tokens": 982514.0,
"step": 111
},
{
"entropy": 1.5222062468528748,
"epoch": 0.40542986425339367,
"grad_norm": 0.5772113800048828,
"learning_rate": 0.00019981387914588822,
"loss": 0.357006311416626,
"mean_token_accuracy": 0.9294342249631882,
"num_tokens": 991115.0,
"step": 112
},
{
"entropy": 1.4591252207756042,
"epoch": 0.4090497737556561,
"grad_norm": 0.40475624799728394,
"learning_rate": 0.00019980532605612985,
"loss": 0.17911836504936218,
"mean_token_accuracy": 0.958268016576767,
"num_tokens": 1000407.0,
"step": 113
},
{
"entropy": 1.5297361612319946,
"epoch": 0.41266968325791853,
"grad_norm": 0.5671453475952148,
"learning_rate": 0.0001997965810425285,
"loss": 0.14860941469669342,
"mean_token_accuracy": 0.9657443910837173,
"num_tokens": 1008486.0,
"step": 114
},
{
"entropy": 1.4873208403587341,
"epoch": 0.416289592760181,
"grad_norm": 0.47243061661720276,
"learning_rate": 0.0001997876441237733,
"loss": 0.27150753140449524,
"mean_token_accuracy": 0.9356586933135986,
"num_tokens": 1017408.0,
"step": 115
},
{
"entropy": 1.5046488046646118,
"epoch": 0.41990950226244345,
"grad_norm": 0.5545538067817688,
"learning_rate": 0.00019977851531896335,
"loss": 0.27598193287849426,
"mean_token_accuracy": 0.9293323308229446,
"num_tokens": 1026279.0,
"step": 116
},
{
"entropy": 1.5389924347400665,
"epoch": 0.4235294117647059,
"grad_norm": 0.5456140637397766,
"learning_rate": 0.00019976919464760793,
"loss": 0.20166276395320892,
"mean_token_accuracy": 0.9474701136350632,
"num_tokens": 1034896.0,
"step": 117
},
{
"entropy": 1.5261160135269165,
"epoch": 0.4271493212669683,
"grad_norm": 0.4349948763847351,
"learning_rate": 0.00019975968212962637,
"loss": 0.11712481081485748,
"mean_token_accuracy": 0.9618890285491943,
"num_tokens": 1043357.0,
"step": 118
},
{
"entropy": 1.4879018366336823,
"epoch": 0.4307692307692308,
"grad_norm": 0.6200428009033203,
"learning_rate": 0.00019974997778534793,
"loss": 0.2552901804447174,
"mean_token_accuracy": 0.931525394320488,
"num_tokens": 1051617.0,
"step": 119
},
{
"entropy": 1.495672345161438,
"epoch": 0.4343891402714932,
"grad_norm": 0.5531431436538696,
"learning_rate": 0.0001997400816355119,
"loss": 0.18440331518650055,
"mean_token_accuracy": 0.9566466957330704,
"num_tokens": 1060297.0,
"step": 120
},
{
"entropy": 1.468797743320465,
"epoch": 0.43800904977375565,
"grad_norm": 0.8418586254119873,
"learning_rate": 0.00019972999370126737,
"loss": 0.4476836621761322,
"mean_token_accuracy": 0.9020169228315353,
"num_tokens": 1069625.0,
"step": 121
},
{
"entropy": 1.5040291845798492,
"epoch": 0.4416289592760181,
"grad_norm": 0.5432603359222412,
"learning_rate": 0.00019971971400417342,
"loss": 0.23491275310516357,
"mean_token_accuracy": 0.938463494181633,
"num_tokens": 1078541.0,
"step": 122
},
{
"entropy": 1.507046639919281,
"epoch": 0.4452488687782805,
"grad_norm": 0.515389621257782,
"learning_rate": 0.00019970924256619888,
"loss": 0.19066768884658813,
"mean_token_accuracy": 0.9394121021032333,
"num_tokens": 1087015.0,
"step": 123
},
{
"entropy": 1.5334805250167847,
"epoch": 0.448868778280543,
"grad_norm": 0.6119932532310486,
"learning_rate": 0.00019969857940972235,
"loss": 0.2432323843240738,
"mean_token_accuracy": 0.9420388340950012,
"num_tokens": 1095644.0,
"step": 124
},
{
"entropy": 1.451358139514923,
"epoch": 0.45248868778280543,
"grad_norm": 0.513521134853363,
"learning_rate": 0.00019968772455753218,
"loss": 0.21337461471557617,
"mean_token_accuracy": 0.9473854750394821,
"num_tokens": 1104652.0,
"step": 125
},
{
"entropy": 1.403743416070938,
"epoch": 0.45610859728506786,
"grad_norm": 0.44188353419303894,
"learning_rate": 0.00019967667803282637,
"loss": 0.21201607584953308,
"mean_token_accuracy": 0.9484260380268097,
"num_tokens": 1113648.0,
"step": 126
},
{
"entropy": 1.4469349682331085,
"epoch": 0.4597285067873303,
"grad_norm": 0.5705162286758423,
"learning_rate": 0.00019966543985921258,
"loss": 0.18666209280490875,
"mean_token_accuracy": 0.9515744149684906,
"num_tokens": 1122645.0,
"step": 127
},
{
"entropy": 1.3682746887207031,
"epoch": 0.4633484162895928,
"grad_norm": 0.49191927909851074,
"learning_rate": 0.000199654010060708,
"loss": 0.29679813981056213,
"mean_token_accuracy": 0.9304109215736389,
"num_tokens": 1132041.0,
"step": 128
},
{
"entropy": 1.4161922633647919,
"epoch": 0.4669683257918552,
"grad_norm": 0.47388190031051636,
"learning_rate": 0.00019964238866173933,
"loss": 0.1531015932559967,
"mean_token_accuracy": 0.9585694819688797,
"num_tokens": 1140871.0,
"step": 129
},
{
"entropy": 1.3751187920570374,
"epoch": 0.47058823529411764,
"grad_norm": 0.46824195981025696,
"learning_rate": 0.00019963057568714288,
"loss": 0.17030593752861023,
"mean_token_accuracy": 0.9534722566604614,
"num_tokens": 1149706.0,
"step": 130
},
{
"entropy": 1.3917218148708344,
"epoch": 0.47420814479638007,
"grad_norm": 0.4428865909576416,
"learning_rate": 0.00019961857116216415,
"loss": 0.15017299354076385,
"mean_token_accuracy": 0.9592516124248505,
"num_tokens": 1158749.0,
"step": 131
},
{
"entropy": 1.3426124155521393,
"epoch": 0.47782805429864256,
"grad_norm": 0.49072983860969543,
"learning_rate": 0.00019960637511245823,
"loss": 0.22365210950374603,
"mean_token_accuracy": 0.946484237909317,
"num_tokens": 1167772.0,
"step": 132
},
{
"entropy": 1.421400249004364,
"epoch": 0.481447963800905,
"grad_norm": 0.5675694346427917,
"learning_rate": 0.00019959398756408937,
"loss": 0.18156108260154724,
"mean_token_accuracy": 0.9530180990695953,
"num_tokens": 1176196.0,
"step": 133
},
{
"entropy": 1.3424357175827026,
"epoch": 0.4850678733031674,
"grad_norm": 0.5297194123268127,
"learning_rate": 0.0001995814085435311,
"loss": 0.18270625174045563,
"mean_token_accuracy": 0.9406759738922119,
"num_tokens": 1185613.0,
"step": 134
},
{
"entropy": 1.4340205192565918,
"epoch": 0.48868778280542985,
"grad_norm": 0.5901919603347778,
"learning_rate": 0.00019956863807766618,
"loss": 0.1788717657327652,
"mean_token_accuracy": 0.9499698132276535,
"num_tokens": 1193890.0,
"step": 135
},
{
"entropy": 1.4506348073482513,
"epoch": 0.49230769230769234,
"grad_norm": 0.5080121159553528,
"learning_rate": 0.00019955567619378653,
"loss": 0.12078559398651123,
"mean_token_accuracy": 0.9667535722255707,
"num_tokens": 1202125.0,
"step": 136
},
{
"entropy": 1.382477194070816,
"epoch": 0.49592760180995477,
"grad_norm": 0.659888744354248,
"learning_rate": 0.00019954252291959313,
"loss": 0.2572862505912781,
"mean_token_accuracy": 0.9361128658056259,
"num_tokens": 1210985.0,
"step": 137
},
{
"entropy": 1.4523886442184448,
"epoch": 0.4995475113122172,
"grad_norm": 0.6076639890670776,
"learning_rate": 0.00019952917828319587,
"loss": 0.1389123499393463,
"mean_token_accuracy": 0.9607612937688828,
"num_tokens": 1219414.0,
"step": 138
},
{
"entropy": 1.4372033476829529,
"epoch": 0.5031674208144796,
"grad_norm": 0.6114508509635925,
"learning_rate": 0.00019951564231311382,
"loss": 0.16256970167160034,
"mean_token_accuracy": 0.9580988585948944,
"num_tokens": 1227819.0,
"step": 139
},
{
"entropy": 1.4240647554397583,
"epoch": 0.5067873303167421,
"grad_norm": 0.5607258677482605,
"learning_rate": 0.00019950191503827477,
"loss": 0.1742154359817505,
"mean_token_accuracy": 0.9604121297597885,
"num_tokens": 1236815.0,
"step": 140
},
{
"entropy": 1.3567279279232025,
"epoch": 0.5104072398190045,
"grad_norm": 0.49631235003471375,
"learning_rate": 0.00019948799648801546,
"loss": 0.1200169026851654,
"mean_token_accuracy": 0.9611728638410568,
"num_tokens": 1246290.0,
"step": 141
},
{
"entropy": 1.506151258945465,
"epoch": 0.5140271493212669,
"grad_norm": 0.9381592273712158,
"learning_rate": 0.0001994738866920813,
"loss": 0.24524807929992676,
"mean_token_accuracy": 0.9364699125289917,
"num_tokens": 1254437.0,
"step": 142
},
{
"entropy": 1.4939132928848267,
"epoch": 0.5176470588235295,
"grad_norm": 0.4674718976020813,
"learning_rate": 0.00019945958568062656,
"loss": 0.22789186239242554,
"mean_token_accuracy": 0.9441357254981995,
"num_tokens": 1263451.0,
"step": 143
},
{
"entropy": 1.501807302236557,
"epoch": 0.5212669683257919,
"grad_norm": 0.5704030394554138,
"learning_rate": 0.00019944509348421394,
"loss": 0.18432818353176117,
"mean_token_accuracy": 0.953639954328537,
"num_tokens": 1272204.0,
"step": 144
},
{
"entropy": 1.4492950141429901,
"epoch": 0.5248868778280543,
"grad_norm": 0.47559797763824463,
"learning_rate": 0.000199430410133815,
"loss": 0.147740438580513,
"mean_token_accuracy": 0.9607143253087997,
"num_tokens": 1281403.0,
"step": 145
},
{
"entropy": 1.4073392152786255,
"epoch": 0.5285067873303168,
"grad_norm": 0.6094549298286438,
"learning_rate": 0.00019941553566080956,
"loss": 0.2326284945011139,
"mean_token_accuracy": 0.9408996403217316,
"num_tokens": 1290379.0,
"step": 146
},
{
"entropy": 1.507258117198944,
"epoch": 0.5321266968325792,
"grad_norm": 0.6971456408500671,
"learning_rate": 0.00019940047009698605,
"loss": 0.2862337529659271,
"mean_token_accuracy": 0.9211678206920624,
"num_tokens": 1298939.0,
"step": 147
},
{
"entropy": 1.4335385262966156,
"epoch": 0.5357466063348416,
"grad_norm": 0.5851684808731079,
"learning_rate": 0.00019938521347454127,
"loss": 0.29994359612464905,
"mean_token_accuracy": 0.9208105802536011,
"num_tokens": 1307833.0,
"step": 148
},
{
"entropy": 1.489009529352188,
"epoch": 0.539366515837104,
"grad_norm": 0.7157009840011597,
"learning_rate": 0.00019936976582608023,
"loss": 0.33578288555145264,
"mean_token_accuracy": 0.9373409301042557,
"num_tokens": 1316684.0,
"step": 149
},
{
"entropy": 1.4718311429023743,
"epoch": 0.5429864253393665,
"grad_norm": 0.5213670134544373,
"learning_rate": 0.00019935412718461625,
"loss": 0.13324445486068726,
"mean_token_accuracy": 0.9562166035175323,
"num_tokens": 1325251.0,
"step": 150
},
{
"entropy": 1.4213023483753204,
"epoch": 0.5466063348416289,
"grad_norm": 0.5317531824111938,
"learning_rate": 0.0001993382975835709,
"loss": 0.15537622570991516,
"mean_token_accuracy": 0.964701920747757,
"num_tokens": 1333742.0,
"step": 151
},
{
"entropy": 1.3925495147705078,
"epoch": 0.5502262443438914,
"grad_norm": 0.4998919665813446,
"learning_rate": 0.00019932227705677372,
"loss": 0.1906745433807373,
"mean_token_accuracy": 0.9361033141613007,
"num_tokens": 1342670.0,
"step": 152
},
{
"entropy": 1.3675826787948608,
"epoch": 0.5538461538461539,
"grad_norm": 0.3895116448402405,
"learning_rate": 0.00019930606563846234,
"loss": 0.14962267875671387,
"mean_token_accuracy": 0.9600752294063568,
"num_tokens": 1351651.0,
"step": 153
},
{
"entropy": 1.2704340815544128,
"epoch": 0.5574660633484163,
"grad_norm": 0.43442079424858093,
"learning_rate": 0.0001992896633632823,
"loss": 0.1176116019487381,
"mean_token_accuracy": 0.9697213470935822,
"num_tokens": 1360957.0,
"step": 154
},
{
"entropy": 1.349033147096634,
"epoch": 0.5610859728506787,
"grad_norm": 0.6015726923942566,
"learning_rate": 0.00019927307026628715,
"loss": 0.17969676852226257,
"mean_token_accuracy": 0.9535713642835617,
"num_tokens": 1369448.0,
"step": 155
},
{
"entropy": 1.3353968262672424,
"epoch": 0.5647058823529412,
"grad_norm": 0.5272573828697205,
"learning_rate": 0.00019925628638293815,
"loss": 0.21488747000694275,
"mean_token_accuracy": 0.942707359790802,
"num_tokens": 1378342.0,
"step": 156
},
{
"entropy": 1.3764784634113312,
"epoch": 0.5683257918552036,
"grad_norm": 0.7961441278457642,
"learning_rate": 0.00019923931174910421,
"loss": 0.30012083053588867,
"mean_token_accuracy": 0.9248047173023224,
"num_tokens": 1386635.0,
"step": 157
},
{
"entropy": 1.4050418436527252,
"epoch": 0.571945701357466,
"grad_norm": 0.6271554231643677,
"learning_rate": 0.00019922214640106207,
"loss": 0.184654101729393,
"mean_token_accuracy": 0.9567630439996719,
"num_tokens": 1394801.0,
"step": 158
},
{
"entropy": 1.3368881344795227,
"epoch": 0.5755656108597285,
"grad_norm": 0.5737248063087463,
"learning_rate": 0.00019920479037549595,
"loss": 0.2726176977157593,
"mean_token_accuracy": 0.9449877142906189,
"num_tokens": 1403634.0,
"step": 159
},
{
"entropy": 1.3764293193817139,
"epoch": 0.579185520361991,
"grad_norm": 0.6995347142219543,
"learning_rate": 0.00019918724370949754,
"loss": 0.2793852686882019,
"mean_token_accuracy": 0.9334268867969513,
"num_tokens": 1412184.0,
"step": 160
},
{
"entropy": 1.3516182601451874,
"epoch": 0.5828054298642534,
"grad_norm": 0.6867108941078186,
"learning_rate": 0.00019916950644056607,
"loss": 0.23619085550308228,
"mean_token_accuracy": 0.9341517090797424,
"num_tokens": 1421258.0,
"step": 161
},
{
"entropy": 1.3733141720294952,
"epoch": 0.5864253393665159,
"grad_norm": 0.6309821009635925,
"learning_rate": 0.00019915157860660797,
"loss": 0.27165842056274414,
"mean_token_accuracy": 0.9349307119846344,
"num_tokens": 1430376.0,
"step": 162
},
{
"entropy": 1.421423226594925,
"epoch": 0.5900452488687783,
"grad_norm": 0.512278139591217,
"learning_rate": 0.000199133460245937,
"loss": 0.09708991646766663,
"mean_token_accuracy": 0.9740542620420456,
"num_tokens": 1439210.0,
"step": 163
},
{
"entropy": 1.4290501475334167,
"epoch": 0.5936651583710407,
"grad_norm": 0.54677814245224,
"learning_rate": 0.0001991151513972741,
"loss": 0.1681460738182068,
"mean_token_accuracy": 0.9526286870241165,
"num_tokens": 1447909.0,
"step": 164
},
{
"entropy": 1.4586975574493408,
"epoch": 0.5972850678733032,
"grad_norm": 0.5319063067436218,
"learning_rate": 0.00019909665209974723,
"loss": 0.27126502990722656,
"mean_token_accuracy": 0.9449646919965744,
"num_tokens": 1456865.0,
"step": 165
},
{
"entropy": 1.291123479604721,
"epoch": 0.6009049773755656,
"grad_norm": 0.41675442457199097,
"learning_rate": 0.00019907796239289154,
"loss": 0.14705216884613037,
"mean_token_accuracy": 0.957685187458992,
"num_tokens": 1466501.0,
"step": 166
},
{
"entropy": 1.4526861608028412,
"epoch": 0.604524886877828,
"grad_norm": 0.6517218351364136,
"learning_rate": 0.0001990590823166489,
"loss": 0.18804579973220825,
"mean_token_accuracy": 0.95452880859375,
"num_tokens": 1475217.0,
"step": 167
},
{
"entropy": 1.4182425439357758,
"epoch": 0.6081447963800904,
"grad_norm": 0.44628605246543884,
"learning_rate": 0.0001990400119113681,
"loss": 0.16652102768421173,
"mean_token_accuracy": 0.9495290070772171,
"num_tokens": 1484272.0,
"step": 168
},
{
"entropy": 1.395858347415924,
"epoch": 0.611764705882353,
"grad_norm": 0.5252248048782349,
"learning_rate": 0.00019902075121780473,
"loss": 0.204986572265625,
"mean_token_accuracy": 0.9546073526144028,
"num_tokens": 1493197.0,
"step": 169
},
{
"entropy": 1.437942624092102,
"epoch": 0.6153846153846154,
"grad_norm": 0.574350893497467,
"learning_rate": 0.00019900130027712099,
"loss": 0.2608497142791748,
"mean_token_accuracy": 0.9362770020961761,
"num_tokens": 1502113.0,
"step": 170
},
{
"entropy": 1.4258457124233246,
"epoch": 0.6190045248868778,
"grad_norm": 0.5768272876739502,
"learning_rate": 0.00019898165913088568,
"loss": 0.2692157030105591,
"mean_token_accuracy": 0.9251071363687515,
"num_tokens": 1510830.0,
"step": 171
},
{
"entropy": 1.4638590514659882,
"epoch": 0.6226244343891403,
"grad_norm": 0.5094544291496277,
"learning_rate": 0.00019896182782107408,
"loss": 0.11549721658229828,
"mean_token_accuracy": 0.969596341252327,
"num_tokens": 1519258.0,
"step": 172
},
{
"entropy": 1.4395278096199036,
"epoch": 0.6262443438914027,
"grad_norm": 0.5421784520149231,
"learning_rate": 0.00019894180639006787,
"loss": 0.20283782482147217,
"mean_token_accuracy": 0.9391360431909561,
"num_tokens": 1527863.0,
"step": 173
},
{
"entropy": 1.3791865408420563,
"epoch": 0.6298642533936651,
"grad_norm": 0.5409896373748779,
"learning_rate": 0.00019892159488065506,
"loss": 0.1997358500957489,
"mean_token_accuracy": 0.9525162279605865,
"num_tokens": 1536957.0,
"step": 174
},
{
"entropy": 1.4018727838993073,
"epoch": 0.6334841628959276,
"grad_norm": 0.6673674583435059,
"learning_rate": 0.00019890119333602988,
"loss": 0.1960916519165039,
"mean_token_accuracy": 0.9513887315988541,
"num_tokens": 1545655.0,
"step": 175
},
{
"entropy": 1.45829838514328,
"epoch": 0.63710407239819,
"grad_norm": 0.4443865716457367,
"learning_rate": 0.00019888060179979266,
"loss": 0.19763894379138947,
"mean_token_accuracy": 0.9526630192995071,
"num_tokens": 1554210.0,
"step": 176
},
{
"entropy": 1.5121627151966095,
"epoch": 0.6407239819004525,
"grad_norm": 0.5975261926651001,
"learning_rate": 0.00019885982031594973,
"loss": 0.21303991973400116,
"mean_token_accuracy": 0.9420748949050903,
"num_tokens": 1562730.0,
"step": 177
},
{
"entropy": 1.375084936618805,
"epoch": 0.644343891402715,
"grad_norm": 0.457962304353714,
"learning_rate": 0.00019883884892891348,
"loss": 0.1408441662788391,
"mean_token_accuracy": 0.967100590467453,
"num_tokens": 1571700.0,
"step": 178
},
{
"entropy": 1.541441649198532,
"epoch": 0.6479638009049774,
"grad_norm": 0.6157453060150146,
"learning_rate": 0.000198817687683502,
"loss": 0.0904114842414856,
"mean_token_accuracy": 0.9715709984302521,
"num_tokens": 1579817.0,
"step": 179
},
{
"entropy": 1.5142415463924408,
"epoch": 0.6515837104072398,
"grad_norm": 0.8100699186325073,
"learning_rate": 0.0001987963366249392,
"loss": 0.398171067237854,
"mean_token_accuracy": 0.9328930824995041,
"num_tokens": 1588732.0,
"step": 180
},
{
"entropy": 1.4462586343288422,
"epoch": 0.6552036199095023,
"grad_norm": 0.44163593649864197,
"learning_rate": 0.0001987747957988547,
"loss": 0.12847575545310974,
"mean_token_accuracy": 0.961458757519722,
"num_tokens": 1598066.0,
"step": 181
},
{
"entropy": 1.4784432351589203,
"epoch": 0.6588235294117647,
"grad_norm": 0.6655211448669434,
"learning_rate": 0.00019875306525128354,
"loss": 0.22915330529212952,
"mean_token_accuracy": 0.9353652149438858,
"num_tokens": 1606976.0,
"step": 182
},
{
"entropy": 1.5425707399845123,
"epoch": 0.6624434389140271,
"grad_norm": 0.537034273147583,
"learning_rate": 0.00019873114502866633,
"loss": 0.13760660588741302,
"mean_token_accuracy": 0.9617000967264175,
"num_tokens": 1615552.0,
"step": 183
},
{
"entropy": 1.317031353712082,
"epoch": 0.6660633484162896,
"grad_norm": 0.4945172667503357,
"learning_rate": 0.00019870903517784898,
"loss": 0.22668133676052094,
"mean_token_accuracy": 0.935705840587616,
"num_tokens": 1625671.0,
"step": 184
},
{
"entropy": 1.4837210774421692,
"epoch": 0.669683257918552,
"grad_norm": 0.4989132285118103,
"learning_rate": 0.00019868673574608266,
"loss": 0.1490660309791565,
"mean_token_accuracy": 0.9522654563188553,
"num_tokens": 1634216.0,
"step": 185
},
{
"entropy": 1.5495317876338959,
"epoch": 0.6733031674208145,
"grad_norm": 0.8123180270195007,
"learning_rate": 0.0001986642467810237,
"loss": 0.17341700196266174,
"mean_token_accuracy": 0.9526006877422333,
"num_tokens": 1642342.0,
"step": 186
},
{
"entropy": 1.4706730842590332,
"epoch": 0.676923076923077,
"grad_norm": 0.7836541533470154,
"learning_rate": 0.00019864156833073352,
"loss": 0.2447955161333084,
"mean_token_accuracy": 0.9472227543592453,
"num_tokens": 1650719.0,
"step": 187
},
{
"entropy": 1.436569631099701,
"epoch": 0.6805429864253394,
"grad_norm": 0.4677812159061432,
"learning_rate": 0.00019861870044367844,
"loss": 0.11487612128257751,
"mean_token_accuracy": 0.9670159816741943,
"num_tokens": 1659875.0,
"step": 188
},
{
"entropy": 1.5288162529468536,
"epoch": 0.6841628959276018,
"grad_norm": 0.5686696171760559,
"learning_rate": 0.0001985956431687296,
"loss": 0.16770240664482117,
"mean_token_accuracy": 0.9642325341701508,
"num_tokens": 1668729.0,
"step": 189
},
{
"entropy": 1.4523391425609589,
"epoch": 0.6877828054298643,
"grad_norm": 0.7280884981155396,
"learning_rate": 0.00019857239655516302,
"loss": 0.3095542788505554,
"mean_token_accuracy": 0.9423863142728806,
"num_tokens": 1677528.0,
"step": 190
},
{
"entropy": 1.454990178346634,
"epoch": 0.6914027149321267,
"grad_norm": 0.4571734070777893,
"learning_rate": 0.0001985489606526592,
"loss": 0.09302312880754471,
"mean_token_accuracy": 0.9739308208227158,
"num_tokens": 1686546.0,
"step": 191
},
{
"entropy": 1.3993627727031708,
"epoch": 0.6950226244343891,
"grad_norm": 0.42863190174102783,
"learning_rate": 0.00019852533551130324,
"loss": 0.17399480938911438,
"mean_token_accuracy": 0.9535701423883438,
"num_tokens": 1695918.0,
"step": 192
},
{
"entropy": 1.4340824484825134,
"epoch": 0.6986425339366515,
"grad_norm": 0.4958471357822418,
"learning_rate": 0.00019850152118158472,
"loss": 0.3563914895057678,
"mean_token_accuracy": 0.9297028332948685,
"num_tokens": 1705739.0,
"step": 193
},
{
"entropy": 1.5259247124195099,
"epoch": 0.702262443438914,
"grad_norm": 0.6084112524986267,
"learning_rate": 0.00019847751771439738,
"loss": 0.1985940933227539,
"mean_token_accuracy": 0.9563599675893784,
"num_tokens": 1714371.0,
"step": 194
},
{
"entropy": 1.5039476454257965,
"epoch": 0.7058823529411765,
"grad_norm": 0.49471357464790344,
"learning_rate": 0.00019845332516103933,
"loss": 0.10389607399702072,
"mean_token_accuracy": 0.9768806099891663,
"num_tokens": 1722649.0,
"step": 195
},
{
"entropy": 1.4451472461223602,
"epoch": 0.709502262443439,
"grad_norm": 0.46219033002853394,
"learning_rate": 0.0001984289435732127,
"loss": 0.13712066411972046,
"mean_token_accuracy": 0.9715917259454727,
"num_tokens": 1731453.0,
"step": 196
},
{
"entropy": 1.4479835629463196,
"epoch": 0.7131221719457014,
"grad_norm": 0.4487977921962738,
"learning_rate": 0.00019840437300302366,
"loss": 0.08234203606843948,
"mean_token_accuracy": 0.9709072560071945,
"num_tokens": 1740339.0,
"step": 197
},
{
"entropy": 1.4245754480361938,
"epoch": 0.7167420814479638,
"grad_norm": 0.5051844120025635,
"learning_rate": 0.00019837961350298213,
"loss": 0.12205886840820312,
"mean_token_accuracy": 0.9654085338115692,
"num_tokens": 1748827.0,
"step": 198
},
{
"entropy": 1.5265796482563019,
"epoch": 0.7203619909502262,
"grad_norm": 0.7977287173271179,
"learning_rate": 0.00019835466512600197,
"loss": 0.23942159116268158,
"mean_token_accuracy": 0.9452069103717804,
"num_tokens": 1757208.0,
"step": 199
},
{
"entropy": 1.4634148180484772,
"epoch": 0.7239819004524887,
"grad_norm": 0.4664369523525238,
"learning_rate": 0.00019832952792540054,
"loss": 0.15900103747844696,
"mean_token_accuracy": 0.9659069031476974,
"num_tokens": 1765951.0,
"step": 200
},
{
"entropy": 1.4476028680801392,
"epoch": 0.7276018099547511,
"grad_norm": 0.6050535440444946,
"learning_rate": 0.00019830420195489877,
"loss": 0.17117048799991608,
"mean_token_accuracy": 0.9614047408103943,
"num_tokens": 1775046.0,
"step": 201
},
{
"entropy": 1.5342890918254852,
"epoch": 0.7312217194570135,
"grad_norm": 0.6432852745056152,
"learning_rate": 0.00019827868726862117,
"loss": 0.31473690271377563,
"mean_token_accuracy": 0.9222957193851471,
"num_tokens": 1783798.0,
"step": 202
},
{
"entropy": 1.533927708864212,
"epoch": 0.7348416289592761,
"grad_norm": 0.4874444305896759,
"learning_rate": 0.00019825298392109529,
"loss": 0.11646515130996704,
"mean_token_accuracy": 0.970250278711319,
"num_tokens": 1792421.0,
"step": 203
},
{
"entropy": 1.4147436618804932,
"epoch": 0.7384615384615385,
"grad_norm": 0.4096081256866455,
"learning_rate": 0.00019822709196725208,
"loss": 0.15057605504989624,
"mean_token_accuracy": 0.9578173905611038,
"num_tokens": 1801310.0,
"step": 204
},
{
"entropy": 1.447014480829239,
"epoch": 0.7420814479638009,
"grad_norm": 0.4614224135875702,
"learning_rate": 0.00019820101146242547,
"loss": 0.08990304172039032,
"mean_token_accuracy": 0.9700856953859329,
"num_tokens": 1810156.0,
"step": 205
},
{
"entropy": 1.4257702827453613,
"epoch": 0.7457013574660634,
"grad_norm": 0.7065203189849854,
"learning_rate": 0.00019817474246235233,
"loss": 0.23199987411499023,
"mean_token_accuracy": 0.9359505921602249,
"num_tokens": 1819254.0,
"step": 206
},
{
"entropy": 1.5035441517829895,
"epoch": 0.7493212669683258,
"grad_norm": 0.6417229771614075,
"learning_rate": 0.00019814828502317245,
"loss": 0.2971632480621338,
"mean_token_accuracy": 0.9346631169319153,
"num_tokens": 1828186.0,
"step": 207
},
{
"entropy": 1.451407551765442,
"epoch": 0.7529411764705882,
"grad_norm": 0.3759056031703949,
"learning_rate": 0.00019812163920142827,
"loss": 0.09616382420063019,
"mean_token_accuracy": 0.9760568290948868,
"num_tokens": 1837148.0,
"step": 208
},
{
"entropy": 1.5474788844585419,
"epoch": 0.7565610859728507,
"grad_norm": 0.5765470266342163,
"learning_rate": 0.0001980948050540648,
"loss": 0.17177847027778625,
"mean_token_accuracy": 0.9533937126398087,
"num_tokens": 1845935.0,
"step": 209
},
{
"entropy": 1.513023853302002,
"epoch": 0.7601809954751131,
"grad_norm": 0.42471620440483093,
"learning_rate": 0.00019806778263842964,
"loss": 0.13085290789604187,
"mean_token_accuracy": 0.9693024903535843,
"num_tokens": 1854982.0,
"step": 210
},
{
"entropy": 1.611119568347931,
"epoch": 0.7638009049773755,
"grad_norm": 0.4796192944049835,
"learning_rate": 0.00019804057201227259,
"loss": 0.10031422972679138,
"mean_token_accuracy": 0.9697081595659256,
"num_tokens": 1863315.0,
"step": 211
},
{
"entropy": 1.6045884191989899,
"epoch": 0.7674208144796381,
"grad_norm": 0.6628240346908569,
"learning_rate": 0.00019801317323374574,
"loss": 0.2653411030769348,
"mean_token_accuracy": 0.9474365711212158,
"num_tokens": 1871544.0,
"step": 212
},
{
"entropy": 1.4857927858829498,
"epoch": 0.7710407239819005,
"grad_norm": 0.5168606638908386,
"learning_rate": 0.00019798558636140333,
"loss": 0.11389698088169098,
"mean_token_accuracy": 0.9681493788957596,
"num_tokens": 1880335.0,
"step": 213
},
{
"entropy": 1.5443885922431946,
"epoch": 0.7746606334841629,
"grad_norm": 0.5839455723762512,
"learning_rate": 0.00019795781145420148,
"loss": 0.1918099969625473,
"mean_token_accuracy": 0.9581391960382462,
"num_tokens": 1889345.0,
"step": 214
},
{
"entropy": 1.456317961215973,
"epoch": 0.7782805429864253,
"grad_norm": 0.44960126280784607,
"learning_rate": 0.00019792984857149826,
"loss": 0.12002551555633545,
"mean_token_accuracy": 0.9649894386529922,
"num_tokens": 1898319.0,
"step": 215
},
{
"entropy": 1.4415479898452759,
"epoch": 0.7819004524886878,
"grad_norm": 0.4050678014755249,
"learning_rate": 0.00019790169777305345,
"loss": 0.11734722554683685,
"mean_token_accuracy": 0.9726845920085907,
"num_tokens": 1907445.0,
"step": 216
},
{
"entropy": 1.469126135110855,
"epoch": 0.7855203619909502,
"grad_norm": 0.6080633997917175,
"learning_rate": 0.00019787335911902835,
"loss": 0.23930203914642334,
"mean_token_accuracy": 0.9382640719413757,
"num_tokens": 1916139.0,
"step": 217
},
{
"entropy": 1.3760231733322144,
"epoch": 0.7891402714932126,
"grad_norm": 0.4775363802909851,
"learning_rate": 0.00019784483266998575,
"loss": 0.2287708967924118,
"mean_token_accuracy": 0.9554623812437057,
"num_tokens": 1925201.0,
"step": 218
},
{
"entropy": 1.4956690967082977,
"epoch": 0.7927601809954751,
"grad_norm": 0.7189075946807861,
"learning_rate": 0.0001978161184868899,
"loss": 0.29633429646492004,
"mean_token_accuracy": 0.9378493428230286,
"num_tokens": 1933751.0,
"step": 219
},
{
"entropy": 1.4381499290466309,
"epoch": 0.7963800904977375,
"grad_norm": 0.6960969567298889,
"learning_rate": 0.00019778721663110603,
"loss": 0.257027268409729,
"mean_token_accuracy": 0.9326806962490082,
"num_tokens": 1942630.0,
"step": 220
},
{
"entropy": 1.419518768787384,
"epoch": 0.8,
"grad_norm": 0.5286183953285217,
"learning_rate": 0.00019775812716440073,
"loss": 0.13132816553115845,
"mean_token_accuracy": 0.9626937806606293,
"num_tokens": 1951779.0,
"step": 221
},
{
"entropy": 1.4531921744346619,
"epoch": 0.8036199095022625,
"grad_norm": 0.5929961204528809,
"learning_rate": 0.00019772885014894125,
"loss": 0.2502361536026001,
"mean_token_accuracy": 0.9397906512022018,
"num_tokens": 1960796.0,
"step": 222
},
{
"entropy": 1.5002046525478363,
"epoch": 0.8072398190045249,
"grad_norm": 0.5530325770378113,
"learning_rate": 0.00019769938564729585,
"loss": 0.1540164351463318,
"mean_token_accuracy": 0.9645767956972122,
"num_tokens": 1969357.0,
"step": 223
},
{
"entropy": 1.4433454275131226,
"epoch": 0.8108597285067873,
"grad_norm": 0.598328709602356,
"learning_rate": 0.00019766973372243343,
"loss": 0.25215908885002136,
"mean_token_accuracy": 0.9557305723428726,
"num_tokens": 1978387.0,
"step": 224
},
{
"entropy": 1.4278393983840942,
"epoch": 0.8144796380090498,
"grad_norm": 0.7414979934692383,
"learning_rate": 0.00019763989443772337,
"loss": 0.2587546408176422,
"mean_token_accuracy": 0.9320106357336044,
"num_tokens": 1987317.0,
"step": 225
},
{
"entropy": 1.4586597084999084,
"epoch": 0.8180995475113122,
"grad_norm": 0.623528778553009,
"learning_rate": 0.0001976098678569355,
"loss": 0.21999062597751617,
"mean_token_accuracy": 0.9393687099218369,
"num_tokens": 1996024.0,
"step": 226
},
{
"entropy": 1.487585186958313,
"epoch": 0.8217194570135746,
"grad_norm": 0.6269425749778748,
"learning_rate": 0.00019757965404423994,
"loss": 0.2157050371170044,
"mean_token_accuracy": 0.9460557103157043,
"num_tokens": 2005075.0,
"step": 227
},
{
"entropy": 1.4444681107997894,
"epoch": 0.8253393665158371,
"grad_norm": 0.4378167390823364,
"learning_rate": 0.0001975492530642069,
"loss": 0.20458024740219116,
"mean_token_accuracy": 0.9438146203756332,
"num_tokens": 2014220.0,
"step": 228
},
{
"entropy": 1.4960754811763763,
"epoch": 0.8289592760180996,
"grad_norm": 0.4860404133796692,
"learning_rate": 0.0001975186649818066,
"loss": 0.28230106830596924,
"mean_token_accuracy": 0.9395165890455246,
"num_tokens": 2023338.0,
"step": 229
},
{
"entropy": 1.5679333209991455,
"epoch": 0.832579185520362,
"grad_norm": 0.655274510383606,
"learning_rate": 0.00019748788986240917,
"loss": 0.1854293942451477,
"mean_token_accuracy": 0.9481187909841537,
"num_tokens": 2031605.0,
"step": 230
},
{
"entropy": 1.4240575730800629,
"epoch": 0.8361990950226245,
"grad_norm": 0.3618476986885071,
"learning_rate": 0.0001974569277717844,
"loss": 0.09264940023422241,
"mean_token_accuracy": 0.975529670715332,
"num_tokens": 2041057.0,
"step": 231
},
{
"entropy": 1.5365903079509735,
"epoch": 0.8398190045248869,
"grad_norm": 0.4857717454433441,
"learning_rate": 0.00019742577877610173,
"loss": 0.3051563799381256,
"mean_token_accuracy": 0.9376842081546783,
"num_tokens": 2049959.0,
"step": 232
},
{
"entropy": 1.383328765630722,
"epoch": 0.8434389140271493,
"grad_norm": 0.45292651653289795,
"learning_rate": 0.0001973944429419299,
"loss": 0.2122434377670288,
"mean_token_accuracy": 0.9505620300769806,
"num_tokens": 2059255.0,
"step": 233
},
{
"entropy": 1.445828378200531,
"epoch": 0.8470588235294118,
"grad_norm": 0.41845881938934326,
"learning_rate": 0.00019736292033623704,
"loss": 0.11589747667312622,
"mean_token_accuracy": 0.9683210551738739,
"num_tokens": 2068297.0,
"step": 234
},
{
"entropy": 1.4795459508895874,
"epoch": 0.8506787330316742,
"grad_norm": 0.4432578384876251,
"learning_rate": 0.00019733121102639048,
"loss": 0.07866068929433823,
"mean_token_accuracy": 0.9755797684192657,
"num_tokens": 2076844.0,
"step": 235
},
{
"entropy": 1.5486867129802704,
"epoch": 0.8542986425339366,
"grad_norm": 0.5824456810951233,
"learning_rate": 0.00019729931508015647,
"loss": 0.23570555448532104,
"mean_token_accuracy": 0.943296879529953,
"num_tokens": 2085155.0,
"step": 236
},
{
"entropy": 1.4806998372077942,
"epoch": 0.857918552036199,
"grad_norm": 0.5646211504936218,
"learning_rate": 0.0001972672325657001,
"loss": 0.20387002825737,
"mean_token_accuracy": 0.9514354765415192,
"num_tokens": 2093967.0,
"step": 237
},
{
"entropy": 1.4301677942276,
"epoch": 0.8615384615384616,
"grad_norm": 0.5802773833274841,
"learning_rate": 0.0001972349635515853,
"loss": 0.2552819550037384,
"mean_token_accuracy": 0.922490268945694,
"num_tokens": 2103028.0,
"step": 238
},
{
"entropy": 1.324641764163971,
"epoch": 0.865158371040724,
"grad_norm": 0.4743529260158539,
"learning_rate": 0.00019720250810677446,
"loss": 0.12536188960075378,
"mean_token_accuracy": 0.9665613174438477,
"num_tokens": 2111936.0,
"step": 239
},
{
"entropy": 1.397893875837326,
"epoch": 0.8687782805429864,
"grad_norm": 0.555899441242218,
"learning_rate": 0.00019716986630062842,
"loss": 0.12725675106048584,
"mean_token_accuracy": 0.9541808664798737,
"num_tokens": 2120622.0,
"step": 240
},
{
"entropy": 1.3342379927635193,
"epoch": 0.8723981900452489,
"grad_norm": 0.552416205406189,
"learning_rate": 0.00019713703820290634,
"loss": 0.16678127646446228,
"mean_token_accuracy": 0.9486428052186966,
"num_tokens": 2129761.0,
"step": 241
},
{
"entropy": 1.3923524022102356,
"epoch": 0.8760180995475113,
"grad_norm": 0.5822156667709351,
"learning_rate": 0.00019710402388376544,
"loss": 0.1675427258014679,
"mean_token_accuracy": 0.957242414355278,
"num_tokens": 2138328.0,
"step": 242
},
{
"entropy": 1.3071589767932892,
"epoch": 0.8796380090497737,
"grad_norm": 0.4842599928379059,
"learning_rate": 0.00019707082341376093,
"loss": 0.10306215286254883,
"mean_token_accuracy": 0.9641513824462891,
"num_tokens": 2147288.0,
"step": 243
},
{
"entropy": 1.327076405286789,
"epoch": 0.8832579185520362,
"grad_norm": 0.5377116799354553,
"learning_rate": 0.0001970374368638459,
"loss": 0.11235228180885315,
"mean_token_accuracy": 0.9719918966293335,
"num_tokens": 2156064.0,
"step": 244
},
{
"entropy": 1.3401291370391846,
"epoch": 0.8868778280542986,
"grad_norm": 0.5172064900398254,
"learning_rate": 0.00019700386430537105,
"loss": 0.1520514041185379,
"mean_token_accuracy": 0.9551143050193787,
"num_tokens": 2165166.0,
"step": 245
},
{
"entropy": 1.335503101348877,
"epoch": 0.890497737556561,
"grad_norm": 0.551672101020813,
"learning_rate": 0.00019697010581008463,
"loss": 0.1884230524301529,
"mean_token_accuracy": 0.9497657865285873,
"num_tokens": 2174115.0,
"step": 246
},
{
"entropy": 1.3267870545387268,
"epoch": 0.8941176470588236,
"grad_norm": 0.4992049038410187,
"learning_rate": 0.00019693616145013227,
"loss": 0.18525123596191406,
"mean_token_accuracy": 0.9568396955728531,
"num_tokens": 2183228.0,
"step": 247
},
{
"entropy": 1.405649095773697,
"epoch": 0.897737556561086,
"grad_norm": 1.0682220458984375,
"learning_rate": 0.00019690203129805672,
"loss": 0.379827618598938,
"mean_token_accuracy": 0.8877183198928833,
"num_tokens": 2192055.0,
"step": 248
},
{
"entropy": 1.373319834470749,
"epoch": 0.9013574660633484,
"grad_norm": 0.44303223490715027,
"learning_rate": 0.00019686771542679797,
"loss": 0.09994952380657196,
"mean_token_accuracy": 0.9754335582256317,
"num_tokens": 2201010.0,
"step": 249
},
{
"entropy": 1.4375985264778137,
"epoch": 0.9049773755656109,
"grad_norm": 0.5711826682090759,
"learning_rate": 0.0001968332139096927,
"loss": 0.1751662790775299,
"mean_token_accuracy": 0.9558616876602173,
"num_tokens": 2209782.0,
"step": 250
},
{
"entropy": 1.393396943807602,
"epoch": 0.9085972850678733,
"grad_norm": 0.4692879915237427,
"learning_rate": 0.00019679852682047457,
"loss": 0.1593962460756302,
"mean_token_accuracy": 0.9541475772857666,
"num_tokens": 2218923.0,
"step": 251
},
{
"entropy": 1.3871179819107056,
"epoch": 0.9122171945701357,
"grad_norm": 0.4700246751308441,
"learning_rate": 0.0001967636542332736,
"loss": 0.11389590799808502,
"mean_token_accuracy": 0.971915066242218,
"num_tokens": 2227564.0,
"step": 252
},
{
"entropy": 1.3677232265472412,
"epoch": 0.9158371040723982,
"grad_norm": 0.3912360668182373,
"learning_rate": 0.00019672859622261633,
"loss": 0.09452933073043823,
"mean_token_accuracy": 0.9733647406101227,
"num_tokens": 2236836.0,
"step": 253
},
{
"entropy": 1.3464795053005219,
"epoch": 0.9194570135746606,
"grad_norm": 0.44805341958999634,
"learning_rate": 0.0001966933528634256,
"loss": 0.15307973325252533,
"mean_token_accuracy": 0.9646724760532379,
"num_tokens": 2245881.0,
"step": 254
},
{
"entropy": 1.3869231641292572,
"epoch": 0.9230769230769231,
"grad_norm": 0.4598497450351715,
"learning_rate": 0.00019665792423102037,
"loss": 0.13864350318908691,
"mean_token_accuracy": 0.963247537612915,
"num_tokens": 2254736.0,
"step": 255
},
{
"entropy": 1.385448396205902,
"epoch": 0.9266968325791856,
"grad_norm": 0.5225904583930969,
"learning_rate": 0.0001966223104011155,
"loss": 0.1266992688179016,
"mean_token_accuracy": 0.9631596356630325,
"num_tokens": 2263315.0,
"step": 256
},
{
"entropy": 1.4815338253974915,
"epoch": 0.930316742081448,
"grad_norm": 0.6528983116149902,
"learning_rate": 0.00019658651144982163,
"loss": 0.2216826230287552,
"mean_token_accuracy": 0.9490031599998474,
"num_tokens": 2271832.0,
"step": 257
},
{
"entropy": 1.4158953726291656,
"epoch": 0.9339366515837104,
"grad_norm": 0.45583924651145935,
"learning_rate": 0.00019655052745364509,
"loss": 0.13053885102272034,
"mean_token_accuracy": 0.9635764211416245,
"num_tokens": 2280582.0,
"step": 258
},
{
"entropy": 1.3917952477931976,
"epoch": 0.9375565610859729,
"grad_norm": 0.4816400110721588,
"learning_rate": 0.00019651435848948762,
"loss": 0.13310927152633667,
"mean_token_accuracy": 0.9665548801422119,
"num_tokens": 2289239.0,
"step": 259
},
{
"entropy": 1.335536628961563,
"epoch": 0.9411764705882353,
"grad_norm": 0.46578988432884216,
"learning_rate": 0.00019647800463464622,
"loss": 0.17411091923713684,
"mean_token_accuracy": 0.9487465471029282,
"num_tokens": 2298569.0,
"step": 260
},
{
"entropy": 1.3734354674816132,
"epoch": 0.9447963800904977,
"grad_norm": 0.4758208990097046,
"learning_rate": 0.00019644146596681312,
"loss": 0.13356219232082367,
"mean_token_accuracy": 0.966251790523529,
"num_tokens": 2307333.0,
"step": 261
},
{
"entropy": 1.3474647402763367,
"epoch": 0.9484162895927601,
"grad_norm": 0.30391982197761536,
"learning_rate": 0.00019640474256407545,
"loss": 0.07434239238500595,
"mean_token_accuracy": 0.9785039573907852,
"num_tokens": 2315917.0,
"step": 262
},
{
"entropy": 1.327022761106491,
"epoch": 0.9520361990950226,
"grad_norm": 0.6123433113098145,
"learning_rate": 0.00019636783450491517,
"loss": 0.37289372086524963,
"mean_token_accuracy": 0.9232099652290344,
"num_tokens": 2325041.0,
"step": 263
},
{
"entropy": 1.2676138877868652,
"epoch": 0.9556561085972851,
"grad_norm": 0.6251657605171204,
"learning_rate": 0.00019633074186820886,
"loss": 0.2077867090702057,
"mean_token_accuracy": 0.9404798150062561,
"num_tokens": 2334552.0,
"step": 264
},
{
"entropy": 1.3420342803001404,
"epoch": 0.9592760180995475,
"grad_norm": 0.44629064202308655,
"learning_rate": 0.0001962934647332275,
"loss": 0.10463929176330566,
"mean_token_accuracy": 0.9691900908946991,
"num_tokens": 2343361.0,
"step": 265
},
{
"entropy": 1.449256420135498,
"epoch": 0.96289592760181,
"grad_norm": 0.9838017821311951,
"learning_rate": 0.0001962560031796365,
"loss": 0.2925741970539093,
"mean_token_accuracy": 0.9219896346330643,
"num_tokens": 2351556.0,
"step": 266
},
{
"entropy": 1.2965570390224457,
"epoch": 0.9665158371040724,
"grad_norm": 0.5719688534736633,
"learning_rate": 0.00019621835728749525,
"loss": 0.23820579051971436,
"mean_token_accuracy": 0.9379686415195465,
"num_tokens": 2361033.0,
"step": 267
},
{
"entropy": 1.3180812895298004,
"epoch": 0.9701357466063348,
"grad_norm": 0.4512738883495331,
"learning_rate": 0.0001961805271372572,
"loss": 0.15501753985881805,
"mean_token_accuracy": 0.9587176293134689,
"num_tokens": 2369972.0,
"step": 268
},
{
"entropy": 1.360179752111435,
"epoch": 0.9737556561085973,
"grad_norm": 0.39895451068878174,
"learning_rate": 0.00019614251280976948,
"loss": 0.08832871168851852,
"mean_token_accuracy": 0.9742900878190994,
"num_tokens": 2378872.0,
"step": 269
},
{
"entropy": 1.3346007466316223,
"epoch": 0.9773755656108597,
"grad_norm": 0.4796980619430542,
"learning_rate": 0.00019610431438627296,
"loss": 0.1033824160695076,
"mean_token_accuracy": 0.9715064615011215,
"num_tokens": 2387688.0,
"step": 270
},
{
"entropy": 1.301027923822403,
"epoch": 0.9809954751131221,
"grad_norm": 0.4513569176197052,
"learning_rate": 0.00019606593194840177,
"loss": 0.18686418235301971,
"mean_token_accuracy": 0.9559367001056671,
"num_tokens": 2397053.0,
"step": 271
},
{
"entropy": 1.3664647042751312,
"epoch": 0.9846153846153847,
"grad_norm": 0.5319916009902954,
"learning_rate": 0.0001960273655781835,
"loss": 0.24958105385303497,
"mean_token_accuracy": 0.9426600635051727,
"num_tokens": 2405763.0,
"step": 272
},
{
"entropy": 1.3043864369392395,
"epoch": 0.9882352941176471,
"grad_norm": 0.44749879837036133,
"learning_rate": 0.00019598861535803863,
"loss": 0.15809005498886108,
"mean_token_accuracy": 0.9583301842212677,
"num_tokens": 2414884.0,
"step": 273
},
{
"entropy": 1.3713374137878418,
"epoch": 0.9918552036199095,
"grad_norm": 0.5084534883499146,
"learning_rate": 0.00019594968137078068,
"loss": 0.17388306558132172,
"mean_token_accuracy": 0.9520856887102127,
"num_tokens": 2423715.0,
"step": 274
},
{
"entropy": 1.365702897310257,
"epoch": 0.995475113122172,
"grad_norm": 0.459522008895874,
"learning_rate": 0.00019591056369961586,
"loss": 0.33452439308166504,
"mean_token_accuracy": 0.9534527063369751,
"num_tokens": 2432458.0,
"step": 275
},
{
"entropy": 1.3280319273471832,
"epoch": 0.9990950226244344,
"grad_norm": 0.4350273311138153,
"learning_rate": 0.00019587126242814288,
"loss": 0.12349647283554077,
"mean_token_accuracy": 0.9627684652805328,
"num_tokens": 2441126.0,
"step": 276
},
{
"entropy": 1.6572558879852295,
"epoch": 1.0,
"grad_norm": 0.9549095630645752,
"learning_rate": 0.00019583177764035295,
"loss": 0.028879065066576004,
"mean_token_accuracy": 1.0,
"num_tokens": 2441725.0,
"step": 277
},
{
"epoch": 1.0,
"eval_entropy": 1.3299247259046973,
"eval_loss": 0.16865810751914978,
"eval_mean_token_accuracy": 0.9547278832613937,
"eval_num_tokens": 2441725.0,
"eval_runtime": 31.7656,
"eval_samples_per_second": 11.616,
"eval_steps_per_second": 3.872,
"step": 277
},
{
"entropy": 1.3716817200183868,
"epoch": 1.0036199095022624,
"grad_norm": 0.6422934532165527,
"learning_rate": 0.00019579210942062932,
"loss": 0.1597217321395874,
"mean_token_accuracy": 0.955962136387825,
"num_tokens": 2450437.0,
"step": 278
},
{
"entropy": 1.2969892621040344,
"epoch": 1.0072398190045249,
"grad_norm": 0.44608697295188904,
"learning_rate": 0.0001957522578537474,
"loss": 0.09895047545433044,
"mean_token_accuracy": 0.9709436744451523,
"num_tokens": 2459078.0,
"step": 279
},
{
"entropy": 1.2229366302490234,
"epoch": 1.0108597285067873,
"grad_norm": 0.4623957872390747,
"learning_rate": 0.0001957122230248743,
"loss": 0.16620564460754395,
"mean_token_accuracy": 0.9499163031578064,
"num_tokens": 2468073.0,
"step": 280
},
{
"entropy": 1.33988156914711,
"epoch": 1.0144796380090497,
"grad_norm": 0.6455283761024475,
"learning_rate": 0.0001956720050195689,
"loss": 0.1705092489719391,
"mean_token_accuracy": 0.9635123610496521,
"num_tokens": 2476886.0,
"step": 281
},
{
"entropy": 1.2605039477348328,
"epoch": 1.0180995475113122,
"grad_norm": 0.4512750804424286,
"learning_rate": 0.00019563160392378144,
"loss": 0.30783483386039734,
"mean_token_accuracy": 0.932560071349144,
"num_tokens": 2486249.0,
"step": 282
},
{
"entropy": 1.2216798067092896,
"epoch": 1.0217194570135746,
"grad_norm": 0.45792317390441895,
"learning_rate": 0.00019559101982385356,
"loss": 0.1141764372587204,
"mean_token_accuracy": 0.9699619710445404,
"num_tokens": 2495163.0,
"step": 283
},
{
"entropy": 1.258674830198288,
"epoch": 1.025339366515837,
"grad_norm": 0.48922327160835266,
"learning_rate": 0.00019555025280651786,
"loss": 0.11167445778846741,
"mean_token_accuracy": 0.9687153398990631,
"num_tokens": 2504066.0,
"step": 284
},
{
"entropy": 1.3179872334003448,
"epoch": 1.0289592760180994,
"grad_norm": 0.6329558491706848,
"learning_rate": 0.00019550930295889803,
"loss": 0.17174683511257172,
"mean_token_accuracy": 0.9596489369869232,
"num_tokens": 2512418.0,
"step": 285
},
{
"entropy": 1.3070985078811646,
"epoch": 1.032579185520362,
"grad_norm": 0.6164458990097046,
"learning_rate": 0.00019546817036850827,
"loss": 0.15053772926330566,
"mean_token_accuracy": 0.9639337956905365,
"num_tokens": 2520822.0,
"step": 286
},
{
"entropy": 1.1846771538257599,
"epoch": 1.0361990950226245,
"grad_norm": 0.49578550457954407,
"learning_rate": 0.00019542685512325357,
"loss": 0.1119435578584671,
"mean_token_accuracy": 0.9712934345006943,
"num_tokens": 2530019.0,
"step": 287
},
{
"entropy": 1.1750589311122894,
"epoch": 1.039819004524887,
"grad_norm": 0.3983454406261444,
"learning_rate": 0.00019538535731142907,
"loss": 0.09416541457176208,
"mean_token_accuracy": 0.9704293310642242,
"num_tokens": 2539456.0,
"step": 288
},
{
"entropy": 1.3071076273918152,
"epoch": 1.0434389140271494,
"grad_norm": 0.6677536368370056,
"learning_rate": 0.00019534367702172016,
"loss": 0.19954326748847961,
"mean_token_accuracy": 0.9511717855930328,
"num_tokens": 2548133.0,
"step": 289
},
{
"entropy": 1.3091352581977844,
"epoch": 1.0470588235294118,
"grad_norm": 0.5290597081184387,
"learning_rate": 0.00019530181434320224,
"loss": 0.09285785257816315,
"mean_token_accuracy": 0.9735703021287918,
"num_tokens": 2556747.0,
"step": 290
},
{
"entropy": 1.1955563724040985,
"epoch": 1.0506787330316743,
"grad_norm": 0.5170581340789795,
"learning_rate": 0.00019525976936534035,
"loss": 0.1392112374305725,
"mean_token_accuracy": 0.9660876840353012,
"num_tokens": 2566078.0,
"step": 291
},
{
"entropy": 1.2591860592365265,
"epoch": 1.0542986425339367,
"grad_norm": 0.6322119832038879,
"learning_rate": 0.00019521754217798935,
"loss": 0.2537181079387665,
"mean_token_accuracy": 0.9380859881639481,
"num_tokens": 2575652.0,
"step": 292
},
{
"entropy": 1.3299800157546997,
"epoch": 1.0579185520361991,
"grad_norm": 0.6024647355079651,
"learning_rate": 0.00019517513287139326,
"loss": 0.1248047798871994,
"mean_token_accuracy": 0.9594511985778809,
"num_tokens": 2584405.0,
"step": 293
},
{
"entropy": 1.3162773251533508,
"epoch": 1.0615384615384615,
"grad_norm": 0.6070277690887451,
"learning_rate": 0.0001951325415361855,
"loss": 0.14759968221187592,
"mean_token_accuracy": 0.9557203203439713,
"num_tokens": 2593314.0,
"step": 294
},
{
"entropy": 1.2592947483062744,
"epoch": 1.065158371040724,
"grad_norm": 0.44067755341529846,
"learning_rate": 0.00019508976826338844,
"loss": 0.131802037358284,
"mean_token_accuracy": 0.9598903208971024,
"num_tokens": 2602597.0,
"step": 295
},
{
"entropy": 1.2933553457260132,
"epoch": 1.0687782805429864,
"grad_norm": 0.5667601227760315,
"learning_rate": 0.00019504681314441323,
"loss": 0.15577419102191925,
"mean_token_accuracy": 0.9549229890108109,
"num_tokens": 2611891.0,
"step": 296
},
{
"entropy": 1.3254594206809998,
"epoch": 1.0723981900452488,
"grad_norm": 0.519059956073761,
"learning_rate": 0.00019500367627105965,
"loss": 0.14545762538909912,
"mean_token_accuracy": 0.9637808352708817,
"num_tokens": 2620889.0,
"step": 297
},
{
"entropy": 1.277051329612732,
"epoch": 1.0760180995475113,
"grad_norm": 0.4660187065601349,
"learning_rate": 0.00019496035773551592,
"loss": 0.14717882871627808,
"mean_token_accuracy": 0.9652319550514221,
"num_tokens": 2629885.0,
"step": 298
},
{
"entropy": 1.3058906197547913,
"epoch": 1.0796380090497737,
"grad_norm": 0.8524317145347595,
"learning_rate": 0.0001949168576303586,
"loss": 0.10511646419763565,
"mean_token_accuracy": 0.9672305583953857,
"num_tokens": 2638639.0,
"step": 299
},
{
"entropy": 1.3437564969062805,
"epoch": 1.0832579185520361,
"grad_norm": 0.9568637609481812,
"learning_rate": 0.00019487317604855212,
"loss": 0.12141256034374237,
"mean_token_accuracy": 0.9637749493122101,
"num_tokens": 2647779.0,
"step": 300
},
{
"entropy": 1.3359644711017609,
"epoch": 1.0868778280542986,
"grad_norm": 0.6103286147117615,
"learning_rate": 0.00019482931308344888,
"loss": 0.14975383877754211,
"mean_token_accuracy": 0.9517766237258911,
"num_tokens": 2656565.0,
"step": 301
},
{
"entropy": 1.2883181869983673,
"epoch": 1.090497737556561,
"grad_norm": 0.4035210609436035,
"learning_rate": 0.00019478526882878876,
"loss": 0.10880422592163086,
"mean_token_accuracy": 0.9710263162851334,
"num_tokens": 2665569.0,
"step": 302
},
{
"entropy": 1.357493907213211,
"epoch": 1.0941176470588236,
"grad_norm": 0.5569011569023132,
"learning_rate": 0.00019474104337869924,
"loss": 0.13409318029880524,
"mean_token_accuracy": 0.9593861550092697,
"num_tokens": 2674397.0,
"step": 303
},
{
"entropy": 1.3459482192993164,
"epoch": 1.097737556561086,
"grad_norm": 0.6249450445175171,
"learning_rate": 0.00019469663682769491,
"loss": 0.19079425930976868,
"mean_token_accuracy": 0.9542694389820099,
"num_tokens": 2683208.0,
"step": 304
},
{
"entropy": 1.3437992334365845,
"epoch": 1.1013574660633485,
"grad_norm": 0.5010721683502197,
"learning_rate": 0.00019465204927067754,
"loss": 0.14577272534370422,
"mean_token_accuracy": 0.9553558230400085,
"num_tokens": 2691945.0,
"step": 305
},
{
"entropy": 1.42454132437706,
"epoch": 1.104977375565611,
"grad_norm": 0.5698776841163635,
"learning_rate": 0.0001946072808029355,
"loss": 0.15152013301849365,
"mean_token_accuracy": 0.9586130678653717,
"num_tokens": 2700595.0,
"step": 306
},
{
"entropy": 1.3169154226779938,
"epoch": 1.1085972850678734,
"grad_norm": 0.5620861649513245,
"learning_rate": 0.00019456233152014406,
"loss": 0.20957821607589722,
"mean_token_accuracy": 0.9405840635299683,
"num_tokens": 2709771.0,
"step": 307
},
{
"entropy": 1.358991116285324,
"epoch": 1.1122171945701358,
"grad_norm": 0.46596547961235046,
"learning_rate": 0.00019451720151836467,
"loss": 0.12322796881198883,
"mean_token_accuracy": 0.9694350957870483,
"num_tokens": 2718386.0,
"step": 308
},
{
"entropy": 1.3823304772377014,
"epoch": 1.1158371040723982,
"grad_norm": 0.4822905659675598,
"learning_rate": 0.00019447189089404513,
"loss": 0.08683277666568756,
"mean_token_accuracy": 0.9738442450761795,
"num_tokens": 2726918.0,
"step": 309
},
{
"entropy": 1.2822128236293793,
"epoch": 1.1194570135746607,
"grad_norm": 0.5359209775924683,
"learning_rate": 0.00019442639974401923,
"loss": 0.14018404483795166,
"mean_token_accuracy": 0.9630445092916489,
"num_tokens": 2736277.0,
"step": 310
},
{
"entropy": 1.2816834151744843,
"epoch": 1.123076923076923,
"grad_norm": 0.4495919644832611,
"learning_rate": 0.00019438072816550654,
"loss": 0.13849881291389465,
"mean_token_accuracy": 0.9659072607755661,
"num_tokens": 2745710.0,
"step": 311
},
{
"entropy": 1.308707356452942,
"epoch": 1.1266968325791855,
"grad_norm": 0.404366135597229,
"learning_rate": 0.0001943348762561123,
"loss": 0.09485723078250885,
"mean_token_accuracy": 0.9704456329345703,
"num_tokens": 2754571.0,
"step": 312
},
{
"entropy": 1.2276718020439148,
"epoch": 1.130316742081448,
"grad_norm": 0.524721086025238,
"learning_rate": 0.00019428884411382694,
"loss": 0.15793752670288086,
"mean_token_accuracy": 0.9559158235788345,
"num_tokens": 2763888.0,
"step": 313
},
{
"entropy": 1.1902028918266296,
"epoch": 1.1339366515837104,
"grad_norm": 0.41495031118392944,
"learning_rate": 0.00019424263183702634,
"loss": 0.08333931863307953,
"mean_token_accuracy": 0.9717884361743927,
"num_tokens": 2773451.0,
"step": 314
},
{
"entropy": 1.3228943943977356,
"epoch": 1.1375565610859728,
"grad_norm": 0.5118943452835083,
"learning_rate": 0.00019419623952447113,
"loss": 0.13395828008651733,
"mean_token_accuracy": 0.9631195217370987,
"num_tokens": 2782201.0,
"step": 315
},
{
"entropy": 1.2930387556552887,
"epoch": 1.1411764705882352,
"grad_norm": 0.5813370943069458,
"learning_rate": 0.0001941496672753068,
"loss": 0.20286405086517334,
"mean_token_accuracy": 0.9531850218772888,
"num_tokens": 2791064.0,
"step": 316
},
{
"entropy": 1.2918854355812073,
"epoch": 1.1447963800904977,
"grad_norm": 0.5122251510620117,
"learning_rate": 0.00019410291518906337,
"loss": 0.13441404700279236,
"mean_token_accuracy": 0.9620678126811981,
"num_tokens": 2799899.0,
"step": 317
},
{
"entropy": 1.2272875010967255,
"epoch": 1.14841628959276,
"grad_norm": 0.4150262475013733,
"learning_rate": 0.00019405598336565518,
"loss": 0.11041846871376038,
"mean_token_accuracy": 0.9675246626138687,
"num_tokens": 2809282.0,
"step": 318
},
{
"entropy": 1.2951529920101166,
"epoch": 1.1520361990950225,
"grad_norm": 0.6175352334976196,
"learning_rate": 0.00019400887190538068,
"loss": 0.23986774682998657,
"mean_token_accuracy": 0.9405841678380966,
"num_tokens": 2818065.0,
"step": 319
},
{
"entropy": 1.279006689786911,
"epoch": 1.155656108597285,
"grad_norm": 0.5972070693969727,
"learning_rate": 0.00019396158090892224,
"loss": 0.11880911141633987,
"mean_token_accuracy": 0.9688185602426529,
"num_tokens": 2827164.0,
"step": 320
},
{
"entropy": 1.3515214622020721,
"epoch": 1.1592760180995474,
"grad_norm": 0.6211175322532654,
"learning_rate": 0.00019391411047734589,
"loss": 0.20557641983032227,
"mean_token_accuracy": 0.9422554075717926,
"num_tokens": 2835820.0,
"step": 321
},
{
"entropy": 1.3550400137901306,
"epoch": 1.16289592760181,
"grad_norm": 0.5280090570449829,
"learning_rate": 0.00019386646071210118,
"loss": 0.10726026445627213,
"mean_token_accuracy": 0.9667237550020218,
"num_tokens": 2844372.0,
"step": 322
},
{
"entropy": 1.2510974407196045,
"epoch": 1.1665158371040725,
"grad_norm": 0.5300607085227966,
"learning_rate": 0.00019381863171502088,
"loss": 0.11015061289072037,
"mean_token_accuracy": 0.9701625555753708,
"num_tokens": 2853765.0,
"step": 323
},
{
"entropy": 1.3709427416324615,
"epoch": 1.170135746606335,
"grad_norm": 0.5143060088157654,
"learning_rate": 0.00019377062358832083,
"loss": 0.1023775190114975,
"mean_token_accuracy": 0.9671410173177719,
"num_tokens": 2862380.0,
"step": 324
},
{
"entropy": 1.3767890334129333,
"epoch": 1.1737556561085973,
"grad_norm": 0.5431631207466125,
"learning_rate": 0.00019372243643459963,
"loss": 0.10558684170246124,
"mean_token_accuracy": 0.9684659391641617,
"num_tokens": 2870789.0,
"step": 325
},
{
"entropy": 1.2412969470024109,
"epoch": 1.1773755656108598,
"grad_norm": 0.4398118257522583,
"learning_rate": 0.0001936740703568386,
"loss": 0.08340749889612198,
"mean_token_accuracy": 0.9579537361860275,
"num_tokens": 2880113.0,
"step": 326
},
{
"entropy": 1.3048627376556396,
"epoch": 1.1809954751131222,
"grad_norm": 0.5652127861976624,
"learning_rate": 0.00019362552545840121,
"loss": 0.10193085670471191,
"mean_token_accuracy": 0.9732934236526489,
"num_tokens": 2888677.0,
"step": 327
},
{
"entropy": 1.251385748386383,
"epoch": 1.1846153846153846,
"grad_norm": 1.2118943929672241,
"learning_rate": 0.00019357680184303334,
"loss": 0.11773515492677689,
"mean_token_accuracy": 0.9713618904352188,
"num_tokens": 2897670.0,
"step": 328
},
{
"entropy": 1.2210056483745575,
"epoch": 1.188235294117647,
"grad_norm": 0.566527247428894,
"learning_rate": 0.00019352789961486273,
"loss": 0.13454154133796692,
"mean_token_accuracy": 0.9604692161083221,
"num_tokens": 2906641.0,
"step": 329
},
{
"entropy": 1.2078846395015717,
"epoch": 1.1918552036199095,
"grad_norm": 0.5194998979568481,
"learning_rate": 0.00019347881887839878,
"loss": 0.12406279891729355,
"mean_token_accuracy": 0.9622378796339035,
"num_tokens": 2915822.0,
"step": 330
},
{
"entropy": 1.1988820135593414,
"epoch": 1.195475113122172,
"grad_norm": 0.6065554618835449,
"learning_rate": 0.00019342955973853236,
"loss": 0.13872164487838745,
"mean_token_accuracy": 0.9626883864402771,
"num_tokens": 2924800.0,
"step": 331
},
{
"entropy": 1.2309212684631348,
"epoch": 1.1990950226244343,
"grad_norm": 0.4229549765586853,
"learning_rate": 0.00019338012230053574,
"loss": 0.0660879835486412,
"mean_token_accuracy": 0.9759407639503479,
"num_tokens": 2933258.0,
"step": 332
},
{
"entropy": 1.2222792506217957,
"epoch": 1.2027149321266968,
"grad_norm": 0.3530382215976715,
"learning_rate": 0.00019333050667006213,
"loss": 0.049285903573036194,
"mean_token_accuracy": 0.9843859821557999,
"num_tokens": 2942300.0,
"step": 333
},
{
"entropy": 1.2958929240703583,
"epoch": 1.2063348416289592,
"grad_norm": 0.7826951742172241,
"learning_rate": 0.00019328071295314557,
"loss": 0.1304435431957245,
"mean_token_accuracy": 0.9591120481491089,
"num_tokens": 2950767.0,
"step": 334
},
{
"entropy": 1.117813378572464,
"epoch": 1.2099547511312216,
"grad_norm": 0.4954948425292969,
"learning_rate": 0.0001932307412562007,
"loss": 0.13508693873882294,
"mean_token_accuracy": 0.9585950672626495,
"num_tokens": 2960646.0,
"step": 335
},
{
"entropy": 1.1835031807422638,
"epoch": 1.213574660633484,
"grad_norm": 0.5961520671844482,
"learning_rate": 0.00019318059168602251,
"loss": 0.13114790618419647,
"mean_token_accuracy": 0.9647198617458344,
"num_tokens": 2969505.0,
"step": 336
},
{
"entropy": 1.2350926995277405,
"epoch": 1.2171945701357467,
"grad_norm": 0.639654815196991,
"learning_rate": 0.0001931302643497862,
"loss": 0.14303456246852875,
"mean_token_accuracy": 0.9530725330114365,
"num_tokens": 2978152.0,
"step": 337
},
{
"entropy": 1.2022943198680878,
"epoch": 1.2208144796380092,
"grad_norm": 0.481646865606308,
"learning_rate": 0.00019307975935504672,
"loss": 0.08106391131877899,
"mean_token_accuracy": 0.9747956246137619,
"num_tokens": 2987130.0,
"step": 338
},
{
"entropy": 1.1898784339427948,
"epoch": 1.2244343891402716,
"grad_norm": 0.6450216770172119,
"learning_rate": 0.00019302907680973888,
"loss": 0.13768108189105988,
"mean_token_accuracy": 0.9617143720388412,
"num_tokens": 2996486.0,
"step": 339
},
{
"entropy": 1.2522628903388977,
"epoch": 1.228054298642534,
"grad_norm": 0.6536288857460022,
"learning_rate": 0.00019297821682217676,
"loss": 0.10431766510009766,
"mean_token_accuracy": 0.9742023795843124,
"num_tokens": 3005405.0,
"step": 340
},
{
"entropy": 1.227498173713684,
"epoch": 1.2316742081447964,
"grad_norm": 0.43904492259025574,
"learning_rate": 0.00019292717950105382,
"loss": 0.08097510784864426,
"mean_token_accuracy": 0.9721501767635345,
"num_tokens": 3014074.0,
"step": 341
},
{
"entropy": 1.2638218104839325,
"epoch": 1.2352941176470589,
"grad_norm": 0.43028151988983154,
"learning_rate": 0.00019287596495544233,
"loss": 0.08344768732786179,
"mean_token_accuracy": 0.9769876450300217,
"num_tokens": 3022652.0,
"step": 342
},
{
"entropy": 1.3337022960186005,
"epoch": 1.2389140271493213,
"grad_norm": 0.5973348021507263,
"learning_rate": 0.0001928245732947935,
"loss": 0.15496382117271423,
"mean_token_accuracy": 0.9576286971569061,
"num_tokens": 3030878.0,
"step": 343
},
{
"entropy": 1.3223533630371094,
"epoch": 1.2425339366515837,
"grad_norm": 0.5969900488853455,
"learning_rate": 0.0001927730046289369,
"loss": 0.15188942849636078,
"mean_token_accuracy": 0.9633741676807404,
"num_tokens": 3039311.0,
"step": 344
},
{
"entropy": 1.2445839643478394,
"epoch": 1.2461538461538462,
"grad_norm": 0.528199315071106,
"learning_rate": 0.00019272125906808038,
"loss": 0.17896166443824768,
"mean_token_accuracy": 0.9567457586526871,
"num_tokens": 3048358.0,
"step": 345
},
{
"entropy": 1.2038472890853882,
"epoch": 1.2497737556561086,
"grad_norm": 0.5293557643890381,
"learning_rate": 0.00019266933672280998,
"loss": 0.09390994906425476,
"mean_token_accuracy": 0.9739362895488739,
"num_tokens": 3057158.0,
"step": 346
},
{
"entropy": 1.3120388686656952,
"epoch": 1.253393665158371,
"grad_norm": 0.6997618675231934,
"learning_rate": 0.00019261723770408942,
"loss": 0.10516057163476944,
"mean_token_accuracy": 0.9721816033124924,
"num_tokens": 3065552.0,
"step": 347
},
{
"entropy": 1.2618502080440521,
"epoch": 1.2570135746606335,
"grad_norm": 0.9361847043037415,
"learning_rate": 0.00019256496212326,
"loss": 0.20228593051433563,
"mean_token_accuracy": 0.952130600810051,
"num_tokens": 3074114.0,
"step": 348
},
{
"entropy": 1.1915834546089172,
"epoch": 1.260633484162896,
"grad_norm": 0.5289909243583679,
"learning_rate": 0.00019251251009204037,
"loss": 0.190103217959404,
"mean_token_accuracy": 0.9520878791809082,
"num_tokens": 3083265.0,
"step": 349
},
{
"entropy": 1.2389221489429474,
"epoch": 1.2642533936651583,
"grad_norm": 0.5938348770141602,
"learning_rate": 0.0001924598817225263,
"loss": 0.14038029313087463,
"mean_token_accuracy": 0.9594598710536957,
"num_tokens": 3092345.0,
"step": 350
},
{
"entropy": 1.2747001945972443,
"epoch": 1.2678733031674208,
"grad_norm": 0.49824684858322144,
"learning_rate": 0.00019240707712719042,
"loss": 0.09080217778682709,
"mean_token_accuracy": 0.9752634167671204,
"num_tokens": 3100948.0,
"step": 351
},
{
"entropy": 1.2456386089324951,
"epoch": 1.2714932126696832,
"grad_norm": 0.4126341938972473,
"learning_rate": 0.0001923540964188819,
"loss": 0.07552246004343033,
"mean_token_accuracy": 0.973249226808548,
"num_tokens": 3109527.0,
"step": 352
},
{
"entropy": 1.2375999987125397,
"epoch": 1.2751131221719456,
"grad_norm": 0.5416721701622009,
"learning_rate": 0.0001923009397108264,
"loss": 0.11064227670431137,
"mean_token_accuracy": 0.9698382914066315,
"num_tokens": 3118529.0,
"step": 353
},
{
"entropy": 1.323571503162384,
"epoch": 1.278733031674208,
"grad_norm": 0.539566159248352,
"learning_rate": 0.00019224760711662555,
"loss": 0.06606832891702652,
"mean_token_accuracy": 0.9804727733135223,
"num_tokens": 3127119.0,
"step": 354
},
{
"entropy": 1.2750852406024933,
"epoch": 1.2823529411764705,
"grad_norm": 0.466310054063797,
"learning_rate": 0.00019219409875025705,
"loss": 0.09571981430053711,
"mean_token_accuracy": 0.9744787514209747,
"num_tokens": 3135490.0,
"step": 355
},
{
"entropy": 1.2746248841285706,
"epoch": 1.285972850678733,
"grad_norm": 0.5672609210014343,
"learning_rate": 0.00019214041472607408,
"loss": 0.11834894865751266,
"mean_token_accuracy": 0.9640013575553894,
"num_tokens": 3144069.0,
"step": 356
},
{
"entropy": 1.2627168595790863,
"epoch": 1.2895927601809956,
"grad_norm": 0.6581894755363464,
"learning_rate": 0.00019208655515880532,
"loss": 0.09647037833929062,
"mean_token_accuracy": 0.9689393192529678,
"num_tokens": 3152615.0,
"step": 357
},
{
"entropy": 1.1794680655002594,
"epoch": 1.293212669683258,
"grad_norm": 0.5053655505180359,
"learning_rate": 0.00019203252016355458,
"loss": 0.11060067266225815,
"mean_token_accuracy": 0.9651738703250885,
"num_tokens": 3161401.0,
"step": 358
},
{
"entropy": 1.212617665529251,
"epoch": 1.2968325791855204,
"grad_norm": 0.8028094172477722,
"learning_rate": 0.00019197830985580064,
"loss": 0.2338217943906784,
"mean_token_accuracy": 0.9495572596788406,
"num_tokens": 3170012.0,
"step": 359
},
{
"entropy": 1.1559679508209229,
"epoch": 1.3004524886877828,
"grad_norm": 0.7312418818473816,
"learning_rate": 0.00019192392435139676,
"loss": 0.20356854796409607,
"mean_token_accuracy": 0.948117196559906,
"num_tokens": 3179571.0,
"step": 360
},
{
"entropy": 1.23529851436615,
"epoch": 1.3040723981900453,
"grad_norm": 0.5860849022865295,
"learning_rate": 0.00019186936376657085,
"loss": 0.09007853269577026,
"mean_token_accuracy": 0.9779597371816635,
"num_tokens": 3188355.0,
"step": 361
},
{
"entropy": 1.1607768833637238,
"epoch": 1.3076923076923077,
"grad_norm": 0.5011453032493591,
"learning_rate": 0.0001918146282179248,
"loss": 0.10823806375265121,
"mean_token_accuracy": 0.9690431505441666,
"num_tokens": 3197273.0,
"step": 362
},
{
"entropy": 1.3031161725521088,
"epoch": 1.3113122171945701,
"grad_norm": 0.7570735812187195,
"learning_rate": 0.0001917597178224345,
"loss": 0.12617093324661255,
"mean_token_accuracy": 0.9726371467113495,
"num_tokens": 3205523.0,
"step": 363
},
{
"entropy": 1.2163672745227814,
"epoch": 1.3149321266968326,
"grad_norm": 0.79036545753479,
"learning_rate": 0.0001917046326974495,
"loss": 0.1774558573961258,
"mean_token_accuracy": 0.9524620473384857,
"num_tokens": 3214337.0,
"step": 364
},
{
"entropy": 1.2065471410751343,
"epoch": 1.318552036199095,
"grad_norm": 0.4970189332962036,
"learning_rate": 0.00019164937296069275,
"loss": 0.11900650709867477,
"mean_token_accuracy": 0.9698603302240372,
"num_tokens": 3223272.0,
"step": 365
},
{
"entropy": 1.1421026289463043,
"epoch": 1.3221719457013574,
"grad_norm": 0.4852188527584076,
"learning_rate": 0.0001915939387302604,
"loss": 0.13389186561107635,
"mean_token_accuracy": 0.9624761492013931,
"num_tokens": 3232805.0,
"step": 366
},
{
"entropy": 1.2605140805244446,
"epoch": 1.3257918552036199,
"grad_norm": 0.4731639623641968,
"learning_rate": 0.00019153833012462148,
"loss": 0.13248053193092346,
"mean_token_accuracy": 0.9626576453447342,
"num_tokens": 3242004.0,
"step": 367
},
{
"entropy": 1.2730947136878967,
"epoch": 1.3294117647058823,
"grad_norm": 0.5051520466804504,
"learning_rate": 0.0001914825472626177,
"loss": 0.13219039142131805,
"mean_token_accuracy": 0.9609686881303787,
"num_tokens": 3250883.0,
"step": 368
},
{
"entropy": 1.2701692283153534,
"epoch": 1.3330316742081447,
"grad_norm": 0.4948354959487915,
"learning_rate": 0.00019142659026346315,
"loss": 0.11131806671619415,
"mean_token_accuracy": 0.9648500084877014,
"num_tokens": 3259677.0,
"step": 369
},
{
"entropy": 1.1959101557731628,
"epoch": 1.3366515837104074,
"grad_norm": 0.4939797520637512,
"learning_rate": 0.00019137045924674402,
"loss": 0.23960143327713013,
"mean_token_accuracy": 0.9330800324678421,
"num_tokens": 3268905.0,
"step": 370
},
{
"entropy": 1.233262300491333,
"epoch": 1.3402714932126698,
"grad_norm": 0.5234730839729309,
"learning_rate": 0.00019131415433241855,
"loss": 0.1446327269077301,
"mean_token_accuracy": 0.960878998041153,
"num_tokens": 3278287.0,
"step": 371
},
{
"entropy": 1.273567020893097,
"epoch": 1.3438914027149322,
"grad_norm": 0.8267992734909058,
"learning_rate": 0.0001912576756408165,
"loss": 0.18108022212982178,
"mean_token_accuracy": 0.957704022526741,
"num_tokens": 3287174.0,
"step": 372
},
{
"entropy": 1.222804993391037,
"epoch": 1.3475113122171947,
"grad_norm": 0.45400527119636536,
"learning_rate": 0.000191201023292639,
"loss": 0.0790121927857399,
"mean_token_accuracy": 0.9772313088178635,
"num_tokens": 3296209.0,
"step": 373
},
{
"entropy": 1.2407637536525726,
"epoch": 1.351131221719457,
"grad_norm": 0.4726577401161194,
"learning_rate": 0.00019114419740895837,
"loss": 0.09999781101942062,
"mean_token_accuracy": 0.9702717959880829,
"num_tokens": 3304805.0,
"step": 374
},
{
"entropy": 1.287319839000702,
"epoch": 1.3547511312217195,
"grad_norm": 0.7152829766273499,
"learning_rate": 0.00019108719811121772,
"loss": 0.24104124307632446,
"mean_token_accuracy": 0.9389240592718124,
"num_tokens": 3313481.0,
"step": 375
},
{
"entropy": 1.3423154652118683,
"epoch": 1.358371040723982,
"grad_norm": 0.643339991569519,
"learning_rate": 0.00019103002552123087,
"loss": 0.12666553258895874,
"mean_token_accuracy": 0.9654455035924911,
"num_tokens": 3321945.0,
"step": 376
},
{
"entropy": 1.291712373495102,
"epoch": 1.3619909502262444,
"grad_norm": 0.6353316307067871,
"learning_rate": 0.0001909726797611819,
"loss": 0.19853462278842926,
"mean_token_accuracy": 0.9522408545017242,
"num_tokens": 3330280.0,
"step": 377
},
{
"entropy": 1.2879594564437866,
"epoch": 1.3656108597285068,
"grad_norm": 0.5303124785423279,
"learning_rate": 0.000190915160953625,
"loss": 0.07663790136575699,
"mean_token_accuracy": 0.976201668381691,
"num_tokens": 3338655.0,
"step": 378
},
{
"entropy": 1.2824178040027618,
"epoch": 1.3692307692307693,
"grad_norm": 0.7815021872520447,
"learning_rate": 0.00019085746922148413,
"loss": 0.29046493768692017,
"mean_token_accuracy": 0.9358613342046738,
"num_tokens": 3347371.0,
"step": 379
},
{
"entropy": 1.3061807453632355,
"epoch": 1.3728506787330317,
"grad_norm": 0.6244264841079712,
"learning_rate": 0.00019079960468805293,
"loss": 0.14304828643798828,
"mean_token_accuracy": 0.9528721123933792,
"num_tokens": 3355742.0,
"step": 380
},
{
"entropy": 1.2946366965770721,
"epoch": 1.3764705882352941,
"grad_norm": 0.5752382278442383,
"learning_rate": 0.0001907415674769942,
"loss": 0.10442131012678146,
"mean_token_accuracy": 0.9699090272188187,
"num_tokens": 3364520.0,
"step": 381
},
{
"entropy": 1.341704785823822,
"epoch": 1.3800904977375565,
"grad_norm": 0.5594130754470825,
"learning_rate": 0.00019068335771233987,
"loss": 0.15637581050395966,
"mean_token_accuracy": 0.957904726266861,
"num_tokens": 3373052.0,
"step": 382
},
{
"entropy": 1.2504234313964844,
"epoch": 1.383710407239819,
"grad_norm": 0.5094558596611023,
"learning_rate": 0.0001906249755184906,
"loss": 0.1411437839269638,
"mean_token_accuracy": 0.9628296792507172,
"num_tokens": 3382086.0,
"step": 383
},
{
"entropy": 1.2977957129478455,
"epoch": 1.3873303167420814,
"grad_norm": 0.745290994644165,
"learning_rate": 0.00019056642102021555,
"loss": 0.16349725425243378,
"mean_token_accuracy": 0.9588443785905838,
"num_tokens": 3391079.0,
"step": 384
},
{
"entropy": 1.3416646420955658,
"epoch": 1.3909502262443438,
"grad_norm": 0.6071876287460327,
"learning_rate": 0.00019050769434265206,
"loss": 0.09388062357902527,
"mean_token_accuracy": 0.9732513576745987,
"num_tokens": 3399955.0,
"step": 385
},
{
"entropy": 1.2769374251365662,
"epoch": 1.3945701357466063,
"grad_norm": 0.458711713552475,
"learning_rate": 0.00019044879561130553,
"loss": 0.0859433189034462,
"mean_token_accuracy": 0.9764412939548492,
"num_tokens": 3408410.0,
"step": 386
},
{
"entropy": 1.2753216326236725,
"epoch": 1.3981900452488687,
"grad_norm": 0.5320125818252563,
"learning_rate": 0.00019038972495204906,
"loss": 0.12986987829208374,
"mean_token_accuracy": 0.9634640216827393,
"num_tokens": 3416972.0,
"step": 387
},
{
"entropy": 1.3119005858898163,
"epoch": 1.4018099547511311,
"grad_norm": 0.5521829128265381,
"learning_rate": 0.00019033048249112304,
"loss": 0.12304998189210892,
"mean_token_accuracy": 0.9719865322113037,
"num_tokens": 3425822.0,
"step": 388
},
{
"entropy": 1.246106207370758,
"epoch": 1.4054298642533936,
"grad_norm": 0.8556618690490723,
"learning_rate": 0.00019027106835513519,
"loss": 0.09790987521409988,
"mean_token_accuracy": 0.9762465804815292,
"num_tokens": 3434913.0,
"step": 389
},
{
"entropy": 1.3027318120002747,
"epoch": 1.409049773755656,
"grad_norm": 0.5831668972969055,
"learning_rate": 0.00019021148267106002,
"loss": 0.12975762784481049,
"mean_token_accuracy": 0.9633228182792664,
"num_tokens": 3443968.0,
"step": 390
},
{
"entropy": 1.2715855538845062,
"epoch": 1.4126696832579184,
"grad_norm": 0.5838220715522766,
"learning_rate": 0.00019015172556623863,
"loss": 0.24836933612823486,
"mean_token_accuracy": 0.9483881741762161,
"num_tokens": 3453112.0,
"step": 391
},
{
"entropy": 1.282654047012329,
"epoch": 1.416289592760181,
"grad_norm": 0.5957282185554504,
"learning_rate": 0.00019009179716837865,
"loss": 0.15488135814666748,
"mean_token_accuracy": 0.9558688700199127,
"num_tokens": 3461543.0,
"step": 392
},
{
"entropy": 1.2500621974468231,
"epoch": 1.4199095022624435,
"grad_norm": 0.5969480872154236,
"learning_rate": 0.0001900316976055535,
"loss": 0.118685781955719,
"mean_token_accuracy": 0.9681677222251892,
"num_tokens": 3470434.0,
"step": 393
},
{
"entropy": 1.1552152931690216,
"epoch": 1.423529411764706,
"grad_norm": 0.9155212044715881,
"learning_rate": 0.00018997142700620257,
"loss": 0.07897457480430603,
"mean_token_accuracy": 0.9746341109275818,
"num_tokens": 3479766.0,
"step": 394
},
{
"entropy": 1.2394072115421295,
"epoch": 1.4271493212669684,
"grad_norm": 0.5433281064033508,
"learning_rate": 0.00018991098549913084,
"loss": 0.10004748404026031,
"mean_token_accuracy": 0.970876082777977,
"num_tokens": 3487864.0,
"step": 395
},
{
"entropy": 1.194315493106842,
"epoch": 1.4307692307692308,
"grad_norm": 0.5625967383384705,
"learning_rate": 0.00018985037321350836,
"loss": 0.15023575723171234,
"mean_token_accuracy": 0.9571869820356369,
"num_tokens": 3496909.0,
"step": 396
},
{
"entropy": 1.1606856882572174,
"epoch": 1.4343891402714932,
"grad_norm": 0.41998282074928284,
"learning_rate": 0.0001897895902788703,
"loss": 0.06819174438714981,
"mean_token_accuracy": 0.9810739904642105,
"num_tokens": 3505970.0,
"step": 397
},
{
"entropy": 1.2330349385738373,
"epoch": 1.4380090497737557,
"grad_norm": 0.46300387382507324,
"learning_rate": 0.00018972863682511639,
"loss": 0.11061134934425354,
"mean_token_accuracy": 0.9722975939512253,
"num_tokens": 3514464.0,
"step": 398
},
{
"entropy": 1.2248220443725586,
"epoch": 1.441628959276018,
"grad_norm": 0.6571424007415771,
"learning_rate": 0.00018966751298251093,
"loss": 0.17451299726963043,
"mean_token_accuracy": 0.9536366164684296,
"num_tokens": 3523449.0,
"step": 399
},
{
"entropy": 1.2604781985282898,
"epoch": 1.4452488687782805,
"grad_norm": 0.7056911587715149,
"learning_rate": 0.00018960621888168224,
"loss": 0.1714896708726883,
"mean_token_accuracy": 0.9471757411956787,
"num_tokens": 3532403.0,
"step": 400
},
{
"entropy": 1.2510155141353607,
"epoch": 1.448868778280543,
"grad_norm": 0.582774817943573,
"learning_rate": 0.00018954475465362256,
"loss": 0.1749400794506073,
"mean_token_accuracy": 0.9718905985355377,
"num_tokens": 3541154.0,
"step": 401
},
{
"entropy": 1.3139209747314453,
"epoch": 1.4524886877828054,
"grad_norm": 0.5442278981208801,
"learning_rate": 0.00018948312042968768,
"loss": 0.14033550024032593,
"mean_token_accuracy": 0.9652996808290482,
"num_tokens": 3549472.0,
"step": 402
},
{
"entropy": 1.2959212362766266,
"epoch": 1.4561085972850678,
"grad_norm": 0.5408539175987244,
"learning_rate": 0.00018942131634159672,
"loss": 0.10644003748893738,
"mean_token_accuracy": 0.9660961031913757,
"num_tokens": 3557890.0,
"step": 403
},
{
"entropy": 1.285716027021408,
"epoch": 1.4597285067873302,
"grad_norm": 0.49533000588417053,
"learning_rate": 0.00018935934252143182,
"loss": 0.12249112129211426,
"mean_token_accuracy": 0.9659420847892761,
"num_tokens": 3566859.0,
"step": 404
},
{
"entropy": 1.270496904850006,
"epoch": 1.463348416289593,
"grad_norm": 0.4820156991481781,
"learning_rate": 0.0001892971991016378,
"loss": 0.11467836797237396,
"mean_token_accuracy": 0.9734348058700562,
"num_tokens": 3575782.0,
"step": 405
},
{
"entropy": 1.2943720519542694,
"epoch": 1.4669683257918553,
"grad_norm": 0.34380054473876953,
"learning_rate": 0.00018923488621502198,
"loss": 0.06653222441673279,
"mean_token_accuracy": 0.9807761162519455,
"num_tokens": 3584468.0,
"step": 406
},
{
"entropy": 1.3197762072086334,
"epoch": 1.4705882352941178,
"grad_norm": 0.5286886692047119,
"learning_rate": 0.00018917240399475387,
"loss": 0.1562788039445877,
"mean_token_accuracy": 0.9582570791244507,
"num_tokens": 3593198.0,
"step": 407
},
{
"entropy": 1.2224310040473938,
"epoch": 1.4742081447963802,
"grad_norm": 0.47084492444992065,
"learning_rate": 0.00018910975257436477,
"loss": 0.08669901639223099,
"mean_token_accuracy": 0.9738520681858063,
"num_tokens": 3602529.0,
"step": 408
},
{
"entropy": 1.2566787004470825,
"epoch": 1.4778280542986426,
"grad_norm": 0.5421281456947327,
"learning_rate": 0.00018904693208774773,
"loss": 0.09741362929344177,
"mean_token_accuracy": 0.9702080637216568,
"num_tokens": 3611438.0,
"step": 409
},
{
"entropy": 1.2232867777347565,
"epoch": 1.481447963800905,
"grad_norm": 0.591533899307251,
"learning_rate": 0.000188983942669157,
"loss": 0.1027684137225151,
"mean_token_accuracy": 0.9754174500703812,
"num_tokens": 3620209.0,
"step": 410
},
{
"entropy": 1.172331839799881,
"epoch": 1.4850678733031675,
"grad_norm": 0.6023557782173157,
"learning_rate": 0.00018892078445320785,
"loss": 0.19547313451766968,
"mean_token_accuracy": 0.9482788294553757,
"num_tokens": 3629483.0,
"step": 411
},
{
"entropy": 1.1692214012145996,
"epoch": 1.48868778280543,
"grad_norm": 0.4271499216556549,
"learning_rate": 0.00018885745757487633,
"loss": 0.06866015493869781,
"mean_token_accuracy": 0.9770233035087585,
"num_tokens": 3638167.0,
"step": 412
},
{
"entropy": 1.218559056520462,
"epoch": 1.4923076923076923,
"grad_norm": 0.6048818826675415,
"learning_rate": 0.00018879396216949895,
"loss": 0.1184379905462265,
"mean_token_accuracy": 0.9668450653553009,
"num_tokens": 3646941.0,
"step": 413
},
{
"entropy": 1.1115884184837341,
"epoch": 1.4959276018099548,
"grad_norm": 0.5207604765892029,
"learning_rate": 0.00018873029837277236,
"loss": 0.09107951819896698,
"mean_token_accuracy": 0.970865860581398,
"num_tokens": 3656408.0,
"step": 414
},
{
"entropy": 1.201383799314499,
"epoch": 1.4995475113122172,
"grad_norm": 0.6223848462104797,
"learning_rate": 0.0001886664663207531,
"loss": 0.14010068774223328,
"mean_token_accuracy": 0.9613067805767059,
"num_tokens": 3665223.0,
"step": 415
},
{
"entropy": 1.2117818593978882,
"epoch": 1.5031674208144796,
"grad_norm": 0.5893263220787048,
"learning_rate": 0.00018860246614985725,
"loss": 0.09732384979724884,
"mean_token_accuracy": 0.9756953567266464,
"num_tokens": 3673568.0,
"step": 416
},
{
"entropy": 1.068862423300743,
"epoch": 1.506787330316742,
"grad_norm": 0.6032145023345947,
"learning_rate": 0.0001885382979968602,
"loss": 0.16733071208000183,
"mean_token_accuracy": 0.9616134315729141,
"num_tokens": 3683588.0,
"step": 417
},
{
"entropy": 1.164185881614685,
"epoch": 1.5104072398190045,
"grad_norm": 0.8030836582183838,
"learning_rate": 0.00018847396199889638,
"loss": 0.191024512052536,
"mean_token_accuracy": 0.9492377042770386,
"num_tokens": 3692221.0,
"step": 418
},
{
"entropy": 1.2202486097812653,
"epoch": 1.514027149321267,
"grad_norm": 0.45743128657341003,
"learning_rate": 0.00018840945829345885,
"loss": 0.0803522914648056,
"mean_token_accuracy": 0.9740827530622482,
"num_tokens": 3700599.0,
"step": 419
},
{
"entropy": 1.2234172523021698,
"epoch": 1.5176470588235293,
"grad_norm": 0.6399345397949219,
"learning_rate": 0.0001883447870183991,
"loss": 0.13413016498088837,
"mean_token_accuracy": 0.9694488942623138,
"num_tokens": 3709316.0,
"step": 420
},
{
"entropy": 1.219748318195343,
"epoch": 1.5212669683257918,
"grad_norm": 0.8917973041534424,
"learning_rate": 0.00018827994831192675,
"loss": 0.14400342106819153,
"mean_token_accuracy": 0.9627624005079269,
"num_tokens": 3717808.0,
"step": 421
},
{
"entropy": 1.257588416337967,
"epoch": 1.5248868778280542,
"grad_norm": 0.61835116147995,
"learning_rate": 0.0001882149423126093,
"loss": 0.12378428876399994,
"mean_token_accuracy": 0.9704181104898453,
"num_tokens": 3726278.0,
"step": 422
},
{
"entropy": 1.2053538858890533,
"epoch": 1.5285067873303166,
"grad_norm": 0.5542409420013428,
"learning_rate": 0.0001881497691593716,
"loss": 0.0971193015575409,
"mean_token_accuracy": 0.9743164777755737,
"num_tokens": 3734935.0,
"step": 423
},
{
"entropy": 1.1871007978916168,
"epoch": 1.532126696832579,
"grad_norm": 0.3344699740409851,
"learning_rate": 0.0001880844289914959,
"loss": 0.03764911741018295,
"mean_token_accuracy": 0.9889417439699173,
"num_tokens": 3743868.0,
"step": 424
},
{
"entropy": 1.266874372959137,
"epoch": 1.5357466063348415,
"grad_norm": 0.5863392949104309,
"learning_rate": 0.0001880189219486213,
"loss": 0.12176309525966644,
"mean_token_accuracy": 0.9698539972305298,
"num_tokens": 3752414.0,
"step": 425
},
{
"entropy": 1.1402153968811035,
"epoch": 1.539366515837104,
"grad_norm": 0.4934159517288208,
"learning_rate": 0.00018795324817074354,
"loss": 0.1012497991323471,
"mean_token_accuracy": 0.9715209603309631,
"num_tokens": 3761525.0,
"step": 426
},
{
"entropy": 1.2725040912628174,
"epoch": 1.5429864253393664,
"grad_norm": 0.6184947490692139,
"learning_rate": 0.0001878874077982147,
"loss": 0.09586258977651596,
"mean_token_accuracy": 0.9663965255022049,
"num_tokens": 3769460.0,
"step": 427
},
{
"entropy": 1.1935326755046844,
"epoch": 1.5466063348416288,
"grad_norm": 0.711137056350708,
"learning_rate": 0.0001878214009717429,
"loss": 0.203983873128891,
"mean_token_accuracy": 0.9529214203357697,
"num_tokens": 3778214.0,
"step": 428
},
{
"entropy": 1.2496784329414368,
"epoch": 1.5502262443438914,
"grad_norm": 0.5968933701515198,
"learning_rate": 0.00018775522783239198,
"loss": 0.14222870767116547,
"mean_token_accuracy": 0.963072806596756,
"num_tokens": 3786842.0,
"step": 429
},
{
"entropy": 1.1680949032306671,
"epoch": 1.5538461538461539,
"grad_norm": 0.5371803641319275,
"learning_rate": 0.0001876888885215812,
"loss": 0.1057431548833847,
"mean_token_accuracy": 0.9662029445171356,
"num_tokens": 3795718.0,
"step": 430
},
{
"entropy": 1.1462930738925934,
"epoch": 1.5574660633484163,
"grad_norm": 0.4879329800605774,
"learning_rate": 0.0001876223831810849,
"loss": 0.0943731814622879,
"mean_token_accuracy": 0.9766229838132858,
"num_tokens": 3804833.0,
"step": 431
},
{
"entropy": 1.2148894369602203,
"epoch": 1.5610859728506787,
"grad_norm": 0.8211036324501038,
"learning_rate": 0.00018755571195303234,
"loss": 0.354468435049057,
"mean_token_accuracy": 0.9370880573987961,
"num_tokens": 3813804.0,
"step": 432
},
{
"entropy": 1.2123090624809265,
"epoch": 1.5647058823529412,
"grad_norm": 0.7378460168838501,
"learning_rate": 0.00018748887497990727,
"loss": 0.10816670954227448,
"mean_token_accuracy": 0.9683575332164764,
"num_tokens": 3822405.0,
"step": 433
},
{
"entropy": 1.224818378686905,
"epoch": 1.5683257918552036,
"grad_norm": 0.6279307007789612,
"learning_rate": 0.00018742187240454762,
"loss": 0.15239953994750977,
"mean_token_accuracy": 0.9667632728815079,
"num_tokens": 3830990.0,
"step": 434
},
{
"entropy": 1.2601450085639954,
"epoch": 1.571945701357466,
"grad_norm": 0.705889105796814,
"learning_rate": 0.00018735470437014523,
"loss": 0.07424315810203552,
"mean_token_accuracy": 0.9760804325342178,
"num_tokens": 3839416.0,
"step": 435
},
{
"entropy": 1.190734475851059,
"epoch": 1.5755656108597285,
"grad_norm": 0.3725827634334564,
"learning_rate": 0.00018728737102024557,
"loss": 0.06711282581090927,
"mean_token_accuracy": 0.9825543165206909,
"num_tokens": 3848851.0,
"step": 436
},
{
"entropy": 1.22418212890625,
"epoch": 1.5791855203619911,
"grad_norm": 0.46165597438812256,
"learning_rate": 0.00018721987249874746,
"loss": 0.0854751318693161,
"mean_token_accuracy": 0.9737638980150223,
"num_tokens": 3857922.0,
"step": 437
},
{
"entropy": 1.2956913709640503,
"epoch": 1.5828054298642535,
"grad_norm": 0.5615050196647644,
"learning_rate": 0.0001871522089499026,
"loss": 0.11327139288187027,
"mean_token_accuracy": 0.9696285277605057,
"num_tokens": 3866613.0,
"step": 438
},
{
"entropy": 1.2908932268619537,
"epoch": 1.586425339366516,
"grad_norm": 0.5302788615226746,
"learning_rate": 0.00018708438051831544,
"loss": 0.1113893985748291,
"mean_token_accuracy": 0.9665745049715042,
"num_tokens": 3875118.0,
"step": 439
},
{
"entropy": 1.217937409877777,
"epoch": 1.5900452488687784,
"grad_norm": 0.3633266091346741,
"learning_rate": 0.00018701638734894277,
"loss": 0.06300318986177444,
"mean_token_accuracy": 0.9804518818855286,
"num_tokens": 3884140.0,
"step": 440
},
{
"entropy": 1.246230572462082,
"epoch": 1.5936651583710408,
"grad_norm": 0.474832147359848,
"learning_rate": 0.00018694822958709346,
"loss": 0.07810146361589432,
"mean_token_accuracy": 0.9765773564577103,
"num_tokens": 3892875.0,
"step": 441
},
{
"entropy": 1.2694453299045563,
"epoch": 1.5972850678733033,
"grad_norm": 0.3883070647716522,
"learning_rate": 0.00018687990737842818,
"loss": 0.062109194695949554,
"mean_token_accuracy": 0.9821051061153412,
"num_tokens": 3901322.0,
"step": 442
},
{
"entropy": 1.2491609454154968,
"epoch": 1.6009049773755657,
"grad_norm": 0.6458966732025146,
"learning_rate": 0.0001868114208689589,
"loss": 0.16670571267604828,
"mean_token_accuracy": 0.9681277722120285,
"num_tokens": 3910066.0,
"step": 443
},
{
"entropy": 1.289526790380478,
"epoch": 1.6045248868778281,
"grad_norm": 0.498731404542923,
"learning_rate": 0.0001867427702050489,
"loss": 0.1024109423160553,
"mean_token_accuracy": 0.9770669341087341,
"num_tokens": 3918561.0,
"step": 444
},
{
"entropy": 1.3065584897994995,
"epoch": 1.6081447963800906,
"grad_norm": 0.5213361382484436,
"learning_rate": 0.00018667395553341213,
"loss": 0.10743463039398193,
"mean_token_accuracy": 0.968547523021698,
"num_tokens": 3927277.0,
"step": 445
},
{
"entropy": 1.2784769237041473,
"epoch": 1.611764705882353,
"grad_norm": 0.46484726667404175,
"learning_rate": 0.00018660497700111317,
"loss": 0.1371474266052246,
"mean_token_accuracy": 0.958917498588562,
"num_tokens": 3936569.0,
"step": 446
},
{
"entropy": 1.2938779890537262,
"epoch": 1.6153846153846154,
"grad_norm": 0.496535986661911,
"learning_rate": 0.00018653583475556663,
"loss": 0.07164958864450455,
"mean_token_accuracy": 0.9746866375207901,
"num_tokens": 3945175.0,
"step": 447
},
{
"entropy": 1.2572109401226044,
"epoch": 1.6190045248868778,
"grad_norm": 1.1173293590545654,
"learning_rate": 0.00018646652894453714,
"loss": 0.17376887798309326,
"mean_token_accuracy": 0.9627924859523773,
"num_tokens": 3953978.0,
"step": 448
},
{
"entropy": 1.2627245783805847,
"epoch": 1.6226244343891403,
"grad_norm": 0.7195766568183899,
"learning_rate": 0.00018639705971613878,
"loss": 0.1997443288564682,
"mean_token_accuracy": 0.9475196748971939,
"num_tokens": 3962897.0,
"step": 449
},
{
"entropy": 1.2522348463535309,
"epoch": 1.6262443438914027,
"grad_norm": 0.47655388712882996,
"learning_rate": 0.00018632742721883495,
"loss": 0.08064761012792587,
"mean_token_accuracy": 0.967804342508316,
"num_tokens": 3971740.0,
"step": 450
},
{
"entropy": 1.2210132479667664,
"epoch": 1.6298642533936651,
"grad_norm": 0.5680899620056152,
"learning_rate": 0.00018625763160143796,
"loss": 0.1746440976858139,
"mean_token_accuracy": 0.9542393088340759,
"num_tokens": 3980791.0,
"step": 451
},
{
"entropy": 1.276931256055832,
"epoch": 1.6334841628959276,
"grad_norm": 0.5334168076515198,
"learning_rate": 0.0001861876730131087,
"loss": 0.10449045896530151,
"mean_token_accuracy": 0.9746640473604202,
"num_tokens": 3990061.0,
"step": 452
},
{
"entropy": 1.3530596196651459,
"epoch": 1.63710407239819,
"grad_norm": 0.654348611831665,
"learning_rate": 0.00018611755160335633,
"loss": 0.13995029032230377,
"mean_token_accuracy": 0.9673926830291748,
"num_tokens": 3998360.0,
"step": 453
},
{
"entropy": 1.356580764055252,
"epoch": 1.6407239819004524,
"grad_norm": 0.7779679298400879,
"learning_rate": 0.000186047267522038,
"loss": 0.3456251621246338,
"mean_token_accuracy": 0.9260966181755066,
"num_tokens": 4007104.0,
"step": 454
},
{
"entropy": 1.2545486092567444,
"epoch": 1.6443438914027149,
"grad_norm": 0.49409937858581543,
"learning_rate": 0.00018597682091935856,
"loss": 0.13179728388786316,
"mean_token_accuracy": 0.9634029120206833,
"num_tokens": 4016114.0,
"step": 455
},
{
"entropy": 1.2673145532608032,
"epoch": 1.6479638009049773,
"grad_norm": 0.4457748532295227,
"learning_rate": 0.00018590621194587007,
"loss": 0.12703005969524384,
"mean_token_accuracy": 0.9695380330085754,
"num_tokens": 4025108.0,
"step": 456
},
{
"entropy": 1.2589110732078552,
"epoch": 1.6515837104072397,
"grad_norm": 0.5149383544921875,
"learning_rate": 0.0001858354407524717,
"loss": 0.14232893288135529,
"mean_token_accuracy": 0.9638843387365341,
"num_tokens": 4034230.0,
"step": 457
},
{
"entropy": 1.3335690796375275,
"epoch": 1.6552036199095022,
"grad_norm": 0.4933992326259613,
"learning_rate": 0.00018576450749040925,
"loss": 0.09372726082801819,
"mean_token_accuracy": 0.9729661494493484,
"num_tokens": 4043163.0,
"step": 458
},
{
"entropy": 1.2553574740886688,
"epoch": 1.6588235294117646,
"grad_norm": 0.426299124956131,
"learning_rate": 0.0001856934123112749,
"loss": 0.06914810836315155,
"mean_token_accuracy": 0.9822471588850021,
"num_tokens": 4051766.0,
"step": 459
},
{
"entropy": 1.2507081627845764,
"epoch": 1.662443438914027,
"grad_norm": 0.5942637324333191,
"learning_rate": 0.00018562215536700684,
"loss": 0.09914569556713104,
"mean_token_accuracy": 0.9764914512634277,
"num_tokens": 4061075.0,
"step": 460
},
{
"entropy": 1.2598736882209778,
"epoch": 1.6660633484162894,
"grad_norm": 0.43723762035369873,
"learning_rate": 0.000185550736809889,
"loss": 0.10020029544830322,
"mean_token_accuracy": 0.9784475862979889,
"num_tokens": 4069722.0,
"step": 461
},
{
"entropy": 1.25190868973732,
"epoch": 1.6696832579185519,
"grad_norm": 0.42693883180618286,
"learning_rate": 0.00018547915679255063,
"loss": 0.05682477727532387,
"mean_token_accuracy": 0.9844070225954056,
"num_tokens": 4078483.0,
"step": 462
},
{
"entropy": 1.231699526309967,
"epoch": 1.6733031674208145,
"grad_norm": 0.526006817817688,
"learning_rate": 0.00018540741546796616,
"loss": 0.0700770914554596,
"mean_token_accuracy": 0.9812077730894089,
"num_tokens": 4087298.0,
"step": 463
},
{
"entropy": 1.1822182536125183,
"epoch": 1.676923076923077,
"grad_norm": 0.5364736914634705,
"learning_rate": 0.00018533551298945467,
"loss": 0.11393093317747116,
"mean_token_accuracy": 0.9698948115110397,
"num_tokens": 4096459.0,
"step": 464
},
{
"entropy": 1.1950626969337463,
"epoch": 1.6805429864253394,
"grad_norm": 0.6466018557548523,
"learning_rate": 0.00018526344951067957,
"loss": 0.17861007153987885,
"mean_token_accuracy": 0.9639775156974792,
"num_tokens": 4105457.0,
"step": 465
},
{
"entropy": 1.1978155076503754,
"epoch": 1.6841628959276018,
"grad_norm": 0.4935831129550934,
"learning_rate": 0.00018519122518564853,
"loss": 0.08525866270065308,
"mean_token_accuracy": 0.9758468270301819,
"num_tokens": 4114038.0,
"step": 466
},
{
"entropy": 1.1613716185092926,
"epoch": 1.6877828054298643,
"grad_norm": 0.41199934482574463,
"learning_rate": 0.0001851188401687128,
"loss": 0.07017679512500763,
"mean_token_accuracy": 0.9776453971862793,
"num_tokens": 4123007.0,
"step": 467
},
{
"entropy": 1.1513382196426392,
"epoch": 1.6914027149321267,
"grad_norm": 0.5673828721046448,
"learning_rate": 0.00018504629461456716,
"loss": 0.09683945775032043,
"mean_token_accuracy": 0.9727943688631058,
"num_tokens": 4131925.0,
"step": 468
},
{
"entropy": 1.210301250219345,
"epoch": 1.6950226244343891,
"grad_norm": 0.5047227144241333,
"learning_rate": 0.00018497358867824933,
"loss": 0.08428950607776642,
"mean_token_accuracy": 0.9737090021371841,
"num_tokens": 4140851.0,
"step": 469
},
{
"entropy": 1.3032833933830261,
"epoch": 1.6986425339366515,
"grad_norm": 0.6038223505020142,
"learning_rate": 0.00018490072251513997,
"loss": 0.06815248727798462,
"mean_token_accuracy": 0.9779106676578522,
"num_tokens": 4148989.0,
"step": 470
},
{
"entropy": 1.23912513256073,
"epoch": 1.702262443438914,
"grad_norm": 0.6200883388519287,
"learning_rate": 0.00018482769628096207,
"loss": 0.08937155455350876,
"mean_token_accuracy": 0.9763010591268539,
"num_tokens": 4157517.0,
"step": 471
},
{
"entropy": 1.186729907989502,
"epoch": 1.7058823529411766,
"grad_norm": 0.5706359148025513,
"learning_rate": 0.00018475451013178062,
"loss": 0.1340227574110031,
"mean_token_accuracy": 0.9582885801792145,
"num_tokens": 4166422.0,
"step": 472
},
{
"entropy": 1.2165184915065765,
"epoch": 1.709502262443439,
"grad_norm": 0.4458298981189728,
"learning_rate": 0.00018468116422400258,
"loss": 0.06072482466697693,
"mean_token_accuracy": 0.9787209331989288,
"num_tokens": 4174473.0,
"step": 473
},
{
"entropy": 1.253999799489975,
"epoch": 1.7131221719457015,
"grad_norm": 0.5249391794204712,
"learning_rate": 0.00018460765871437614,
"loss": 0.13922284543514252,
"mean_token_accuracy": 0.9605212956666946,
"num_tokens": 4183121.0,
"step": 474
},
{
"entropy": 1.2191531360149384,
"epoch": 1.716742081447964,
"grad_norm": 0.6076857447624207,
"learning_rate": 0.0001845339937599906,
"loss": 0.12412364035844803,
"mean_token_accuracy": 0.9641828685998917,
"num_tokens": 4192043.0,
"step": 475
},
{
"entropy": 1.1414133310317993,
"epoch": 1.7203619909502263,
"grad_norm": 0.5084185004234314,
"learning_rate": 0.00018446016951827619,
"loss": 0.08176974952220917,
"mean_token_accuracy": 0.9742649793624878,
"num_tokens": 4201387.0,
"step": 476
},
{
"entropy": 1.2488122284412384,
"epoch": 1.7239819004524888,
"grad_norm": 0.5439050793647766,
"learning_rate": 0.0001843861861470033,
"loss": 0.12782233953475952,
"mean_token_accuracy": 0.9623425304889679,
"num_tokens": 4210216.0,
"step": 477
},
{
"entropy": 1.1921941936016083,
"epoch": 1.7276018099547512,
"grad_norm": 0.6951958537101746,
"learning_rate": 0.00018431204380428258,
"loss": 0.1784716546535492,
"mean_token_accuracy": 0.9505428522825241,
"num_tokens": 4219386.0,
"step": 478
},
{
"entropy": 1.2704735100269318,
"epoch": 1.7312217194570136,
"grad_norm": 0.4298340678215027,
"learning_rate": 0.00018423774264856433,
"loss": 0.07889077067375183,
"mean_token_accuracy": 0.9754424393177032,
"num_tokens": 4228012.0,
"step": 479
},
{
"entropy": 1.2802889347076416,
"epoch": 1.734841628959276,
"grad_norm": 0.6962547898292542,
"learning_rate": 0.00018416328283863827,
"loss": 0.10208003222942352,
"mean_token_accuracy": 0.9654501229524612,
"num_tokens": 4236361.0,
"step": 480
},
{
"entropy": 1.2991151809692383,
"epoch": 1.7384615384615385,
"grad_norm": 0.5058232545852661,
"learning_rate": 0.00018408866453363326,
"loss": 0.09221667051315308,
"mean_token_accuracy": 0.9807891100645065,
"num_tokens": 4244839.0,
"step": 481
},
{
"entropy": 1.2336131632328033,
"epoch": 1.742081447963801,
"grad_norm": 0.43272894620895386,
"learning_rate": 0.0001840138878930167,
"loss": 0.06068682670593262,
"mean_token_accuracy": 0.9835522323846817,
"num_tokens": 4253448.0,
"step": 482
},
{
"entropy": 1.1949119865894318,
"epoch": 1.7457013574660634,
"grad_norm": 0.49109941720962524,
"learning_rate": 0.00018393895307659456,
"loss": 0.11378560215234756,
"mean_token_accuracy": 0.9686966389417648,
"num_tokens": 4262859.0,
"step": 483
},
{
"entropy": 1.3476176857948303,
"epoch": 1.7493212669683258,
"grad_norm": 0.8294044733047485,
"learning_rate": 0.00018386386024451076,
"loss": 0.2184215486049652,
"mean_token_accuracy": 0.9302650094032288,
"num_tokens": 4271081.0,
"step": 484
},
{
"entropy": 1.2542327046394348,
"epoch": 1.7529411764705882,
"grad_norm": 0.5723432898521423,
"learning_rate": 0.0001837886095572469,
"loss": 0.12125033140182495,
"mean_token_accuracy": 0.9635651111602783,
"num_tokens": 4280186.0,
"step": 485
},
{
"entropy": 1.3042361438274384,
"epoch": 1.7565610859728507,
"grad_norm": 0.7513923645019531,
"learning_rate": 0.00018371320117562199,
"loss": 0.22429926693439484,
"mean_token_accuracy": 0.9487362802028656,
"num_tokens": 4288944.0,
"step": 486
},
{
"entropy": 1.2472188770771027,
"epoch": 1.760180995475113,
"grad_norm": 0.45380401611328125,
"learning_rate": 0.000183637635260792,
"loss": 0.09528672695159912,
"mean_token_accuracy": 0.972507655620575,
"num_tokens": 4297837.0,
"step": 487
},
{
"entropy": 1.2608753442764282,
"epoch": 1.7638009049773755,
"grad_norm": 0.4613839089870453,
"learning_rate": 0.00018356191197424964,
"loss": 0.12640029191970825,
"mean_token_accuracy": 0.9590530246496201,
"num_tokens": 4306763.0,
"step": 488
},
{
"entropy": 1.3038392961025238,
"epoch": 1.767420814479638,
"grad_norm": 0.484052836894989,
"learning_rate": 0.0001834860314778238,
"loss": 0.11759282648563385,
"mean_token_accuracy": 0.9666974991559982,
"num_tokens": 4315329.0,
"step": 489
},
{
"entropy": 1.2559982240200043,
"epoch": 1.7710407239819004,
"grad_norm": 0.3966261148452759,
"learning_rate": 0.00018340999393367952,
"loss": 0.060240764170885086,
"mean_token_accuracy": 0.9834811985492706,
"num_tokens": 4324074.0,
"step": 490
},
{
"entropy": 1.2651242315769196,
"epoch": 1.7746606334841628,
"grad_norm": 0.5276649594306946,
"learning_rate": 0.00018333379950431734,
"loss": 0.12336815893650055,
"mean_token_accuracy": 0.963963121175766,
"num_tokens": 4332893.0,
"step": 491
},
{
"entropy": 1.340283215045929,
"epoch": 1.7782805429864252,
"grad_norm": 0.7897153496742249,
"learning_rate": 0.0001832574483525731,
"loss": 0.47839581966400146,
"mean_token_accuracy": 0.9122047275304794,
"num_tokens": 4342135.0,
"step": 492
},
{
"entropy": 1.275327444076538,
"epoch": 1.7819004524886877,
"grad_norm": 0.5885195732116699,
"learning_rate": 0.00018318094064161765,
"loss": 0.15483446419239044,
"mean_token_accuracy": 0.962849572300911,
"num_tokens": 4351087.0,
"step": 493
},
{
"entropy": 1.3376893401145935,
"epoch": 1.78552036199095,
"grad_norm": 0.3955666124820709,
"learning_rate": 0.00018310427653495632,
"loss": 0.057708803564310074,
"mean_token_accuracy": 0.9868191480636597,
"num_tokens": 4359453.0,
"step": 494
},
{
"entropy": 1.2282173037528992,
"epoch": 1.7891402714932125,
"grad_norm": 0.4878976047039032,
"learning_rate": 0.00018302745619642874,
"loss": 0.09314609318971634,
"mean_token_accuracy": 0.9711224585771561,
"num_tokens": 4368135.0,
"step": 495
},
{
"entropy": 1.215164691209793,
"epoch": 1.792760180995475,
"grad_norm": 0.5175027847290039,
"learning_rate": 0.00018295047979020843,
"loss": 0.09527207911014557,
"mean_token_accuracy": 0.970024898648262,
"num_tokens": 4377547.0,
"step": 496
},
{
"entropy": 1.2793397009372711,
"epoch": 1.7963800904977374,
"grad_norm": 0.39276501536369324,
"learning_rate": 0.00018287334748080236,
"loss": 0.051972195506095886,
"mean_token_accuracy": 0.9798386096954346,
"num_tokens": 4386400.0,
"step": 497
},
{
"entropy": 1.2163749635219574,
"epoch": 1.8,
"grad_norm": 0.679814875125885,
"learning_rate": 0.00018279605943305084,
"loss": 0.17564961314201355,
"mean_token_accuracy": 0.9553549140691757,
"num_tokens": 4395216.0,
"step": 498
},
{
"entropy": 1.1838999092578888,
"epoch": 1.8036199095022625,
"grad_norm": 0.4638931155204773,
"learning_rate": 0.00018271861581212686,
"loss": 0.14034540951251984,
"mean_token_accuracy": 0.9645161777734756,
"num_tokens": 4404951.0,
"step": 499
},
{
"entropy": 1.3320802450180054,
"epoch": 1.807239819004525,
"grad_norm": 0.5198440551757812,
"learning_rate": 0.00018264101678353592,
"loss": 0.10376127064228058,
"mean_token_accuracy": 0.9734873324632645,
"num_tokens": 4413295.0,
"step": 500
},
{
"entropy": 1.3872873187065125,
"epoch": 1.8108597285067873,
"grad_norm": 0.6447911858558655,
"learning_rate": 0.00018256326251311572,
"loss": 0.14052145183086395,
"mean_token_accuracy": 0.9612767547369003,
"num_tokens": 4421685.0,
"step": 501
},
{
"entropy": 1.3084447383880615,
"epoch": 1.8144796380090498,
"grad_norm": 0.40712690353393555,
"learning_rate": 0.0001824853531670356,
"loss": 0.06982739269733429,
"mean_token_accuracy": 0.9787922501564026,
"num_tokens": 4430505.0,
"step": 502
},
{
"entropy": 1.2643664479255676,
"epoch": 1.8180995475113122,
"grad_norm": 0.5247007012367249,
"learning_rate": 0.00018240728891179647,
"loss": 0.13731859624385834,
"mean_token_accuracy": 0.9635806679725647,
"num_tokens": 4439489.0,
"step": 503
},
{
"entropy": 1.290818691253662,
"epoch": 1.8217194570135746,
"grad_norm": 0.5838543772697449,
"learning_rate": 0.00018232906991423015,
"loss": 0.14157697558403015,
"mean_token_accuracy": 0.9528656303882599,
"num_tokens": 4448345.0,
"step": 504
},
{
"entropy": 1.268191635608673,
"epoch": 1.825339366515837,
"grad_norm": 0.5802812576293945,
"learning_rate": 0.00018225069634149929,
"loss": 0.09528884291648865,
"mean_token_accuracy": 0.9706045240163803,
"num_tokens": 4456917.0,
"step": 505
},
{
"entropy": 1.3473228812217712,
"epoch": 1.8289592760180997,
"grad_norm": 0.4340367913246155,
"learning_rate": 0.0001821721683610968,
"loss": 0.10973142832517624,
"mean_token_accuracy": 0.9727098494768143,
"num_tokens": 4465709.0,
"step": 506
},
{
"entropy": 1.3878930509090424,
"epoch": 1.8325791855203621,
"grad_norm": 0.6490364074707031,
"learning_rate": 0.00018209348614084552,
"loss": 0.16322913765907288,
"mean_token_accuracy": 0.9474020302295685,
"num_tokens": 4474226.0,
"step": 507
},
{
"entropy": 1.2699617445468903,
"epoch": 1.8361990950226246,
"grad_norm": 0.3948211371898651,
"learning_rate": 0.0001820146498488981,
"loss": 0.06969399005174637,
"mean_token_accuracy": 0.9818782657384872,
"num_tokens": 4483339.0,
"step": 508
},
{
"entropy": 1.30024915933609,
"epoch": 1.839819004524887,
"grad_norm": 0.5742026567459106,
"learning_rate": 0.0001819356596537363,
"loss": 0.17156578600406647,
"mean_token_accuracy": 0.9574418365955353,
"num_tokens": 4491595.0,
"step": 509
},
{
"entropy": 1.1884125173091888,
"epoch": 1.8434389140271494,
"grad_norm": 0.9810559749603271,
"learning_rate": 0.00018185651572417082,
"loss": 0.10991943627595901,
"mean_token_accuracy": 0.9672488421201706,
"num_tokens": 4500876.0,
"step": 510
},
{
"entropy": 1.2843712866306305,
"epoch": 1.8470588235294119,
"grad_norm": 0.6281305551528931,
"learning_rate": 0.00018177721822934097,
"loss": 0.2122631072998047,
"mean_token_accuracy": 0.951048418879509,
"num_tokens": 4509761.0,
"step": 511
},
{
"entropy": 1.3060316145420074,
"epoch": 1.8506787330316743,
"grad_norm": 0.41631197929382324,
"learning_rate": 0.00018169776733871422,
"loss": 0.06948904693126678,
"mean_token_accuracy": 0.9814287573099136,
"num_tokens": 4518518.0,
"step": 512
},
{
"entropy": 1.3802857398986816,
"epoch": 1.8542986425339367,
"grad_norm": 0.6457367539405823,
"learning_rate": 0.0001816181632220858,
"loss": 0.13644376397132874,
"mean_token_accuracy": 0.9505161345005035,
"num_tokens": 4527031.0,
"step": 513
},
{
"entropy": 1.2905277013778687,
"epoch": 1.8579185520361992,
"grad_norm": 0.4269099831581116,
"learning_rate": 0.00018153840604957845,
"loss": 0.09844273328781128,
"mean_token_accuracy": 0.976461797952652,
"num_tokens": 4535711.0,
"step": 514
},
{
"entropy": 1.261955976486206,
"epoch": 1.8615384615384616,
"grad_norm": 0.4700486660003662,
"learning_rate": 0.00018145849599164205,
"loss": 0.09094507992267609,
"mean_token_accuracy": 0.9761765152215958,
"num_tokens": 4544533.0,
"step": 515
},
{
"entropy": 1.31758314371109,
"epoch": 1.865158371040724,
"grad_norm": 0.6493532061576843,
"learning_rate": 0.00018137843321905316,
"loss": 0.1887962967157364,
"mean_token_accuracy": 0.952616810798645,
"num_tokens": 4553242.0,
"step": 516
},
{
"entropy": 1.2938291728496552,
"epoch": 1.8687782805429864,
"grad_norm": 0.37741488218307495,
"learning_rate": 0.00018129821790291464,
"loss": 0.08777758479118347,
"mean_token_accuracy": 0.9782676845788956,
"num_tokens": 4561885.0,
"step": 517
},
{
"entropy": 1.3216747641563416,
"epoch": 1.8723981900452489,
"grad_norm": 0.632188618183136,
"learning_rate": 0.00018121785021465552,
"loss": 0.1349005103111267,
"mean_token_accuracy": 0.9619114547967911,
"num_tokens": 4570513.0,
"step": 518
},
{
"entropy": 1.342225968837738,
"epoch": 1.8760180995475113,
"grad_norm": 0.6463941335678101,
"learning_rate": 0.00018113733032603036,
"loss": 0.3203854560852051,
"mean_token_accuracy": 0.9548316597938538,
"num_tokens": 4579275.0,
"step": 519
},
{
"entropy": 1.2357032895088196,
"epoch": 1.8796380090497737,
"grad_norm": 0.43039470911026,
"learning_rate": 0.00018105665840911894,
"loss": 0.04711674526333809,
"mean_token_accuracy": 0.983293205499649,
"num_tokens": 4588329.0,
"step": 520
},
{
"entropy": 1.2061834037303925,
"epoch": 1.8832579185520362,
"grad_norm": 0.4798518419265747,
"learning_rate": 0.00018097583463632606,
"loss": 0.09780099987983704,
"mean_token_accuracy": 0.9670642167329788,
"num_tokens": 4597226.0,
"step": 521
},
{
"entropy": 1.267607867717743,
"epoch": 1.8868778280542986,
"grad_norm": 0.9377403855323792,
"learning_rate": 0.000180894859180381,
"loss": 0.19476808607578278,
"mean_token_accuracy": 0.95573590695858,
"num_tokens": 4605908.0,
"step": 522
},
{
"entropy": 1.140032321214676,
"epoch": 1.890497737556561,
"grad_norm": 0.4788985848426819,
"learning_rate": 0.00018081373221433717,
"loss": 0.1614486277103424,
"mean_token_accuracy": 0.9491380751132965,
"num_tokens": 4615514.0,
"step": 523
},
{
"entropy": 1.2712955176830292,
"epoch": 1.8941176470588235,
"grad_norm": 0.6569368243217468,
"learning_rate": 0.00018073245391157184,
"loss": 0.18930307030677795,
"mean_token_accuracy": 0.9584528356790543,
"num_tokens": 4624496.0,
"step": 524
},
{
"entropy": 1.2802537977695465,
"epoch": 1.897737556561086,
"grad_norm": 0.646899402141571,
"learning_rate": 0.00018065102444578566,
"loss": 0.292553186416626,
"mean_token_accuracy": 0.9424069970846176,
"num_tokens": 4633712.0,
"step": 525
},
{
"entropy": 1.2484091222286224,
"epoch": 1.9013574660633483,
"grad_norm": 0.5471976399421692,
"learning_rate": 0.0001805694439910023,
"loss": 0.07856341451406479,
"mean_token_accuracy": 0.9713937491178513,
"num_tokens": 4642625.0,
"step": 526
},
{
"entropy": 1.3004970252513885,
"epoch": 1.9049773755656108,
"grad_norm": 0.6762946248054504,
"learning_rate": 0.00018048771272156821,
"loss": 0.20788748562335968,
"mean_token_accuracy": 0.9558361321687698,
"num_tokens": 4651212.0,
"step": 527
},
{
"entropy": 1.2837709486484528,
"epoch": 1.9085972850678732,
"grad_norm": 0.5465518236160278,
"learning_rate": 0.00018040583081215206,
"loss": 0.06877493858337402,
"mean_token_accuracy": 0.9773479402065277,
"num_tokens": 4659999.0,
"step": 528
},
{
"entropy": 1.1701121926307678,
"epoch": 1.9122171945701356,
"grad_norm": 0.48923078179359436,
"learning_rate": 0.00018032379843774442,
"loss": 0.10492908954620361,
"mean_token_accuracy": 0.9667482525110245,
"num_tokens": 4669539.0,
"step": 529
},
{
"entropy": 1.265353113412857,
"epoch": 1.915837104072398,
"grad_norm": 0.5817871689796448,
"learning_rate": 0.0001802416157736576,
"loss": 0.1619410663843155,
"mean_token_accuracy": 0.9500089585781097,
"num_tokens": 4678597.0,
"step": 530
},
{
"entropy": 1.3536691665649414,
"epoch": 1.9194570135746605,
"grad_norm": 0.420358806848526,
"learning_rate": 0.0001801592829955249,
"loss": 0.05708397924900055,
"mean_token_accuracy": 0.9855902940034866,
"num_tokens": 4686961.0,
"step": 531
},
{
"entropy": 1.211147278547287,
"epoch": 1.9230769230769231,
"grad_norm": 0.6017248034477234,
"learning_rate": 0.00018007680027930053,
"loss": 0.14558392763137817,
"mean_token_accuracy": 0.957173079252243,
"num_tokens": 4696021.0,
"step": 532
},
{
"entropy": 1.2624102234840393,
"epoch": 1.9266968325791856,
"grad_norm": 0.409318208694458,
"learning_rate": 0.00017999416780125908,
"loss": 0.0590839758515358,
"mean_token_accuracy": 0.9783227145671844,
"num_tokens": 4704751.0,
"step": 533
},
{
"entropy": 1.2459297180175781,
"epoch": 1.930316742081448,
"grad_norm": 0.4089992046356201,
"learning_rate": 0.0001799113857379953,
"loss": 0.09553386270999908,
"mean_token_accuracy": 0.9749322384595871,
"num_tokens": 4713896.0,
"step": 534
},
{
"entropy": 1.23102468252182,
"epoch": 1.9339366515837104,
"grad_norm": 0.3723222315311432,
"learning_rate": 0.00017982845426642348,
"loss": 0.09167241305112839,
"mean_token_accuracy": 0.9732823669910431,
"num_tokens": 4723046.0,
"step": 535
},
{
"entropy": 1.3357162177562714,
"epoch": 1.9375565610859729,
"grad_norm": 0.48041215538978577,
"learning_rate": 0.00017974537356377733,
"loss": 0.09750382602214813,
"mean_token_accuracy": 0.9764254987239838,
"num_tokens": 4731993.0,
"step": 536
},
{
"entropy": 1.1988002955913544,
"epoch": 1.9411764705882353,
"grad_norm": 0.5414320826530457,
"learning_rate": 0.00017966214380760938,
"loss": 0.14568619430065155,
"mean_token_accuracy": 0.9629835188388824,
"num_tokens": 4741245.0,
"step": 537
},
{
"entropy": 1.2370244562625885,
"epoch": 1.9447963800904977,
"grad_norm": 0.45901384949684143,
"learning_rate": 0.00017957876517579076,
"loss": 0.0786663368344307,
"mean_token_accuracy": 0.9746132791042328,
"num_tokens": 4750000.0,
"step": 538
},
{
"entropy": 1.2960913479328156,
"epoch": 1.9484162895927601,
"grad_norm": 0.5607526898384094,
"learning_rate": 0.00017949523784651085,
"loss": 0.19930651783943176,
"mean_token_accuracy": 0.9484933167695999,
"num_tokens": 4758980.0,
"step": 539
},
{
"entropy": 1.3163366615772247,
"epoch": 1.9520361990950226,
"grad_norm": 0.4517749845981598,
"learning_rate": 0.00017941156199827664,
"loss": 0.05434092879295349,
"mean_token_accuracy": 0.9835511595010757,
"num_tokens": 4767348.0,
"step": 540
},
{
"entropy": 1.2590799927711487,
"epoch": 1.9556561085972852,
"grad_norm": 0.7361325025558472,
"learning_rate": 0.00017932773780991262,
"loss": 0.15642693638801575,
"mean_token_accuracy": 0.9532411098480225,
"num_tokens": 4776468.0,
"step": 541
},
{
"entropy": 1.2585523426532745,
"epoch": 1.9592760180995477,
"grad_norm": 0.7297177314758301,
"learning_rate": 0.00017924376546056035,
"loss": 0.18437303602695465,
"mean_token_accuracy": 0.9587585777044296,
"num_tokens": 4785492.0,
"step": 542
},
{
"entropy": 1.238587647676468,
"epoch": 1.96289592760181,
"grad_norm": 0.7112489342689514,
"learning_rate": 0.00017915964512967784,
"loss": 0.1504441499710083,
"mean_token_accuracy": 0.9600091278553009,
"num_tokens": 4794274.0,
"step": 543
},
{
"entropy": 1.2908321619033813,
"epoch": 1.9665158371040725,
"grad_norm": 0.6113592386245728,
"learning_rate": 0.00017907537699703955,
"loss": 0.11070753633975983,
"mean_token_accuracy": 0.9691345393657684,
"num_tokens": 4802874.0,
"step": 544
},
{
"entropy": 1.1804803311824799,
"epoch": 1.970135746606335,
"grad_norm": 0.5324094295501709,
"learning_rate": 0.00017899096124273576,
"loss": 0.10199148207902908,
"mean_token_accuracy": 0.9748995751142502,
"num_tokens": 4812257.0,
"step": 545
},
{
"entropy": 1.2372592389583588,
"epoch": 1.9737556561085974,
"grad_norm": 0.34164050221443176,
"learning_rate": 0.00017890639804717215,
"loss": 0.055172618478536606,
"mean_token_accuracy": 0.9877361357212067,
"num_tokens": 4821231.0,
"step": 546
},
{
"entropy": 1.2872498035430908,
"epoch": 1.9773755656108598,
"grad_norm": 0.5182923674583435,
"learning_rate": 0.00017882168759106957,
"loss": 0.10375868529081345,
"mean_token_accuracy": 0.9739436358213425,
"num_tokens": 4829899.0,
"step": 547
},
{
"entropy": 1.1443316638469696,
"epoch": 1.9809954751131222,
"grad_norm": 0.5763622522354126,
"learning_rate": 0.00017873683005546358,
"loss": 0.13787207007408142,
"mean_token_accuracy": 0.9625149518251419,
"num_tokens": 4839358.0,
"step": 548
},
{
"entropy": 1.1885304749011993,
"epoch": 1.9846153846153847,
"grad_norm": 0.734394907951355,
"learning_rate": 0.00017865182562170403,
"loss": 0.14687997102737427,
"mean_token_accuracy": 0.9644728451967239,
"num_tokens": 4848291.0,
"step": 549
},
{
"entropy": 1.25540229678154,
"epoch": 1.988235294117647,
"grad_norm": 0.609951913356781,
"learning_rate": 0.00017856667447145475,
"loss": 0.10373552143573761,
"mean_token_accuracy": 0.9681924432516098,
"num_tokens": 4856839.0,
"step": 550
},
{
"entropy": 1.2945852875709534,
"epoch": 1.9918552036199095,
"grad_norm": 0.9161359071731567,
"learning_rate": 0.00017848137678669307,
"loss": 0.1577455848455429,
"mean_token_accuracy": 0.963507279753685,
"num_tokens": 4865115.0,
"step": 551
},
{
"entropy": 1.208385318517685,
"epoch": 1.995475113122172,
"grad_norm": 0.5070799589157104,
"learning_rate": 0.00017839593274970953,
"loss": 0.14434640109539032,
"mean_token_accuracy": 0.9649564027786255,
"num_tokens": 4874000.0,
"step": 552
},
{
"entropy": 1.2376223504543304,
"epoch": 1.9990950226244344,
"grad_norm": 0.539681077003479,
"learning_rate": 0.00017831034254310748,
"loss": 0.1051136925816536,
"mean_token_accuracy": 0.9742441177368164,
"num_tokens": 4882778.0,
"step": 553
},
{
"entropy": 1.4333502054214478,
"epoch": 2.0,
"grad_norm": 4.310206890106201,
"learning_rate": 0.00017822460634980245,
"loss": 0.2112778276205063,
"mean_token_accuracy": 0.9484536051750183,
"num_tokens": 4883450.0,
"step": 554
},
{
"epoch": 2.0,
"eval_entropy": 1.2949430923151777,
"eval_loss": 0.1359202116727829,
"eval_mean_token_accuracy": 0.9644398606889616,
"eval_num_tokens": 4883450.0,
"eval_runtime": 31.7669,
"eval_samples_per_second": 11.616,
"eval_steps_per_second": 3.872,
"step": 554
},
{
"entropy": 1.267829716205597,
"epoch": 2.0036199095022624,
"grad_norm": 0.5096814632415771,
"learning_rate": 0.00017813872435302222,
"loss": 0.0520174577832222,
"mean_token_accuracy": 0.9827230721712112,
"num_tokens": 4892530.0,
"step": 555
},
{
"entropy": 1.336664855480194,
"epoch": 2.007239819004525,
"grad_norm": 0.648607075214386,
"learning_rate": 0.000178052696736306,
"loss": 0.12705332040786743,
"mean_token_accuracy": 0.9673191756010056,
"num_tokens": 4900989.0,
"step": 556
},
{
"entropy": 1.323121964931488,
"epoch": 2.0108597285067873,
"grad_norm": 0.5979547500610352,
"learning_rate": 0.00017796652368350422,
"loss": 0.13934022188186646,
"mean_token_accuracy": 0.9531149715185165,
"num_tokens": 4909811.0,
"step": 557
},
{
"entropy": 1.27756667137146,
"epoch": 2.0144796380090497,
"grad_norm": 0.40110835433006287,
"learning_rate": 0.00017788020537877822,
"loss": 0.06965979188680649,
"mean_token_accuracy": 0.9806021302938461,
"num_tokens": 4918656.0,
"step": 558
},
{
"entropy": 1.2799800336360931,
"epoch": 2.018099547511312,
"grad_norm": 0.4222742021083832,
"learning_rate": 0.00017779374200659967,
"loss": 0.06322097778320312,
"mean_token_accuracy": 0.9810404479503632,
"num_tokens": 4927468.0,
"step": 559
},
{
"entropy": 1.2439637184143066,
"epoch": 2.0217194570135746,
"grad_norm": 0.4971936047077179,
"learning_rate": 0.00017770713375175027,
"loss": 0.09368139505386353,
"mean_token_accuracy": 0.9739051014184952,
"num_tokens": 4936501.0,
"step": 560
},
{
"entropy": 1.2581075429916382,
"epoch": 2.025339366515837,
"grad_norm": 0.5043046474456787,
"learning_rate": 0.00017762038079932143,
"loss": 0.08156520873308182,
"mean_token_accuracy": 0.9788997769355774,
"num_tokens": 4945377.0,
"step": 561
},
{
"entropy": 1.2525224387645721,
"epoch": 2.0289592760180994,
"grad_norm": 0.47259730100631714,
"learning_rate": 0.00017753348333471368,
"loss": 0.06477752327919006,
"mean_token_accuracy": 0.9779137223958969,
"num_tokens": 4954230.0,
"step": 562
},
{
"entropy": 1.1896491348743439,
"epoch": 2.032579185520362,
"grad_norm": 0.68027663230896,
"learning_rate": 0.00017744644154363642,
"loss": 0.13440944254398346,
"mean_token_accuracy": 0.961474671959877,
"num_tokens": 4963326.0,
"step": 563
},
{
"entropy": 1.2167057394981384,
"epoch": 2.0361990950226243,
"grad_norm": 0.541343092918396,
"learning_rate": 0.0001773592556121076,
"loss": 0.11538562178611755,
"mean_token_accuracy": 0.9634369909763336,
"num_tokens": 4972140.0,
"step": 564
},
{
"entropy": 1.2127482295036316,
"epoch": 2.0398190045248867,
"grad_norm": 0.6436272263526917,
"learning_rate": 0.00017727192572645307,
"loss": 0.12803298234939575,
"mean_token_accuracy": 0.9709215462207794,
"num_tokens": 4981091.0,
"step": 565
},
{
"entropy": 1.2022112607955933,
"epoch": 2.043438914027149,
"grad_norm": 0.5350651144981384,
"learning_rate": 0.0001771844520733064,
"loss": 0.06252528727054596,
"mean_token_accuracy": 0.9814591854810715,
"num_tokens": 4989755.0,
"step": 566
},
{
"entropy": 1.1322139203548431,
"epoch": 2.0470588235294116,
"grad_norm": 0.4417763650417328,
"learning_rate": 0.00017709683483960837,
"loss": 0.057584479451179504,
"mean_token_accuracy": 0.9763814359903336,
"num_tokens": 4998071.0,
"step": 567
},
{
"entropy": 1.1674880683422089,
"epoch": 2.050678733031674,
"grad_norm": 0.8559631109237671,
"learning_rate": 0.00017700907421260668,
"loss": 0.14195309579372406,
"mean_token_accuracy": 0.9630620330572128,
"num_tokens": 5006275.0,
"step": 568
},
{
"entropy": 1.0746576488018036,
"epoch": 2.0542986425339365,
"grad_norm": 0.5514897108078003,
"learning_rate": 0.00017692117037985538,
"loss": 0.10994663834571838,
"mean_token_accuracy": 0.9645248502492905,
"num_tokens": 5015941.0,
"step": 569
},
{
"entropy": 1.0583149194717407,
"epoch": 2.057918552036199,
"grad_norm": 0.8742787837982178,
"learning_rate": 0.00017683312352921463,
"loss": 0.060930777341127396,
"mean_token_accuracy": 0.9797597676515579,
"num_tokens": 5025002.0,
"step": 570
},
{
"entropy": 1.0571471750736237,
"epoch": 2.0615384615384613,
"grad_norm": 0.5724367499351501,
"learning_rate": 0.00017674493384885022,
"loss": 0.09305945783853531,
"mean_token_accuracy": 0.9714621901512146,
"num_tokens": 5034508.0,
"step": 571
},
{
"entropy": 1.0819552540779114,
"epoch": 2.065158371040724,
"grad_norm": 0.4609355032444,
"learning_rate": 0.00017665660152723319,
"loss": 0.06915001571178436,
"mean_token_accuracy": 0.9803351610898972,
"num_tokens": 5043545.0,
"step": 572
},
{
"entropy": 1.1555139124393463,
"epoch": 2.0687782805429866,
"grad_norm": 0.6659719347953796,
"learning_rate": 0.00017656812675313936,
"loss": 0.0947500616312027,
"mean_token_accuracy": 0.9700139313936234,
"num_tokens": 5052212.0,
"step": 573
},
{
"entropy": 1.0778053104877472,
"epoch": 2.072398190045249,
"grad_norm": 0.4302602708339691,
"learning_rate": 0.00017647950971564914,
"loss": 0.11282960325479507,
"mean_token_accuracy": 0.9660980701446533,
"num_tokens": 5061672.0,
"step": 574
},
{
"entropy": 1.111078679561615,
"epoch": 2.0760180995475115,
"grad_norm": 0.4093020260334015,
"learning_rate": 0.00017639075060414675,
"loss": 0.07460740208625793,
"mean_token_accuracy": 0.9758316427469254,
"num_tokens": 5071120.0,
"step": 575
},
{
"entropy": 1.1735278069972992,
"epoch": 2.079638009049774,
"grad_norm": 0.726333737373352,
"learning_rate": 0.0001763018496083202,
"loss": 0.31053513288497925,
"mean_token_accuracy": 0.9477875083684921,
"num_tokens": 5079991.0,
"step": 576
},
{
"entropy": 1.214000791311264,
"epoch": 2.0832579185520363,
"grad_norm": 0.7157718539237976,
"learning_rate": 0.00017621280691816076,
"loss": 0.14849108457565308,
"mean_token_accuracy": 0.952953040599823,
"num_tokens": 5088629.0,
"step": 577
},
{
"entropy": 1.1892287135124207,
"epoch": 2.086877828054299,
"grad_norm": 0.8626441359519958,
"learning_rate": 0.00017612362272396233,
"loss": 0.11797386407852173,
"mean_token_accuracy": 0.9675682783126831,
"num_tokens": 5097362.0,
"step": 578
},
{
"entropy": 1.2235265970230103,
"epoch": 2.090497737556561,
"grad_norm": 0.6362661123275757,
"learning_rate": 0.00017603429721632134,
"loss": 0.09463687241077423,
"mean_token_accuracy": 0.9706145972013474,
"num_tokens": 5105848.0,
"step": 579
},
{
"entropy": 1.1600496768951416,
"epoch": 2.0941176470588236,
"grad_norm": 0.3596010208129883,
"learning_rate": 0.00017594483058613625,
"loss": 0.06019435077905655,
"mean_token_accuracy": 0.9838933050632477,
"num_tokens": 5115154.0,
"step": 580
},
{
"entropy": 1.202838271856308,
"epoch": 2.097737556561086,
"grad_norm": 0.5307016372680664,
"learning_rate": 0.00017585522302460707,
"loss": 0.14253605902194977,
"mean_token_accuracy": 0.9725675880908966,
"num_tokens": 5124126.0,
"step": 581
},
{
"entropy": 1.2034749388694763,
"epoch": 2.1013574660633485,
"grad_norm": 0.5540820956230164,
"learning_rate": 0.00017576547472323501,
"loss": 0.07838793098926544,
"mean_token_accuracy": 0.9791185557842255,
"num_tokens": 5132742.0,
"step": 582
},
{
"entropy": 1.1183785498142242,
"epoch": 2.104977375565611,
"grad_norm": 0.613678514957428,
"learning_rate": 0.00017567558587382198,
"loss": 0.12506228685379028,
"mean_token_accuracy": 0.9711886942386627,
"num_tokens": 5142111.0,
"step": 583
},
{
"entropy": 1.1404469907283783,
"epoch": 2.1085972850678734,
"grad_norm": 0.5098339915275574,
"learning_rate": 0.00017558555666847037,
"loss": 0.0471160002052784,
"mean_token_accuracy": 0.9831248819828033,
"num_tokens": 5151127.0,
"step": 584
},
{
"entropy": 1.1900439262390137,
"epoch": 2.112217194570136,
"grad_norm": 0.5168299674987793,
"learning_rate": 0.00017549538729958247,
"loss": 0.1095990240573883,
"mean_token_accuracy": 0.9677491337060928,
"num_tokens": 5159888.0,
"step": 585
},
{
"entropy": 1.1944100558757782,
"epoch": 2.1158371040723982,
"grad_norm": 0.7810997366905212,
"learning_rate": 0.00017540507795986014,
"loss": 0.19430337846279144,
"mean_token_accuracy": 0.9515476375818253,
"num_tokens": 5168845.0,
"step": 586
},
{
"entropy": 1.1583697199821472,
"epoch": 2.1194570135746607,
"grad_norm": 0.45587292313575745,
"learning_rate": 0.0001753146288423043,
"loss": 0.05915957689285278,
"mean_token_accuracy": 0.9794735610485077,
"num_tokens": 5177533.0,
"step": 587
},
{
"entropy": 1.1053299605846405,
"epoch": 2.123076923076923,
"grad_norm": 0.5066711902618408,
"learning_rate": 0.00017522404014021472,
"loss": 0.07206124812364578,
"mean_token_accuracy": 0.9787262827157974,
"num_tokens": 5186820.0,
"step": 588
},
{
"entropy": 1.211129367351532,
"epoch": 2.1266968325791855,
"grad_norm": 0.6494729518890381,
"learning_rate": 0.00017513331204718934,
"loss": 0.12303437292575836,
"mean_token_accuracy": 0.9687947034835815,
"num_tokens": 5195554.0,
"step": 589
},
{
"entropy": 1.217880368232727,
"epoch": 2.130316742081448,
"grad_norm": 0.6168936491012573,
"learning_rate": 0.0001750424447571241,
"loss": 0.08860359340906143,
"mean_token_accuracy": 0.9771160632371902,
"num_tokens": 5204399.0,
"step": 590
},
{
"entropy": 1.152166485786438,
"epoch": 2.1339366515837104,
"grad_norm": 0.4032968580722809,
"learning_rate": 0.00017495143846421235,
"loss": 0.07568950951099396,
"mean_token_accuracy": 0.9729534089565277,
"num_tokens": 5213389.0,
"step": 591
},
{
"entropy": 1.15593022108078,
"epoch": 2.137556561085973,
"grad_norm": 0.4182475805282593,
"learning_rate": 0.00017486029336294455,
"loss": 0.04834774509072304,
"mean_token_accuracy": 0.9854657202959061,
"num_tokens": 5222138.0,
"step": 592
},
{
"entropy": 1.1080695390701294,
"epoch": 2.1411764705882352,
"grad_norm": 0.5492544174194336,
"learning_rate": 0.00017476900964810777,
"loss": 0.06573710590600967,
"mean_token_accuracy": 0.9790465831756592,
"num_tokens": 5231123.0,
"step": 593
},
{
"entropy": 1.0967484414577484,
"epoch": 2.1447963800904977,
"grad_norm": 0.5121078491210938,
"learning_rate": 0.00017467758751478537,
"loss": 0.061242155730724335,
"mean_token_accuracy": 0.9807155132293701,
"num_tokens": 5239859.0,
"step": 594
},
{
"entropy": 1.1613673865795135,
"epoch": 2.14841628959276,
"grad_norm": 0.5526803731918335,
"learning_rate": 0.00017458602715835644,
"loss": 0.06058318912982941,
"mean_token_accuracy": 0.978880986571312,
"num_tokens": 5248469.0,
"step": 595
},
{
"entropy": 1.141857922077179,
"epoch": 2.1520361990950225,
"grad_norm": 0.4709974229335785,
"learning_rate": 0.00017449432877449553,
"loss": 0.06889600306749344,
"mean_token_accuracy": 0.9787448197603226,
"num_tokens": 5257459.0,
"step": 596
},
{
"entropy": 1.1636947095394135,
"epoch": 2.155656108597285,
"grad_norm": 0.5113511085510254,
"learning_rate": 0.00017440249255917218,
"loss": 0.060567114502191544,
"mean_token_accuracy": 0.9818770885467529,
"num_tokens": 5266237.0,
"step": 597
},
{
"entropy": 1.1132238507270813,
"epoch": 2.1592760180995474,
"grad_norm": 0.6330764889717102,
"learning_rate": 0.00017431051870865044,
"loss": 0.07782386243343353,
"mean_token_accuracy": 0.9758347868919373,
"num_tokens": 5275498.0,
"step": 598
},
{
"entropy": 1.1733905673027039,
"epoch": 2.16289592760181,
"grad_norm": 0.6248951554298401,
"learning_rate": 0.00017421840741948852,
"loss": 0.10059335827827454,
"mean_token_accuracy": 0.9720931500196457,
"num_tokens": 5284031.0,
"step": 599
},
{
"entropy": 1.1515400111675262,
"epoch": 2.1665158371040723,
"grad_norm": 0.6329942941665649,
"learning_rate": 0.00017412615888853837,
"loss": 0.08151932805776596,
"mean_token_accuracy": 0.9802259802818298,
"num_tokens": 5292855.0,
"step": 600
},
{
"entropy": 1.141521841287613,
"epoch": 2.1701357466063347,
"grad_norm": 0.6487274765968323,
"learning_rate": 0.0001740337733129453,
"loss": 0.08144231140613556,
"mean_token_accuracy": 0.9778266698122025,
"num_tokens": 5301800.0,
"step": 601
},
{
"entropy": 1.1145538836717606,
"epoch": 2.173755656108597,
"grad_norm": 0.5885897278785706,
"learning_rate": 0.0001739412508901473,
"loss": 0.08007162064313889,
"mean_token_accuracy": 0.9757736772298813,
"num_tokens": 5310489.0,
"step": 602
},
{
"entropy": 1.0910236835479736,
"epoch": 2.1773755656108595,
"grad_norm": 0.6552841067314148,
"learning_rate": 0.00017384859181787503,
"loss": 0.08375010639429092,
"mean_token_accuracy": 0.9668237864971161,
"num_tokens": 5319658.0,
"step": 603
},
{
"entropy": 1.1157051026821136,
"epoch": 2.180995475113122,
"grad_norm": 0.510016679763794,
"learning_rate": 0.00017375579629415105,
"loss": 0.05919726938009262,
"mean_token_accuracy": 0.9795113205909729,
"num_tokens": 5328730.0,
"step": 604
},
{
"entropy": 1.1671057641506195,
"epoch": 2.184615384615385,
"grad_norm": 0.688136100769043,
"learning_rate": 0.00017366286451728967,
"loss": 0.12569661438465118,
"mean_token_accuracy": 0.9674014300107956,
"num_tokens": 5337128.0,
"step": 605
},
{
"entropy": 1.1464425325393677,
"epoch": 2.1882352941176473,
"grad_norm": 0.61946040391922,
"learning_rate": 0.00017356979668589625,
"loss": 0.08147244155406952,
"mean_token_accuracy": 0.9739107489585876,
"num_tokens": 5345906.0,
"step": 606
},
{
"entropy": 1.1685318052768707,
"epoch": 2.1918552036199097,
"grad_norm": 0.6955191493034363,
"learning_rate": 0.00017347659299886693,
"loss": 0.15282778441905975,
"mean_token_accuracy": 0.9649801999330521,
"num_tokens": 5354505.0,
"step": 607
},
{
"entropy": 1.1361754834651947,
"epoch": 2.195475113122172,
"grad_norm": 0.5646288990974426,
"learning_rate": 0.00017338325365538827,
"loss": 0.06721736490726471,
"mean_token_accuracy": 0.9789784252643585,
"num_tokens": 5363098.0,
"step": 608
},
{
"entropy": 1.1541111469268799,
"epoch": 2.1990950226244346,
"grad_norm": 0.4780377745628357,
"learning_rate": 0.0001732897788549367,
"loss": 0.06245143711566925,
"mean_token_accuracy": 0.9757596999406815,
"num_tokens": 5371853.0,
"step": 609
},
{
"entropy": 1.113056868314743,
"epoch": 2.202714932126697,
"grad_norm": 0.5598439574241638,
"learning_rate": 0.0001731961687972781,
"loss": 0.09727580845355988,
"mean_token_accuracy": 0.9673656225204468,
"num_tokens": 5380346.0,
"step": 610
},
{
"entropy": 1.1484054028987885,
"epoch": 2.2063348416289594,
"grad_norm": 0.6644022464752197,
"learning_rate": 0.00017310242368246746,
"loss": 0.1005115807056427,
"mean_token_accuracy": 0.9728731662034988,
"num_tokens": 5389310.0,
"step": 611
},
{
"entropy": 1.0054269880056381,
"epoch": 2.209954751131222,
"grad_norm": 0.954916775226593,
"learning_rate": 0.0001730085437108484,
"loss": 0.07220233231782913,
"mean_token_accuracy": 0.9803072810173035,
"num_tokens": 5398815.0,
"step": 612
},
{
"entropy": 1.0945743322372437,
"epoch": 2.2135746606334843,
"grad_norm": 0.4745389223098755,
"learning_rate": 0.00017291452908305268,
"loss": 0.05182403326034546,
"mean_token_accuracy": 0.9830630421638489,
"num_tokens": 5407661.0,
"step": 613
},
{
"entropy": 1.1867251992225647,
"epoch": 2.2171945701357467,
"grad_norm": 0.7837973237037659,
"learning_rate": 0.00017282037999999996,
"loss": 0.14014287292957306,
"mean_token_accuracy": 0.9501726627349854,
"num_tokens": 5415977.0,
"step": 614
},
{
"entropy": 1.0867418944835663,
"epoch": 2.220814479638009,
"grad_norm": 0.4593866765499115,
"learning_rate": 0.0001727260966628971,
"loss": 0.08343060314655304,
"mean_token_accuracy": 0.977153941988945,
"num_tokens": 5425295.0,
"step": 615
},
{
"entropy": 1.094106286764145,
"epoch": 2.2244343891402716,
"grad_norm": 0.31321874260902405,
"learning_rate": 0.00017263167927323794,
"loss": 0.02154790610074997,
"mean_token_accuracy": 0.9943936318159103,
"num_tokens": 5433892.0,
"step": 616
},
{
"entropy": 1.1765957176685333,
"epoch": 2.228054298642534,
"grad_norm": 0.5103018879890442,
"learning_rate": 0.00017253712803280284,
"loss": 0.07026584446430206,
"mean_token_accuracy": 0.9814851880073547,
"num_tokens": 5442650.0,
"step": 617
},
{
"entropy": 1.15475395321846,
"epoch": 2.2316742081447964,
"grad_norm": 0.46093472838401794,
"learning_rate": 0.00017244244314365822,
"loss": 0.052377212792634964,
"mean_token_accuracy": 0.9835858196020126,
"num_tokens": 5451484.0,
"step": 618
},
{
"entropy": 1.084006518125534,
"epoch": 2.235294117647059,
"grad_norm": 0.43632498383522034,
"learning_rate": 0.000172347624808156,
"loss": 0.04849786311388016,
"mean_token_accuracy": 0.9841872155666351,
"num_tokens": 5460401.0,
"step": 619
},
{
"entropy": 1.1410707533359528,
"epoch": 2.2389140271493213,
"grad_norm": 0.6720604300498962,
"learning_rate": 0.00017225267322893345,
"loss": 0.13110417127609253,
"mean_token_accuracy": 0.9632755517959595,
"num_tokens": 5469141.0,
"step": 620
},
{
"entropy": 1.132197231054306,
"epoch": 2.2425339366515837,
"grad_norm": 0.6811819672584534,
"learning_rate": 0.00017215758860891246,
"loss": 0.05728556215763092,
"mean_token_accuracy": 0.9824267625808716,
"num_tokens": 5477947.0,
"step": 621
},
{
"entropy": 1.0879136621952057,
"epoch": 2.246153846153846,
"grad_norm": 0.8642657399177551,
"learning_rate": 0.00017206237115129937,
"loss": 0.16126996278762817,
"mean_token_accuracy": 0.9610114395618439,
"num_tokens": 5486895.0,
"step": 622
},
{
"entropy": 1.074364259839058,
"epoch": 2.2497737556561086,
"grad_norm": 0.5326434373855591,
"learning_rate": 0.00017196702105958428,
"loss": 0.04988791421055794,
"mean_token_accuracy": 0.9847123473882675,
"num_tokens": 5495604.0,
"step": 623
},
{
"entropy": 1.108335942029953,
"epoch": 2.253393665158371,
"grad_norm": 0.4806241989135742,
"learning_rate": 0.00017187153853754082,
"loss": 0.08469416946172714,
"mean_token_accuracy": 0.9805634319782257,
"num_tokens": 5504355.0,
"step": 624
},
{
"entropy": 1.0444909036159515,
"epoch": 2.2570135746606335,
"grad_norm": 0.48259949684143066,
"learning_rate": 0.00017177592378922566,
"loss": 0.08314619958400726,
"mean_token_accuracy": 0.9782232642173767,
"num_tokens": 5513744.0,
"step": 625
},
{
"entropy": 1.1189175844192505,
"epoch": 2.260633484162896,
"grad_norm": 0.5330878496170044,
"learning_rate": 0.00017168017701897802,
"loss": 0.11519207060337067,
"mean_token_accuracy": 0.9642496109008789,
"num_tokens": 5522549.0,
"step": 626
},
{
"entropy": 1.0698913782835007,
"epoch": 2.2642533936651583,
"grad_norm": 0.6421942710876465,
"learning_rate": 0.0001715842984314192,
"loss": 0.08139073848724365,
"mean_token_accuracy": 0.9752504527568817,
"num_tokens": 5531666.0,
"step": 627
},
{
"entropy": 1.0454991310834885,
"epoch": 2.2678733031674208,
"grad_norm": 0.5831551551818848,
"learning_rate": 0.0001714882882314523,
"loss": 0.10700318962335587,
"mean_token_accuracy": 0.9666198194026947,
"num_tokens": 5541331.0,
"step": 628
},
{
"entropy": 1.1714680790901184,
"epoch": 2.271493212669683,
"grad_norm": 0.5227723717689514,
"learning_rate": 0.00017139214662426167,
"loss": 0.08072106540203094,
"mean_token_accuracy": 0.9822585135698318,
"num_tokens": 5549828.0,
"step": 629
},
{
"entropy": 1.094650149345398,
"epoch": 2.2751131221719456,
"grad_norm": 0.5172683596611023,
"learning_rate": 0.00017129587381531247,
"loss": 0.05595247447490692,
"mean_token_accuracy": 0.977413684129715,
"num_tokens": 5558456.0,
"step": 630
},
{
"entropy": 1.0995945632457733,
"epoch": 2.278733031674208,
"grad_norm": 0.3441588878631592,
"learning_rate": 0.00017119947001035027,
"loss": 0.040461085736751556,
"mean_token_accuracy": 0.9847785383462906,
"num_tokens": 5567051.0,
"step": 631
},
{
"entropy": 1.0973523557186127,
"epoch": 2.2823529411764705,
"grad_norm": 0.49515512585639954,
"learning_rate": 0.0001711029354154006,
"loss": 0.08333179354667664,
"mean_token_accuracy": 0.9789102673530579,
"num_tokens": 5576195.0,
"step": 632
},
{
"entropy": 1.0842012166976929,
"epoch": 2.285972850678733,
"grad_norm": 0.36962664127349854,
"learning_rate": 0.00017100627023676848,
"loss": 0.04567601531744003,
"mean_token_accuracy": 0.9858750253915787,
"num_tokens": 5585076.0,
"step": 633
},
{
"entropy": 1.0841620564460754,
"epoch": 2.2895927601809953,
"grad_norm": 0.4125482738018036,
"learning_rate": 0.000170909474681038,
"loss": 0.04856008291244507,
"mean_token_accuracy": 0.9842440634965897,
"num_tokens": 5594132.0,
"step": 634
},
{
"entropy": 1.108697071671486,
"epoch": 2.2932126696832578,
"grad_norm": 0.4849383533000946,
"learning_rate": 0.0001708125489550719,
"loss": 0.05033132806420326,
"mean_token_accuracy": 0.9845256805419922,
"num_tokens": 5602901.0,
"step": 635
},
{
"entropy": 1.1090115308761597,
"epoch": 2.29683257918552,
"grad_norm": 0.5984789729118347,
"learning_rate": 0.00017071549326601107,
"loss": 0.09391042590141296,
"mean_token_accuracy": 0.9781784564256668,
"num_tokens": 5611778.0,
"step": 636
},
{
"entropy": 1.1380729377269745,
"epoch": 2.3004524886877826,
"grad_norm": 0.688556969165802,
"learning_rate": 0.0001706183078212742,
"loss": 0.07684573531150818,
"mean_token_accuracy": 0.9796445071697235,
"num_tokens": 5620319.0,
"step": 637
},
{
"entropy": 1.0685915052890778,
"epoch": 2.304072398190045,
"grad_norm": 0.43945908546447754,
"learning_rate": 0.00017052099282855728,
"loss": 0.06292518973350525,
"mean_token_accuracy": 0.9775279760360718,
"num_tokens": 5629494.0,
"step": 638
},
{
"entropy": 1.0556871443986893,
"epoch": 2.3076923076923075,
"grad_norm": 0.6430138349533081,
"learning_rate": 0.00017042354849583312,
"loss": 0.10716582834720612,
"mean_token_accuracy": 0.9718299359083176,
"num_tokens": 5638360.0,
"step": 639
},
{
"entropy": 1.0811335742473602,
"epoch": 2.31131221719457,
"grad_norm": 0.5156210660934448,
"learning_rate": 0.00017032597503135097,
"loss": 0.06649160385131836,
"mean_token_accuracy": 0.9791721701622009,
"num_tokens": 5647294.0,
"step": 640
},
{
"entropy": 1.0688489079475403,
"epoch": 2.3149321266968323,
"grad_norm": 0.6636553406715393,
"learning_rate": 0.000170228272643636,
"loss": 0.1042657196521759,
"mean_token_accuracy": 0.9654766619205475,
"num_tokens": 5656288.0,
"step": 641
},
{
"entropy": 1.0857466459274292,
"epoch": 2.318552036199095,
"grad_norm": 0.7920498847961426,
"learning_rate": 0.00017013044154148894,
"loss": 0.12234874814748764,
"mean_token_accuracy": 0.9632948487997055,
"num_tokens": 5665003.0,
"step": 642
},
{
"entropy": 1.0862885862588882,
"epoch": 2.3221719457013577,
"grad_norm": 0.6242631673812866,
"learning_rate": 0.00017003248193398564,
"loss": 0.0991068035364151,
"mean_token_accuracy": 0.9662000685930252,
"num_tokens": 5674000.0,
"step": 643
},
{
"entropy": 1.1462857723236084,
"epoch": 2.32579185520362,
"grad_norm": 0.596567690372467,
"learning_rate": 0.00016993439403047652,
"loss": 0.09591332077980042,
"mean_token_accuracy": 0.9745471328496933,
"num_tokens": 5682989.0,
"step": 644
},
{
"entropy": 1.1377845704555511,
"epoch": 2.3294117647058825,
"grad_norm": 0.5160103440284729,
"learning_rate": 0.0001698361780405862,
"loss": 0.08644409477710724,
"mean_token_accuracy": 0.9708640724420547,
"num_tokens": 5691812.0,
"step": 645
},
{
"entropy": 1.1150462329387665,
"epoch": 2.333031674208145,
"grad_norm": 0.45699501037597656,
"learning_rate": 0.00016973783417421304,
"loss": 0.05352473631501198,
"mean_token_accuracy": 0.984683021903038,
"num_tokens": 5700825.0,
"step": 646
},
{
"entropy": 1.2092556655406952,
"epoch": 2.3366515837104074,
"grad_norm": 0.8381008505821228,
"learning_rate": 0.00016963936264152867,
"loss": 0.08669580519199371,
"mean_token_accuracy": 0.980311319231987,
"num_tokens": 5709406.0,
"step": 647
},
{
"entropy": 1.1319242715835571,
"epoch": 2.34027149321267,
"grad_norm": 0.53270024061203,
"learning_rate": 0.00016954076365297758,
"loss": 0.05547341704368591,
"mean_token_accuracy": 0.9790041297674179,
"num_tokens": 5718379.0,
"step": 648
},
{
"entropy": 1.1626895368099213,
"epoch": 2.3438914027149322,
"grad_norm": 0.5577008128166199,
"learning_rate": 0.00016944203741927662,
"loss": 0.08301683515310287,
"mean_token_accuracy": 0.9779217094182968,
"num_tokens": 5726870.0,
"step": 649
},
{
"entropy": 1.1811838150024414,
"epoch": 2.3475113122171947,
"grad_norm": 1.1514310836791992,
"learning_rate": 0.00016934318415141457,
"loss": 0.22124511003494263,
"mean_token_accuracy": 0.9386986643075943,
"num_tokens": 5735262.0,
"step": 650
},
{
"entropy": 1.1047629415988922,
"epoch": 2.351131221719457,
"grad_norm": 0.42921116948127747,
"learning_rate": 0.00016924420406065177,
"loss": 0.07803885638713837,
"mean_token_accuracy": 0.977928563952446,
"num_tokens": 5744320.0,
"step": 651
},
{
"entropy": 1.187497317790985,
"epoch": 2.3547511312217195,
"grad_norm": 0.458602637052536,
"learning_rate": 0.00016914509735851954,
"loss": 0.05676557496190071,
"mean_token_accuracy": 0.983381986618042,
"num_tokens": 5752768.0,
"step": 652
},
{
"entropy": 1.1605049073696136,
"epoch": 2.358371040723982,
"grad_norm": 0.700816810131073,
"learning_rate": 0.00016904586425681975,
"loss": 0.1257372498512268,
"mean_token_accuracy": 0.9657402634620667,
"num_tokens": 5761355.0,
"step": 653
},
{
"entropy": 1.1560527682304382,
"epoch": 2.3619909502262444,
"grad_norm": 0.8418037295341492,
"learning_rate": 0.00016894650496762444,
"loss": 0.18459515273571014,
"mean_token_accuracy": 0.9515699297189713,
"num_tokens": 5770257.0,
"step": 654
},
{
"entropy": 1.1892625987529755,
"epoch": 2.365610859728507,
"grad_norm": 0.478832483291626,
"learning_rate": 0.00016884701970327538,
"loss": 0.052141569554805756,
"mean_token_accuracy": 0.9864660948514938,
"num_tokens": 5778834.0,
"step": 655
},
{
"entropy": 1.1170645952224731,
"epoch": 2.3692307692307693,
"grad_norm": 0.5372560024261475,
"learning_rate": 0.00016874740867638339,
"loss": 0.10951077938079834,
"mean_token_accuracy": 0.9724908918142319,
"num_tokens": 5788066.0,
"step": 656
},
{
"entropy": 1.1744292080402374,
"epoch": 2.3728506787330317,
"grad_norm": 0.6120523810386658,
"learning_rate": 0.00016864767209982825,
"loss": 0.09085190296173096,
"mean_token_accuracy": 0.9735588431358337,
"num_tokens": 5796324.0,
"step": 657
},
{
"entropy": 1.1730450987815857,
"epoch": 2.376470588235294,
"grad_norm": 0.5754011273384094,
"learning_rate": 0.00016854781018675797,
"loss": 0.08808492124080658,
"mean_token_accuracy": 0.9751751571893692,
"num_tokens": 5805217.0,
"step": 658
},
{
"entropy": 1.194241851568222,
"epoch": 2.3800904977375565,
"grad_norm": 0.4330058991909027,
"learning_rate": 0.00016844782315058847,
"loss": 0.0617540068924427,
"mean_token_accuracy": 0.982567548751831,
"num_tokens": 5813640.0,
"step": 659
},
{
"entropy": 1.1041472554206848,
"epoch": 2.383710407239819,
"grad_norm": 0.44740787148475647,
"learning_rate": 0.0001683477112050029,
"loss": 0.04697256535291672,
"mean_token_accuracy": 0.9860917925834656,
"num_tokens": 5822218.0,
"step": 660
},
{
"entropy": 1.1061788201332092,
"epoch": 2.3873303167420814,
"grad_norm": 0.7774989604949951,
"learning_rate": 0.0001682474745639516,
"loss": 0.09556365013122559,
"mean_token_accuracy": 0.975138321518898,
"num_tokens": 5831553.0,
"step": 661
},
{
"entropy": 1.1203438639640808,
"epoch": 2.390950226244344,
"grad_norm": 0.6026033759117126,
"learning_rate": 0.0001681471134416512,
"loss": 0.09611985087394714,
"mean_token_accuracy": 0.9710993468761444,
"num_tokens": 5840305.0,
"step": 662
},
{
"entropy": 1.109214961528778,
"epoch": 2.3945701357466063,
"grad_norm": 0.4015118479728699,
"learning_rate": 0.00016804662805258448,
"loss": 0.06578999012708664,
"mean_token_accuracy": 0.9815486073493958,
"num_tokens": 5848976.0,
"step": 663
},
{
"entropy": 1.1184172928333282,
"epoch": 2.3981900452488687,
"grad_norm": 0.7624635100364685,
"learning_rate": 0.00016794601861149977,
"loss": 0.15326440334320068,
"mean_token_accuracy": 0.9595926702022552,
"num_tokens": 5857668.0,
"step": 664
},
{
"entropy": 1.096958041191101,
"epoch": 2.401809954751131,
"grad_norm": 0.5075679421424866,
"learning_rate": 0.00016784528533341045,
"loss": 0.08701962232589722,
"mean_token_accuracy": 0.974994570016861,
"num_tokens": 5866959.0,
"step": 665
},
{
"entropy": 1.1943987607955933,
"epoch": 2.4054298642533936,
"grad_norm": 0.6924847960472107,
"learning_rate": 0.0001677444284335946,
"loss": 0.07678639143705368,
"mean_token_accuracy": 0.9794516116380692,
"num_tokens": 5875275.0,
"step": 666
},
{
"entropy": 1.1881066262722015,
"epoch": 2.409049773755656,
"grad_norm": 0.5888217091560364,
"learning_rate": 0.0001676434481275945,
"loss": 0.0933694839477539,
"mean_token_accuracy": 0.9765624105930328,
"num_tokens": 5883715.0,
"step": 667
},
{
"entropy": 1.0600565671920776,
"epoch": 2.4126696832579184,
"grad_norm": 0.846140444278717,
"learning_rate": 0.00016754234463121613,
"loss": 0.14402684569358826,
"mean_token_accuracy": 0.9649781733751297,
"num_tokens": 5892799.0,
"step": 668
},
{
"entropy": 1.1473102271556854,
"epoch": 2.416289592760181,
"grad_norm": 0.6479727029800415,
"learning_rate": 0.0001674411181605288,
"loss": 0.1251659095287323,
"mean_token_accuracy": 0.9626735299825668,
"num_tokens": 5901619.0,
"step": 669
},
{
"entropy": 1.1313286423683167,
"epoch": 2.4199095022624433,
"grad_norm": 0.418414443731308,
"learning_rate": 0.0001673397689318646,
"loss": 0.06655631214380264,
"mean_token_accuracy": 0.9822564423084259,
"num_tokens": 5910486.0,
"step": 670
},
{
"entropy": 1.1435776054859161,
"epoch": 2.4235294117647057,
"grad_norm": 0.5648186802864075,
"learning_rate": 0.00016723829716181797,
"loss": 0.05666976422071457,
"mean_token_accuracy": 0.982610896229744,
"num_tokens": 5918909.0,
"step": 671
},
{
"entropy": 1.07819664478302,
"epoch": 2.427149321266968,
"grad_norm": 0.500468909740448,
"learning_rate": 0.00016713670306724512,
"loss": 0.0956311747431755,
"mean_token_accuracy": 0.9763072431087494,
"num_tokens": 5927967.0,
"step": 672
},
{
"entropy": 1.1843810379505157,
"epoch": 2.430769230769231,
"grad_norm": 0.7661120891571045,
"learning_rate": 0.0001670349868652639,
"loss": 0.21462547779083252,
"mean_token_accuracy": 0.93952676653862,
"num_tokens": 5936451.0,
"step": 673
},
{
"entropy": 1.0327152162790298,
"epoch": 2.4343891402714934,
"grad_norm": 0.45229560136795044,
"learning_rate": 0.00016693314877325294,
"loss": 0.04901588708162308,
"mean_token_accuracy": 0.9859603494405746,
"num_tokens": 5945612.0,
"step": 674
},
{
"entropy": 1.0337743610143661,
"epoch": 2.438009049773756,
"grad_norm": 0.44453921914100647,
"learning_rate": 0.00016683118900885147,
"loss": 0.04502352327108383,
"mean_token_accuracy": 0.9859343767166138,
"num_tokens": 5954911.0,
"step": 675
},
{
"entropy": 1.1681455671787262,
"epoch": 2.4416289592760183,
"grad_norm": 0.7814058661460876,
"learning_rate": 0.00016672910778995866,
"loss": 0.15076348185539246,
"mean_token_accuracy": 0.9624718725681305,
"num_tokens": 5963327.0,
"step": 676
},
{
"entropy": 1.065615475177765,
"epoch": 2.4452488687782807,
"grad_norm": 0.6926926374435425,
"learning_rate": 0.00016662690533473333,
"loss": 0.1291479766368866,
"mean_token_accuracy": 0.9634927213191986,
"num_tokens": 5972300.0,
"step": 677
},
{
"entropy": 1.0294676572084427,
"epoch": 2.448868778280543,
"grad_norm": 0.3955880403518677,
"learning_rate": 0.0001665245818615933,
"loss": 0.07051679491996765,
"mean_token_accuracy": 0.9777989983558655,
"num_tokens": 5981742.0,
"step": 678
},
{
"entropy": 1.0838841944932938,
"epoch": 2.4524886877828056,
"grad_norm": 0.5345229506492615,
"learning_rate": 0.0001664221375892151,
"loss": 0.14341828227043152,
"mean_token_accuracy": 0.9667878448963165,
"num_tokens": 5990635.0,
"step": 679
},
{
"entropy": 1.1758331060409546,
"epoch": 2.456108597285068,
"grad_norm": 0.6868032813072205,
"learning_rate": 0.0001663195727365334,
"loss": 0.08597201108932495,
"mean_token_accuracy": 0.9740214198827744,
"num_tokens": 5999095.0,
"step": 680
},
{
"entropy": 1.0949173271656036,
"epoch": 2.4597285067873305,
"grad_norm": 0.589972198009491,
"learning_rate": 0.0001662168875227405,
"loss": 0.08794485032558441,
"mean_token_accuracy": 0.97372767329216,
"num_tokens": 6008205.0,
"step": 681
},
{
"entropy": 1.1521604359149933,
"epoch": 2.463348416289593,
"grad_norm": 0.6321046948432922,
"learning_rate": 0.00016611408216728603,
"loss": 0.11494572460651398,
"mean_token_accuracy": 0.9587929248809814,
"num_tokens": 6016678.0,
"step": 682
},
{
"entropy": 1.1596512496471405,
"epoch": 2.4669683257918553,
"grad_norm": 0.6953479647636414,
"learning_rate": 0.0001660111568898763,
"loss": 0.07764746993780136,
"mean_token_accuracy": 0.9718074202537537,
"num_tokens": 6025013.0,
"step": 683
},
{
"entropy": 1.0262929499149323,
"epoch": 2.4705882352941178,
"grad_norm": 0.5953262448310852,
"learning_rate": 0.00016590811191047393,
"loss": 0.07723493129014969,
"mean_token_accuracy": 0.9786953032016754,
"num_tokens": 6034590.0,
"step": 684
},
{
"entropy": 1.0716820657253265,
"epoch": 2.47420814479638,
"grad_norm": 0.516004204750061,
"learning_rate": 0.00016580494744929735,
"loss": 0.07066604495048523,
"mean_token_accuracy": 0.9768574684858322,
"num_tokens": 6043734.0,
"step": 685
},
{
"entropy": 1.1649815142154694,
"epoch": 2.4778280542986426,
"grad_norm": 0.5916234254837036,
"learning_rate": 0.00016570166372682034,
"loss": 0.07808145135641098,
"mean_token_accuracy": 0.9774973541498184,
"num_tokens": 6052374.0,
"step": 686
},
{
"entropy": 1.100892648100853,
"epoch": 2.481447963800905,
"grad_norm": 0.5512471795082092,
"learning_rate": 0.0001655982609637716,
"loss": 0.15753786265850067,
"mean_token_accuracy": 0.9573144018650055,
"num_tokens": 6061366.0,
"step": 687
},
{
"entropy": 1.0837741196155548,
"epoch": 2.4850678733031675,
"grad_norm": 0.47572416067123413,
"learning_rate": 0.00016549473938113414,
"loss": 0.07854226976633072,
"mean_token_accuracy": 0.9767839014530182,
"num_tokens": 6070370.0,
"step": 688
},
{
"entropy": 1.1012869477272034,
"epoch": 2.48868778280543,
"grad_norm": 0.545578122138977,
"learning_rate": 0.00016539109920014498,
"loss": 0.10669726133346558,
"mean_token_accuracy": 0.9735602736473083,
"num_tokens": 6079236.0,
"step": 689
},
{
"entropy": 1.1038524806499481,
"epoch": 2.4923076923076923,
"grad_norm": 0.6199021339416504,
"learning_rate": 0.0001652873406422945,
"loss": 0.09693003445863724,
"mean_token_accuracy": 0.9718763530254364,
"num_tokens": 6087610.0,
"step": 690
},
{
"entropy": 1.0678670406341553,
"epoch": 2.4959276018099548,
"grad_norm": 0.5688477754592896,
"learning_rate": 0.00016518346392932625,
"loss": 0.0637565553188324,
"mean_token_accuracy": 0.9771751910448074,
"num_tokens": 6096248.0,
"step": 691
},
{
"entropy": 1.0639970898628235,
"epoch": 2.499547511312217,
"grad_norm": 0.6916640400886536,
"learning_rate": 0.00016507946928323607,
"loss": 0.13879074156284332,
"mean_token_accuracy": 0.9657151252031326,
"num_tokens": 6105076.0,
"step": 692
},
{
"entropy": 0.9907592087984085,
"epoch": 2.5031674208144796,
"grad_norm": 0.6389366984367371,
"learning_rate": 0.000164975356926272,
"loss": 0.1363474577665329,
"mean_token_accuracy": 0.9586378186941147,
"num_tokens": 6114625.0,
"step": 693
},
{
"entropy": 1.131547451019287,
"epoch": 2.506787330316742,
"grad_norm": 1.4108179807662964,
"learning_rate": 0.0001648711270809335,
"loss": 0.09669815748929977,
"mean_token_accuracy": 0.9737755954265594,
"num_tokens": 6123053.0,
"step": 694
},
{
"entropy": 1.0598485469818115,
"epoch": 2.5104072398190045,
"grad_norm": 0.4932820796966553,
"learning_rate": 0.0001647667799699713,
"loss": 0.05943776294589043,
"mean_token_accuracy": 0.9825068265199661,
"num_tokens": 6131892.0,
"step": 695
},
{
"entropy": 1.1157889068126678,
"epoch": 2.514027149321267,
"grad_norm": 0.513300895690918,
"learning_rate": 0.00016466231581638654,
"loss": 0.0392400287091732,
"mean_token_accuracy": 0.9910160303115845,
"num_tokens": 6140337.0,
"step": 696
},
{
"entropy": 1.0774150341749191,
"epoch": 2.5176470588235293,
"grad_norm": 0.48255565762519836,
"learning_rate": 0.00016455773484343062,
"loss": 0.06849890947341919,
"mean_token_accuracy": 0.9777995198965073,
"num_tokens": 6149473.0,
"step": 697
},
{
"entropy": 1.0931665301322937,
"epoch": 2.521266968325792,
"grad_norm": 0.4410548806190491,
"learning_rate": 0.0001644530372746046,
"loss": 0.04295572638511658,
"mean_token_accuracy": 0.9881383031606674,
"num_tokens": 6158108.0,
"step": 698
},
{
"entropy": 1.0721315890550613,
"epoch": 2.524886877828054,
"grad_norm": 0.6796205639839172,
"learning_rate": 0.00016434822333365867,
"loss": 0.08365722000598907,
"mean_token_accuracy": 0.9753518998622894,
"num_tokens": 6167169.0,
"step": 699
},
{
"entropy": 1.0268315523862839,
"epoch": 2.5285067873303166,
"grad_norm": 0.4017026722431183,
"learning_rate": 0.00016424329324459167,
"loss": 0.04044954478740692,
"mean_token_accuracy": 0.985278993844986,
"num_tokens": 6176461.0,
"step": 700
},
{
"entropy": 1.1094456613063812,
"epoch": 2.532126696832579,
"grad_norm": 0.3827606439590454,
"learning_rate": 0.0001641382472316508,
"loss": 0.047150127589702606,
"mean_token_accuracy": 0.9888079762458801,
"num_tokens": 6185277.0,
"step": 701
},
{
"entropy": 1.0462537854909897,
"epoch": 2.5357466063348415,
"grad_norm": 0.5289177894592285,
"learning_rate": 0.00016403308551933085,
"loss": 0.0867241695523262,
"mean_token_accuracy": 0.9772706627845764,
"num_tokens": 6194094.0,
"step": 702
},
{
"entropy": 1.1250386536121368,
"epoch": 2.539366515837104,
"grad_norm": 0.586585521697998,
"learning_rate": 0.000163927808332374,
"loss": 0.04973389208316803,
"mean_token_accuracy": 0.9858016967773438,
"num_tokens": 6202301.0,
"step": 703
},
{
"entropy": 1.1375506520271301,
"epoch": 2.5429864253393664,
"grad_norm": 0.745067298412323,
"learning_rate": 0.00016382241589576918,
"loss": 0.0903111919760704,
"mean_token_accuracy": 0.9715902656316757,
"num_tokens": 6210728.0,
"step": 704
},
{
"entropy": 1.0829818844795227,
"epoch": 2.546606334841629,
"grad_norm": 0.5703446865081787,
"learning_rate": 0.00016371690843475153,
"loss": 0.08242139220237732,
"mean_token_accuracy": 0.9692940562963486,
"num_tokens": 6219287.0,
"step": 705
},
{
"entropy": 1.0283890515565872,
"epoch": 2.5502262443438912,
"grad_norm": 0.4306364357471466,
"learning_rate": 0.00016361128617480212,
"loss": 0.053607720881700516,
"mean_token_accuracy": 0.9828784912824631,
"num_tokens": 6227933.0,
"step": 706
},
{
"entropy": 1.0512140542268753,
"epoch": 2.5538461538461537,
"grad_norm": 0.6051420569419861,
"learning_rate": 0.0001635055493416473,
"loss": 0.09999658167362213,
"mean_token_accuracy": 0.9734647274017334,
"num_tokens": 6237019.0,
"step": 707
},
{
"entropy": 1.1175464391708374,
"epoch": 2.557466063348416,
"grad_norm": 0.5659928321838379,
"learning_rate": 0.00016339969816125832,
"loss": 0.0763474628329277,
"mean_token_accuracy": 0.9784427285194397,
"num_tokens": 6245720.0,
"step": 708
},
{
"entropy": 0.9877434521913528,
"epoch": 2.5610859728506785,
"grad_norm": 0.6605835556983948,
"learning_rate": 0.00016329373285985078,
"loss": 0.14345091581344604,
"mean_token_accuracy": 0.9613773822784424,
"num_tokens": 6255493.0,
"step": 709
},
{
"entropy": 1.0417379438877106,
"epoch": 2.564705882352941,
"grad_norm": 0.5875850915908813,
"learning_rate": 0.0001631876536638841,
"loss": 0.11363250762224197,
"mean_token_accuracy": 0.9701417088508606,
"num_tokens": 6264663.0,
"step": 710
},
{
"entropy": 1.1045846343040466,
"epoch": 2.5683257918552034,
"grad_norm": 0.543202817440033,
"learning_rate": 0.00016308146080006123,
"loss": 0.062342025339603424,
"mean_token_accuracy": 0.9822514802217484,
"num_tokens": 6272943.0,
"step": 711
},
{
"entropy": 1.1131125092506409,
"epoch": 2.571945701357466,
"grad_norm": 0.5200212001800537,
"learning_rate": 0.00016297515449532795,
"loss": 0.0743773877620697,
"mean_token_accuracy": 0.9790451228618622,
"num_tokens": 6282017.0,
"step": 712
},
{
"entropy": 1.0882102400064468,
"epoch": 2.5755656108597282,
"grad_norm": 0.2667261064052582,
"learning_rate": 0.0001628687349768726,
"loss": 0.02705930732190609,
"mean_token_accuracy": 0.9914503395557404,
"num_tokens": 6291228.0,
"step": 713
},
{
"entropy": 1.1583672761917114,
"epoch": 2.579185520361991,
"grad_norm": 0.3874809443950653,
"learning_rate": 0.00016276220247212522,
"loss": 0.027443181723356247,
"mean_token_accuracy": 0.9916303902864456,
"num_tokens": 6299957.0,
"step": 714
},
{
"entropy": 1.1209280490875244,
"epoch": 2.5828054298642535,
"grad_norm": 0.3878846764564514,
"learning_rate": 0.00016265555720875756,
"loss": 0.04895198345184326,
"mean_token_accuracy": 0.9888665974140167,
"num_tokens": 6309045.0,
"step": 715
},
{
"entropy": 1.0764774233102798,
"epoch": 2.586425339366516,
"grad_norm": 0.43458428978919983,
"learning_rate": 0.00016254879941468223,
"loss": 0.043831516057252884,
"mean_token_accuracy": 0.9901125580072403,
"num_tokens": 6317957.0,
"step": 716
},
{
"entropy": 1.0688991844654083,
"epoch": 2.5900452488687784,
"grad_norm": 0.4384344220161438,
"learning_rate": 0.0001624419293180524,
"loss": 0.0616069957613945,
"mean_token_accuracy": 0.983244925737381,
"num_tokens": 6327135.0,
"step": 717
},
{
"entropy": 1.1092505156993866,
"epoch": 2.593665158371041,
"grad_norm": 0.5844867825508118,
"learning_rate": 0.00016233494714726118,
"loss": 0.04842917621135712,
"mean_token_accuracy": 0.9868716895580292,
"num_tokens": 6336285.0,
"step": 718
},
{
"entropy": 1.0994287133216858,
"epoch": 2.5972850678733033,
"grad_norm": 0.6489019393920898,
"learning_rate": 0.0001622278531309412,
"loss": 0.14275093376636505,
"mean_token_accuracy": 0.9666349589824677,
"num_tokens": 6345608.0,
"step": 719
},
{
"entropy": 1.1014614403247833,
"epoch": 2.6009049773755657,
"grad_norm": 0.8203519582748413,
"learning_rate": 0.00016212064749796418,
"loss": 0.12135060131549835,
"mean_token_accuracy": 0.9609973132610321,
"num_tokens": 6354345.0,
"step": 720
},
{
"entropy": 1.1402380466461182,
"epoch": 2.604524886877828,
"grad_norm": 0.3748965263366699,
"learning_rate": 0.00016201333047744025,
"loss": 0.0338396318256855,
"mean_token_accuracy": 0.9894662201404572,
"num_tokens": 6362603.0,
"step": 721
},
{
"entropy": 1.1779527068138123,
"epoch": 2.6081447963800906,
"grad_norm": 0.5257209539413452,
"learning_rate": 0.00016190590229871773,
"loss": 0.08586816489696503,
"mean_token_accuracy": 0.9766449332237244,
"num_tokens": 6370579.0,
"step": 722
},
{
"entropy": 1.13621586561203,
"epoch": 2.611764705882353,
"grad_norm": 0.7259751558303833,
"learning_rate": 0.00016179836319138243,
"loss": 0.13181325793266296,
"mean_token_accuracy": 0.9618227183818817,
"num_tokens": 6378869.0,
"step": 723
},
{
"entropy": 1.113575041294098,
"epoch": 2.6153846153846154,
"grad_norm": 0.49848487973213196,
"learning_rate": 0.00016169071338525718,
"loss": 0.09116464853286743,
"mean_token_accuracy": 0.9724429994821548,
"num_tokens": 6388111.0,
"step": 724
},
{
"entropy": 1.07393079996109,
"epoch": 2.619004524886878,
"grad_norm": 0.742392897605896,
"learning_rate": 0.0001615829531104015,
"loss": 0.16890200972557068,
"mean_token_accuracy": 0.9505817890167236,
"num_tokens": 6397238.0,
"step": 725
},
{
"entropy": 1.0620396435260773,
"epoch": 2.6226244343891403,
"grad_norm": 0.5371675491333008,
"learning_rate": 0.00016147508259711088,
"loss": 0.11606789380311966,
"mean_token_accuracy": 0.9657173007726669,
"num_tokens": 6406558.0,
"step": 726
},
{
"entropy": 1.0380123108625412,
"epoch": 2.6262443438914027,
"grad_norm": 0.4158170521259308,
"learning_rate": 0.00016136710207591653,
"loss": 0.06112787127494812,
"mean_token_accuracy": 0.9832183867692947,
"num_tokens": 6415614.0,
"step": 727
},
{
"entropy": 1.1722862720489502,
"epoch": 2.629864253393665,
"grad_norm": 0.6638741493225098,
"learning_rate": 0.00016125901177758457,
"loss": 0.06863709539175034,
"mean_token_accuracy": 0.9769297689199448,
"num_tokens": 6423840.0,
"step": 728
},
{
"entropy": 1.1042883694171906,
"epoch": 2.6334841628959276,
"grad_norm": 0.6980579495429993,
"learning_rate": 0.00016115081193311592,
"loss": 0.3179304003715515,
"mean_token_accuracy": 0.9388009011745453,
"num_tokens": 6433056.0,
"step": 729
},
{
"entropy": 1.0506968647241592,
"epoch": 2.63710407239819,
"grad_norm": 0.4494501054286957,
"learning_rate": 0.00016104250277374548,
"loss": 0.06316471844911575,
"mean_token_accuracy": 0.9813105314970016,
"num_tokens": 6442272.0,
"step": 730
},
{
"entropy": 1.0719667375087738,
"epoch": 2.6407239819004524,
"grad_norm": 0.49762746691703796,
"learning_rate": 0.00016093408453094182,
"loss": 0.07798872143030167,
"mean_token_accuracy": 0.9814620614051819,
"num_tokens": 6451215.0,
"step": 731
},
{
"entropy": 1.0985628068447113,
"epoch": 2.644343891402715,
"grad_norm": 0.5135414600372314,
"learning_rate": 0.00016082555743640668,
"loss": 0.05784185230731964,
"mean_token_accuracy": 0.981594517827034,
"num_tokens": 6460140.0,
"step": 732
},
{
"entropy": 1.1157509833574295,
"epoch": 2.6479638009049773,
"grad_norm": 0.9371573328971863,
"learning_rate": 0.00016071692172207435,
"loss": 0.19296394288539886,
"mean_token_accuracy": 0.9558616280555725,
"num_tokens": 6469215.0,
"step": 733
},
{
"entropy": 1.0665288716554642,
"epoch": 2.6515837104072397,
"grad_norm": 0.4882467985153198,
"learning_rate": 0.00016060817762011126,
"loss": 0.06225307658314705,
"mean_token_accuracy": 0.9782555550336838,
"num_tokens": 6477975.0,
"step": 734
},
{
"entropy": 1.058951660990715,
"epoch": 2.655203619909502,
"grad_norm": 0.44127899408340454,
"learning_rate": 0.00016049932536291552,
"loss": 0.054334647953510284,
"mean_token_accuracy": 0.9814004898071289,
"num_tokens": 6486967.0,
"step": 735
},
{
"entropy": 1.0905370116233826,
"epoch": 2.6588235294117646,
"grad_norm": 0.43531718850135803,
"learning_rate": 0.00016039036518311633,
"loss": 0.047857142984867096,
"mean_token_accuracy": 0.9830509722232819,
"num_tokens": 6495636.0,
"step": 736
},
{
"entropy": 1.0584595501422882,
"epoch": 2.662443438914027,
"grad_norm": 0.5107179284095764,
"learning_rate": 0.00016028129731357366,
"loss": 0.069837786257267,
"mean_token_accuracy": 0.9805668294429779,
"num_tokens": 6504726.0,
"step": 737
},
{
"entropy": 1.0202111154794693,
"epoch": 2.6660633484162894,
"grad_norm": 0.4959315359592438,
"learning_rate": 0.00016017212198737732,
"loss": 0.10300014168024063,
"mean_token_accuracy": 0.9703309834003448,
"num_tokens": 6514345.0,
"step": 738
},
{
"entropy": 1.127130776643753,
"epoch": 2.669683257918552,
"grad_norm": 0.6754599213600159,
"learning_rate": 0.00016006283943784715,
"loss": 0.10128761827945709,
"mean_token_accuracy": 0.9754067361354828,
"num_tokens": 6523052.0,
"step": 739
},
{
"entropy": 1.1106500625610352,
"epoch": 2.6733031674208148,
"grad_norm": 0.7177493572235107,
"learning_rate": 0.00015995344989853193,
"loss": 0.19780850410461426,
"mean_token_accuracy": 0.9586081951856613,
"num_tokens": 6532090.0,
"step": 740
},
{
"entropy": 1.0810956358909607,
"epoch": 2.676923076923077,
"grad_norm": 0.5800163149833679,
"learning_rate": 0.00015984395360320902,
"loss": 0.07419327646493912,
"mean_token_accuracy": 0.9828365594148636,
"num_tokens": 6541418.0,
"step": 741
},
{
"entropy": 1.1289043724536896,
"epoch": 2.6805429864253396,
"grad_norm": 0.41637271642684937,
"learning_rate": 0.0001597343507858841,
"loss": 0.07509341835975647,
"mean_token_accuracy": 0.9790188372135162,
"num_tokens": 6550296.0,
"step": 742
},
{
"entropy": 1.150795191526413,
"epoch": 2.684162895927602,
"grad_norm": 0.6585742831230164,
"learning_rate": 0.00015962464168079045,
"loss": 0.10690723359584808,
"mean_token_accuracy": 0.9658952206373215,
"num_tokens": 6558723.0,
"step": 743
},
{
"entropy": 1.1867717504501343,
"epoch": 2.6877828054298645,
"grad_norm": 0.6974299550056458,
"learning_rate": 0.00015951482652238843,
"loss": 0.1916104406118393,
"mean_token_accuracy": 0.9603984951972961,
"num_tokens": 6567642.0,
"step": 744
},
{
"entropy": 1.1048817336559296,
"epoch": 2.691402714932127,
"grad_norm": 0.4655424952507019,
"learning_rate": 0.0001594049055453651,
"loss": 0.12130922079086304,
"mean_token_accuracy": 0.9698162972927094,
"num_tokens": 6577045.0,
"step": 745
},
{
"entropy": 1.162132978439331,
"epoch": 2.6950226244343893,
"grad_norm": 0.6110576391220093,
"learning_rate": 0.00015929487898463368,
"loss": 0.07808665931224823,
"mean_token_accuracy": 0.9791989624500275,
"num_tokens": 6585933.0,
"step": 746
},
{
"entropy": 1.1560872793197632,
"epoch": 2.6986425339366518,
"grad_norm": 0.6684656739234924,
"learning_rate": 0.00015918474707533298,
"loss": 0.08515496551990509,
"mean_token_accuracy": 0.9754028916358948,
"num_tokens": 6594832.0,
"step": 747
},
{
"entropy": 1.2077683806419373,
"epoch": 2.702262443438914,
"grad_norm": 0.624879002571106,
"learning_rate": 0.00015907451005282698,
"loss": 0.1127995178103447,
"mean_token_accuracy": 0.9747825711965561,
"num_tokens": 6603677.0,
"step": 748
},
{
"entropy": 1.1817778050899506,
"epoch": 2.7058823529411766,
"grad_norm": 0.4592488408088684,
"learning_rate": 0.00015896416815270437,
"loss": 0.0558847077190876,
"mean_token_accuracy": 0.9848069846630096,
"num_tokens": 6612569.0,
"step": 749
},
{
"entropy": 1.1611264646053314,
"epoch": 2.709502262443439,
"grad_norm": 0.4479089677333832,
"learning_rate": 0.0001588537216107778,
"loss": 0.07405956834554672,
"mean_token_accuracy": 0.9793966263532639,
"num_tokens": 6621219.0,
"step": 750
},
{
"entropy": 1.1612786650657654,
"epoch": 2.7131221719457015,
"grad_norm": 0.42668989300727844,
"learning_rate": 0.00015874317066308372,
"loss": 0.06808929890394211,
"mean_token_accuracy": 0.985272541642189,
"num_tokens": 6630187.0,
"step": 751
},
{
"entropy": 1.2155989408493042,
"epoch": 2.716742081447964,
"grad_norm": 0.73838210105896,
"learning_rate": 0.00015863251554588167,
"loss": 0.13851481676101685,
"mean_token_accuracy": 0.963520884513855,
"num_tokens": 6638579.0,
"step": 752
},
{
"entropy": 1.1978608965873718,
"epoch": 2.7203619909502263,
"grad_norm": 0.5378918647766113,
"learning_rate": 0.00015852175649565375,
"loss": 0.08914665132761002,
"mean_token_accuracy": 0.9813847839832306,
"num_tokens": 6646986.0,
"step": 753
},
{
"entropy": 1.203873634338379,
"epoch": 2.723981900452489,
"grad_norm": 0.518156111240387,
"learning_rate": 0.0001584108937491042,
"loss": 0.07944593578577042,
"mean_token_accuracy": 0.9775935709476471,
"num_tokens": 6655534.0,
"step": 754
},
{
"entropy": 1.1785966455936432,
"epoch": 2.727601809954751,
"grad_norm": 0.6289830803871155,
"learning_rate": 0.00015829992754315893,
"loss": 0.08792146295309067,
"mean_token_accuracy": 0.9775703102350235,
"num_tokens": 6664173.0,
"step": 755
},
{
"entropy": 1.0918410420417786,
"epoch": 2.7312217194570136,
"grad_norm": 0.553460955619812,
"learning_rate": 0.00015818885811496485,
"loss": 0.10784870386123657,
"mean_token_accuracy": 0.9730519503355026,
"num_tokens": 6673280.0,
"step": 756
},
{
"entropy": 1.141640067100525,
"epoch": 2.734841628959276,
"grad_norm": 0.5629732012748718,
"learning_rate": 0.0001580776857018895,
"loss": 0.10254421085119247,
"mean_token_accuracy": 0.967141255736351,
"num_tokens": 6682098.0,
"step": 757
},
{
"entropy": 1.093793347477913,
"epoch": 2.7384615384615385,
"grad_norm": 0.4974505603313446,
"learning_rate": 0.00015796641054152067,
"loss": 0.08042000234127045,
"mean_token_accuracy": 0.9785896241664886,
"num_tokens": 6690846.0,
"step": 758
},
{
"entropy": 1.1073049008846283,
"epoch": 2.742081447963801,
"grad_norm": 0.4363830089569092,
"learning_rate": 0.00015785503287166547,
"loss": 0.07343566417694092,
"mean_token_accuracy": 0.974511981010437,
"num_tokens": 6699819.0,
"step": 759
},
{
"entropy": 1.0879010558128357,
"epoch": 2.7457013574660634,
"grad_norm": 0.38586366176605225,
"learning_rate": 0.00015774355293035025,
"loss": 0.04004283994436264,
"mean_token_accuracy": 0.9880010634660721,
"num_tokens": 6708498.0,
"step": 760
},
{
"entropy": 1.078858882188797,
"epoch": 2.749321266968326,
"grad_norm": 0.6422889232635498,
"learning_rate": 0.0001576319709558199,
"loss": 0.10385136306285858,
"mean_token_accuracy": 0.974150076508522,
"num_tokens": 6717185.0,
"step": 761
},
{
"entropy": 1.077678918838501,
"epoch": 2.7529411764705882,
"grad_norm": 0.7637373805046082,
"learning_rate": 0.00015752028718653735,
"loss": 0.06696485728025436,
"mean_token_accuracy": 0.9813330173492432,
"num_tokens": 6725915.0,
"step": 762
},
{
"entropy": 1.155141294002533,
"epoch": 2.7565610859728507,
"grad_norm": 0.6038472056388855,
"learning_rate": 0.00015740850186118306,
"loss": 0.06791059672832489,
"mean_token_accuracy": 0.9810560345649719,
"num_tokens": 6734292.0,
"step": 763
},
{
"entropy": 1.161244884133339,
"epoch": 2.760180995475113,
"grad_norm": 0.5682697892189026,
"learning_rate": 0.00015729661521865452,
"loss": 0.06183531507849693,
"mean_token_accuracy": 0.9830152094364166,
"num_tokens": 6742663.0,
"step": 764
},
{
"entropy": 1.0876408368349075,
"epoch": 2.7638009049773755,
"grad_norm": 0.4295364320278168,
"learning_rate": 0.00015718462749806587,
"loss": 0.0469270683825016,
"mean_token_accuracy": 0.9854940623044968,
"num_tokens": 6751726.0,
"step": 765
},
{
"entropy": 1.1176190674304962,
"epoch": 2.767420814479638,
"grad_norm": 0.5521655678749084,
"learning_rate": 0.00015707253893874705,
"loss": 0.10531380772590637,
"mean_token_accuracy": 0.9681271910667419,
"num_tokens": 6760728.0,
"step": 766
},
{
"entropy": 1.1246620118618011,
"epoch": 2.7710407239819004,
"grad_norm": 0.5280638337135315,
"learning_rate": 0.00015696034978024368,
"loss": 0.05504276230931282,
"mean_token_accuracy": 0.9822255373001099,
"num_tokens": 6769862.0,
"step": 767
},
{
"entropy": 1.167225182056427,
"epoch": 2.774660633484163,
"grad_norm": 0.7716050148010254,
"learning_rate": 0.0001568480602623163,
"loss": 0.08834150433540344,
"mean_token_accuracy": 0.9732990860939026,
"num_tokens": 6778769.0,
"step": 768
},
{
"entropy": 1.1058853566646576,
"epoch": 2.7782805429864252,
"grad_norm": 0.5791372060775757,
"learning_rate": 0.00015673567062493993,
"loss": 0.06552623212337494,
"mean_token_accuracy": 0.9831362217664719,
"num_tokens": 6787847.0,
"step": 769
},
{
"entropy": 1.1691460013389587,
"epoch": 2.7819004524886877,
"grad_norm": 0.5165508389472961,
"learning_rate": 0.00015662318110830356,
"loss": 0.10017089545726776,
"mean_token_accuracy": 0.9761621057987213,
"num_tokens": 6796419.0,
"step": 770
},
{
"entropy": 1.1316891312599182,
"epoch": 2.78552036199095,
"grad_norm": 0.5280610918998718,
"learning_rate": 0.00015651059195280972,
"loss": 0.04987496882677078,
"mean_token_accuracy": 0.984514445066452,
"num_tokens": 6805123.0,
"step": 771
},
{
"entropy": 1.104190319776535,
"epoch": 2.7891402714932125,
"grad_norm": 0.4536793828010559,
"learning_rate": 0.0001563979033990737,
"loss": 0.06398440897464752,
"mean_token_accuracy": 0.9832892119884491,
"num_tokens": 6814257.0,
"step": 772
},
{
"entropy": 1.0896694660186768,
"epoch": 2.792760180995475,
"grad_norm": 0.45330801606178284,
"learning_rate": 0.0001562851156879233,
"loss": 0.06826596707105637,
"mean_token_accuracy": 0.9796571433544159,
"num_tokens": 6823932.0,
"step": 773
},
{
"entropy": 1.067289099097252,
"epoch": 2.7963800904977374,
"grad_norm": 0.3646707236766815,
"learning_rate": 0.0001561722290603983,
"loss": 0.056463152170181274,
"mean_token_accuracy": 0.9811888188123703,
"num_tokens": 6833045.0,
"step": 774
},
{
"entropy": 1.1854591965675354,
"epoch": 2.8,
"grad_norm": 0.7943421602249146,
"learning_rate": 0.00015605924375774986,
"loss": 0.22466346621513367,
"mean_token_accuracy": 0.9558106809854507,
"num_tokens": 6841858.0,
"step": 775
},
{
"entropy": 1.2166031897068024,
"epoch": 2.8036199095022623,
"grad_norm": 0.5690715909004211,
"learning_rate": 0.0001559461600214399,
"loss": 0.07189326733350754,
"mean_token_accuracy": 0.9798267781734467,
"num_tokens": 6849972.0,
"step": 776
},
{
"entropy": 1.0468733608722687,
"epoch": 2.8072398190045247,
"grad_norm": 0.39525169134140015,
"learning_rate": 0.0001558329780931408,
"loss": 0.07288673520088196,
"mean_token_accuracy": 0.9749600887298584,
"num_tokens": 6859355.0,
"step": 777
},
{
"entropy": 1.176623821258545,
"epoch": 2.810859728506787,
"grad_norm": 0.5643757581710815,
"learning_rate": 0.0001557196982147348,
"loss": 0.06095210090279579,
"mean_token_accuracy": 0.9794624149799347,
"num_tokens": 6867749.0,
"step": 778
},
{
"entropy": 1.1942293345928192,
"epoch": 2.8144796380090495,
"grad_norm": 0.5817746520042419,
"learning_rate": 0.00015560632062831337,
"loss": 0.06656618416309357,
"mean_token_accuracy": 0.9780125468969345,
"num_tokens": 6876216.0,
"step": 779
},
{
"entropy": 1.0938580930233002,
"epoch": 2.818099547511312,
"grad_norm": 0.3751153349876404,
"learning_rate": 0.00015549284557617697,
"loss": 0.06173507869243622,
"mean_token_accuracy": 0.9801245480775833,
"num_tokens": 6885009.0,
"step": 780
},
{
"entropy": 1.147508054971695,
"epoch": 2.8217194570135744,
"grad_norm": 0.5696470141410828,
"learning_rate": 0.00015537927330083412,
"loss": 0.11268901824951172,
"mean_token_accuracy": 0.9691483378410339,
"num_tokens": 6893473.0,
"step": 781
},
{
"entropy": 1.1333416998386383,
"epoch": 2.825339366515837,
"grad_norm": 0.5513712167739868,
"learning_rate": 0.00015526560404500138,
"loss": 0.07699988782405853,
"mean_token_accuracy": 0.9764687865972519,
"num_tokens": 6901878.0,
"step": 782
},
{
"entropy": 1.1387991607189178,
"epoch": 2.8289592760180997,
"grad_norm": 0.6534690260887146,
"learning_rate": 0.00015515183805160228,
"loss": 0.10432648658752441,
"mean_token_accuracy": 0.9748246222734451,
"num_tokens": 6910566.0,
"step": 783
},
{
"entropy": 1.0964215993881226,
"epoch": 2.832579185520362,
"grad_norm": 0.5456584095954895,
"learning_rate": 0.00015503797556376737,
"loss": 0.09832392632961273,
"mean_token_accuracy": 0.9682436734437943,
"num_tokens": 6919451.0,
"step": 784
},
{
"entropy": 1.100774735212326,
"epoch": 2.8361990950226246,
"grad_norm": 0.499659925699234,
"learning_rate": 0.00015492401682483324,
"loss": 0.08655116707086563,
"mean_token_accuracy": 0.9732284247875214,
"num_tokens": 6928533.0,
"step": 785
},
{
"entropy": 1.0831056982278824,
"epoch": 2.839819004524887,
"grad_norm": 0.5561793446540833,
"learning_rate": 0.0001548099620783422,
"loss": 0.06871826946735382,
"mean_token_accuracy": 0.9839567542076111,
"num_tokens": 6937781.0,
"step": 786
},
{
"entropy": 1.0972970724105835,
"epoch": 2.8434389140271494,
"grad_norm": 0.46581363677978516,
"learning_rate": 0.0001546958115680418,
"loss": 0.06161344051361084,
"mean_token_accuracy": 0.9849719405174255,
"num_tokens": 6946751.0,
"step": 787
},
{
"entropy": 1.1013003289699554,
"epoch": 2.847058823529412,
"grad_norm": 0.5828675031661987,
"learning_rate": 0.00015458156553788423,
"loss": 0.2391076385974884,
"mean_token_accuracy": 0.9569680541753769,
"num_tokens": 6956178.0,
"step": 788
},
{
"entropy": 1.129268318414688,
"epoch": 2.8506787330316743,
"grad_norm": 0.5637741088867188,
"learning_rate": 0.00015446722423202575,
"loss": 0.1284228265285492,
"mean_token_accuracy": 0.9719979614019394,
"num_tokens": 6965201.0,
"step": 789
},
{
"entropy": 1.1032065749168396,
"epoch": 2.8542986425339367,
"grad_norm": 0.3668762147426605,
"learning_rate": 0.00015435278789482636,
"loss": 0.04885539412498474,
"mean_token_accuracy": 0.9844310134649277,
"num_tokens": 6974144.0,
"step": 790
},
{
"entropy": 1.10554239153862,
"epoch": 2.857918552036199,
"grad_norm": 0.5248369574546814,
"learning_rate": 0.00015423825677084895,
"loss": 0.04060475528240204,
"mean_token_accuracy": 0.988097608089447,
"num_tokens": 6982816.0,
"step": 791
},
{
"entropy": 1.1331679821014404,
"epoch": 2.8615384615384616,
"grad_norm": 0.7115232348442078,
"learning_rate": 0.00015412363110485928,
"loss": 0.10422416776418686,
"mean_token_accuracy": 0.9733791649341583,
"num_tokens": 6991605.0,
"step": 792
},
{
"entropy": 1.0728564262390137,
"epoch": 2.865158371040724,
"grad_norm": 0.7061241269111633,
"learning_rate": 0.00015400891114182488,
"loss": 0.10765457153320312,
"mean_token_accuracy": 0.9669128805398941,
"num_tokens": 7000489.0,
"step": 793
},
{
"entropy": 1.1403984129428864,
"epoch": 2.8687782805429864,
"grad_norm": 0.6950392127037048,
"learning_rate": 0.0001538940971269149,
"loss": 0.0980193242430687,
"mean_token_accuracy": 0.9763042032718658,
"num_tokens": 7008774.0,
"step": 794
},
{
"entropy": 1.069732904434204,
"epoch": 2.872398190045249,
"grad_norm": 0.3522324860095978,
"learning_rate": 0.00015377918930549952,
"loss": 0.0528433583676815,
"mean_token_accuracy": 0.9891404360532761,
"num_tokens": 7017656.0,
"step": 795
},
{
"entropy": 1.1608846485614777,
"epoch": 2.8760180995475113,
"grad_norm": 0.6504537463188171,
"learning_rate": 0.00015366418792314937,
"loss": 0.053374141454696655,
"mean_token_accuracy": 0.982115313410759,
"num_tokens": 7025980.0,
"step": 796
},
{
"entropy": 1.0937940925359726,
"epoch": 2.8796380090497737,
"grad_norm": 0.6381659507751465,
"learning_rate": 0.0001535490932256351,
"loss": 0.07902165502309799,
"mean_token_accuracy": 0.9712257385253906,
"num_tokens": 7034682.0,
"step": 797
},
{
"entropy": 1.1293490529060364,
"epoch": 2.883257918552036,
"grad_norm": 0.6040161848068237,
"learning_rate": 0.00015343390545892658,
"loss": 0.059771500527858734,
"mean_token_accuracy": 0.9815134108066559,
"num_tokens": 7043427.0,
"step": 798
},
{
"entropy": 1.0549205243587494,
"epoch": 2.8868778280542986,
"grad_norm": 0.5692182779312134,
"learning_rate": 0.00015331862486919282,
"loss": 0.09364207834005356,
"mean_token_accuracy": 0.9750859886407852,
"num_tokens": 7052551.0,
"step": 799
},
{
"entropy": 1.097869873046875,
"epoch": 2.890497737556561,
"grad_norm": 0.42280399799346924,
"learning_rate": 0.00015320325170280107,
"loss": 0.04443611204624176,
"mean_token_accuracy": 0.9883531183004379,
"num_tokens": 7061476.0,
"step": 800
},
{
"entropy": 1.082559585571289,
"epoch": 2.8941176470588235,
"grad_norm": 0.263492614030838,
"learning_rate": 0.00015308778620631643,
"loss": 0.02307066321372986,
"mean_token_accuracy": 0.9891369044780731,
"num_tokens": 7070227.0,
"step": 801
},
{
"entropy": 1.131628304719925,
"epoch": 2.897737556561086,
"grad_norm": 0.5446733236312866,
"learning_rate": 0.0001529722286265014,
"loss": 0.059960536658763885,
"mean_token_accuracy": 0.979179710149765,
"num_tokens": 7079092.0,
"step": 802
},
{
"entropy": 1.0937869250774384,
"epoch": 2.9013574660633483,
"grad_norm": 0.5581566691398621,
"learning_rate": 0.00015285657921031514,
"loss": 0.08178013563156128,
"mean_token_accuracy": 0.9795632362365723,
"num_tokens": 7088409.0,
"step": 803
},
{
"entropy": 1.185722142457962,
"epoch": 2.9049773755656108,
"grad_norm": 0.9806889295578003,
"learning_rate": 0.00015274083820491325,
"loss": 0.14850257337093353,
"mean_token_accuracy": 0.9594536870718002,
"num_tokens": 7096631.0,
"step": 804
},
{
"entropy": 1.0202390849590302,
"epoch": 2.908597285067873,
"grad_norm": 0.5378230214118958,
"learning_rate": 0.00015262500585764687,
"loss": 0.08723378926515579,
"mean_token_accuracy": 0.9726832509040833,
"num_tokens": 7106014.0,
"step": 805
},
{
"entropy": 1.1566883027553558,
"epoch": 2.9122171945701356,
"grad_norm": 0.4863153100013733,
"learning_rate": 0.00015250908241606253,
"loss": 0.04234391450881958,
"mean_token_accuracy": 0.9871827512979507,
"num_tokens": 7114443.0,
"step": 806
},
{
"entropy": 1.1635094583034515,
"epoch": 2.915837104072398,
"grad_norm": 0.7457255125045776,
"learning_rate": 0.00015239306812790129,
"loss": 0.11422393471002579,
"mean_token_accuracy": 0.969372570514679,
"num_tokens": 7123016.0,
"step": 807
},
{
"entropy": 1.1335663199424744,
"epoch": 2.9194570135746605,
"grad_norm": 0.7970643043518066,
"learning_rate": 0.00015227696324109845,
"loss": 0.18115636706352234,
"mean_token_accuracy": 0.9552556723356247,
"num_tokens": 7131986.0,
"step": 808
},
{
"entropy": 1.0883623361587524,
"epoch": 2.9230769230769234,
"grad_norm": 0.3898361623287201,
"learning_rate": 0.00015216076800378286,
"loss": 0.042640797793865204,
"mean_token_accuracy": 0.9852175265550613,
"num_tokens": 7140496.0,
"step": 809
},
{
"entropy": 1.1370824575424194,
"epoch": 2.926696832579186,
"grad_norm": 0.6726946234703064,
"learning_rate": 0.0001520444826642766,
"loss": 0.08559117466211319,
"mean_token_accuracy": 0.9699132144451141,
"num_tokens": 7149329.0,
"step": 810
},
{
"entropy": 1.1126140654087067,
"epoch": 2.930316742081448,
"grad_norm": 0.6819010972976685,
"learning_rate": 0.00015192810747109413,
"loss": 0.07975786924362183,
"mean_token_accuracy": 0.9801275730133057,
"num_tokens": 7157890.0,
"step": 811
},
{
"entropy": 1.0616440922021866,
"epoch": 2.9339366515837106,
"grad_norm": 0.5790680050849915,
"learning_rate": 0.000151811642672942,
"loss": 0.09595339745283127,
"mean_token_accuracy": 0.9808698296546936,
"num_tokens": 7166656.0,
"step": 812
},
{
"entropy": 1.0721414238214493,
"epoch": 2.937556561085973,
"grad_norm": 0.44250181317329407,
"learning_rate": 0.00015169508851871835,
"loss": 0.05376347899436951,
"mean_token_accuracy": 0.9847523421049118,
"num_tokens": 7175308.0,
"step": 813
},
{
"entropy": 1.0640469789505005,
"epoch": 2.9411764705882355,
"grad_norm": 0.4585307240486145,
"learning_rate": 0.00015157844525751213,
"loss": 0.05569484457373619,
"mean_token_accuracy": 0.9836412966251373,
"num_tokens": 7184404.0,
"step": 814
},
{
"entropy": 1.1266240775585175,
"epoch": 2.944796380090498,
"grad_norm": 0.5807170271873474,
"learning_rate": 0.00015146171313860284,
"loss": 0.04534313082695007,
"mean_token_accuracy": 0.9861873239278793,
"num_tokens": 7192704.0,
"step": 815
},
{
"entropy": 1.04159614443779,
"epoch": 2.9484162895927604,
"grad_norm": 0.6643418073654175,
"learning_rate": 0.00015134489241145984,
"loss": 0.08871917426586151,
"mean_token_accuracy": 0.9774074703454971,
"num_tokens": 7201775.0,
"step": 816
},
{
"entropy": 1.059510976076126,
"epoch": 2.952036199095023,
"grad_norm": 0.6120842695236206,
"learning_rate": 0.00015122798332574183,
"loss": 0.10533600300550461,
"mean_token_accuracy": 0.9733265042304993,
"num_tokens": 7210515.0,
"step": 817
},
{
"entropy": 1.126617282629013,
"epoch": 2.9556561085972852,
"grad_norm": 0.605033814907074,
"learning_rate": 0.00015111098613129637,
"loss": 0.07193339616060257,
"mean_token_accuracy": 0.9838567227125168,
"num_tokens": 7218761.0,
"step": 818
},
{
"entropy": 1.1339809894561768,
"epoch": 2.9592760180995477,
"grad_norm": 0.7737305164337158,
"learning_rate": 0.0001509939010781593,
"loss": 0.1462799310684204,
"mean_token_accuracy": 0.9620816707611084,
"num_tokens": 7227422.0,
"step": 819
},
{
"entropy": 1.0626371949911118,
"epoch": 2.96289592760181,
"grad_norm": 0.4753362834453583,
"learning_rate": 0.0001508767284165543,
"loss": 0.06507248431444168,
"mean_token_accuracy": 0.9830497950315475,
"num_tokens": 7236243.0,
"step": 820
},
{
"entropy": 1.0648977309465408,
"epoch": 2.9665158371040725,
"grad_norm": 0.6420553922653198,
"learning_rate": 0.0001507594683968921,
"loss": 0.0898992195725441,
"mean_token_accuracy": 0.9699191451072693,
"num_tokens": 7245278.0,
"step": 821
},
{
"entropy": 1.1461690664291382,
"epoch": 2.970135746606335,
"grad_norm": 0.738278865814209,
"learning_rate": 0.0001506421212697703,
"loss": 0.10916579514741898,
"mean_token_accuracy": 0.9676524996757507,
"num_tokens": 7254069.0,
"step": 822
},
{
"entropy": 1.1318964213132858,
"epoch": 2.9737556561085974,
"grad_norm": 0.5524135231971741,
"learning_rate": 0.00015052468728597265,
"loss": 0.0710684061050415,
"mean_token_accuracy": 0.9867733418941498,
"num_tokens": 7262846.0,
"step": 823
},
{
"entropy": 1.190606713294983,
"epoch": 2.97737556561086,
"grad_norm": 0.5717188119888306,
"learning_rate": 0.00015040716669646837,
"loss": 0.09186422824859619,
"mean_token_accuracy": 0.9754447788000107,
"num_tokens": 7271169.0,
"step": 824
},
{
"entropy": 1.147791177034378,
"epoch": 2.9809954751131222,
"grad_norm": 0.5148852467536926,
"learning_rate": 0.0001502895597524119,
"loss": 0.05435680225491524,
"mean_token_accuracy": 0.9840258657932281,
"num_tokens": 7280020.0,
"step": 825
},
{
"entropy": 1.1764528155326843,
"epoch": 2.9846153846153847,
"grad_norm": 0.7775517702102661,
"learning_rate": 0.00015017186670514225,
"loss": 0.09064161032438278,
"mean_token_accuracy": 0.9730498492717743,
"num_tokens": 7288874.0,
"step": 826
},
{
"entropy": 1.1749920845031738,
"epoch": 2.988235294117647,
"grad_norm": 0.40038371086120605,
"learning_rate": 0.0001500540878061823,
"loss": 0.07684603333473206,
"mean_token_accuracy": 0.9770887941122055,
"num_tokens": 7298183.0,
"step": 827
},
{
"entropy": 1.1883585751056671,
"epoch": 2.9918552036199095,
"grad_norm": 0.34793826937675476,
"learning_rate": 0.00014993622330723857,
"loss": 0.046813275665044785,
"mean_token_accuracy": 0.9880518466234207,
"num_tokens": 7306947.0,
"step": 828
},
{
"entropy": 1.204830139875412,
"epoch": 2.995475113122172,
"grad_norm": 0.6548672318458557,
"learning_rate": 0.00014981827346020033,
"loss": 0.1149185448884964,
"mean_token_accuracy": 0.9740531295537949,
"num_tokens": 7315487.0,
"step": 829
},
{
"entropy": 1.1531266868114471,
"epoch": 2.9990950226244344,
"grad_norm": 0.40315133333206177,
"learning_rate": 0.00014970023851713945,
"loss": 0.07841235399246216,
"mean_token_accuracy": 0.9789654463529587,
"num_tokens": 7324480.0,
"step": 830
},
{
"entropy": 1.1628870964050293,
"epoch": 3.0,
"grad_norm": 1.4110596179962158,
"learning_rate": 0.0001495821187303095,
"loss": 0.05664234235882759,
"mean_token_accuracy": 0.981249988079071,
"num_tokens": 7325175.0,
"step": 831
},
{
"epoch": 3.0,
"eval_entropy": 1.1410768734730357,
"eval_loss": 0.12047121673822403,
"eval_mean_token_accuracy": 0.9686469343619618,
"eval_num_tokens": 7325175.0,
"eval_runtime": 31.7876,
"eval_samples_per_second": 11.608,
"eval_steps_per_second": 3.869,
"step": 831
},
{
"entropy": 1.1343726515769958,
"epoch": 3.0036199095022624,
"grad_norm": 0.49150392413139343,
"learning_rate": 0.00014946391435214555,
"loss": 0.06470951437950134,
"mean_token_accuracy": 0.9820976257324219,
"num_tokens": 7334031.0,
"step": 832
},
{
"entropy": 1.0768441557884216,
"epoch": 3.007239819004525,
"grad_norm": 0.37447497248649597,
"learning_rate": 0.0001493456256352632,
"loss": 0.05001193284988403,
"mean_token_accuracy": 0.9848204553127289,
"num_tokens": 7342799.0,
"step": 833
},
{
"entropy": 1.1359966397285461,
"epoch": 3.0108597285067873,
"grad_norm": 0.4405897557735443,
"learning_rate": 0.00014922725283245846,
"loss": 0.05152616277337074,
"mean_token_accuracy": 0.9811764508485794,
"num_tokens": 7351473.0,
"step": 834
},
{
"entropy": 1.0891236364841461,
"epoch": 3.0144796380090497,
"grad_norm": 0.5045367479324341,
"learning_rate": 0.00014910879619670704,
"loss": 0.0627930611371994,
"mean_token_accuracy": 0.981208473443985,
"num_tokens": 7360361.0,
"step": 835
},
{
"entropy": 1.0107998549938202,
"epoch": 3.018099547511312,
"grad_norm": 0.574455976486206,
"learning_rate": 0.00014899025598116378,
"loss": 0.05342878773808479,
"mean_token_accuracy": 0.9806361496448517,
"num_tokens": 7369519.0,
"step": 836
},
{
"entropy": 1.087983787059784,
"epoch": 3.0217194570135746,
"grad_norm": 0.49305710196495056,
"learning_rate": 0.00014887163243916212,
"loss": 0.04466189816594124,
"mean_token_accuracy": 0.9900805652141571,
"num_tokens": 7378108.0,
"step": 837
},
{
"entropy": 0.9926144480705261,
"epoch": 3.025339366515837,
"grad_norm": 0.5207587480545044,
"learning_rate": 0.00014875292582421361,
"loss": 0.07304289937019348,
"mean_token_accuracy": 0.9777405709028244,
"num_tokens": 7387315.0,
"step": 838
},
{
"entropy": 1.00730961561203,
"epoch": 3.0289592760180994,
"grad_norm": 0.7438811659812927,
"learning_rate": 0.00014863413639000728,
"loss": 0.05742796137928963,
"mean_token_accuracy": 0.9818450957536697,
"num_tokens": 7396562.0,
"step": 839
},
{
"entropy": 0.9555287957191467,
"epoch": 3.032579185520362,
"grad_norm": 0.3822678029537201,
"learning_rate": 0.00014851526439040922,
"loss": 0.03751285746693611,
"mean_token_accuracy": 0.9855490773916245,
"num_tokens": 7405764.0,
"step": 840
},
{
"entropy": 0.974100261926651,
"epoch": 3.0361990950226243,
"grad_norm": 0.6839661598205566,
"learning_rate": 0.000148396310079462,
"loss": 0.08734611421823502,
"mean_token_accuracy": 0.9757889807224274,
"num_tokens": 7414847.0,
"step": 841
},
{
"entropy": 0.9459673017263412,
"epoch": 3.0398190045248867,
"grad_norm": 0.5624176263809204,
"learning_rate": 0.00014827727371138392,
"loss": 0.07012228667736053,
"mean_token_accuracy": 0.9756064116954803,
"num_tokens": 7424403.0,
"step": 842
},
{
"entropy": 0.9462547451257706,
"epoch": 3.043438914027149,
"grad_norm": 0.6111705899238586,
"learning_rate": 0.00014815815554056888,
"loss": 0.046397194266319275,
"mean_token_accuracy": 0.9845404624938965,
"num_tokens": 7433163.0,
"step": 843
},
{
"entropy": 0.9449738264083862,
"epoch": 3.0470588235294116,
"grad_norm": 0.48121562600135803,
"learning_rate": 0.0001480389558215855,
"loss": 0.06078047305345535,
"mean_token_accuracy": 0.9810367077589035,
"num_tokens": 7441782.0,
"step": 844
},
{
"entropy": 1.069021388888359,
"epoch": 3.050678733031674,
"grad_norm": 0.7874725461006165,
"learning_rate": 0.00014791967480917657,
"loss": 0.08393625169992447,
"mean_token_accuracy": 0.9736702889204025,
"num_tokens": 7449859.0,
"step": 845
},
{
"entropy": 0.9736010432243347,
"epoch": 3.0542986425339365,
"grad_norm": 0.6149342656135559,
"learning_rate": 0.00014780031275825873,
"loss": 0.11650148779153824,
"mean_token_accuracy": 0.9748559445142746,
"num_tokens": 7458763.0,
"step": 846
},
{
"entropy": 0.9672216325998306,
"epoch": 3.057918552036199,
"grad_norm": 0.7002614140510559,
"learning_rate": 0.00014768086992392187,
"loss": 0.06093838810920715,
"mean_token_accuracy": 0.9850212782621384,
"num_tokens": 7467393.0,
"step": 847
},
{
"entropy": 1.0159880816936493,
"epoch": 3.0615384615384613,
"grad_norm": 0.5158461332321167,
"learning_rate": 0.00014756134656142842,
"loss": 0.03521204739809036,
"mean_token_accuracy": 0.9884093105792999,
"num_tokens": 7475902.0,
"step": 848
},
{
"entropy": 0.9773909002542496,
"epoch": 3.065158371040724,
"grad_norm": 0.6164082288742065,
"learning_rate": 0.00014744174292621284,
"loss": 0.09098848700523376,
"mean_token_accuracy": 0.9765210151672363,
"num_tokens": 7484956.0,
"step": 849
},
{
"entropy": 0.9457692354917526,
"epoch": 3.0687782805429866,
"grad_norm": 0.49424877762794495,
"learning_rate": 0.00014732205927388135,
"loss": 0.07613382488489151,
"mean_token_accuracy": 0.9721274673938751,
"num_tokens": 7494091.0,
"step": 850
},
{
"entropy": 1.0335682481527328,
"epoch": 3.072398190045249,
"grad_norm": 0.5534892082214355,
"learning_rate": 0.00014720229586021098,
"loss": 0.06013672798871994,
"mean_token_accuracy": 0.9849141836166382,
"num_tokens": 7502371.0,
"step": 851
},
{
"entropy": 1.015167698264122,
"epoch": 3.0760180995475115,
"grad_norm": 0.6271040439605713,
"learning_rate": 0.00014708245294114933,
"loss": 0.08082443475723267,
"mean_token_accuracy": 0.9719363301992416,
"num_tokens": 7511051.0,
"step": 852
},
{
"entropy": 0.8989782184362411,
"epoch": 3.079638009049774,
"grad_norm": 0.4136255979537964,
"learning_rate": 0.00014696253077281385,
"loss": 0.046568505465984344,
"mean_token_accuracy": 0.9837394803762436,
"num_tokens": 7520417.0,
"step": 853
},
{
"entropy": 1.008704587817192,
"epoch": 3.0832579185520363,
"grad_norm": 0.664092481136322,
"learning_rate": 0.00014684252961149144,
"loss": 0.08081446588039398,
"mean_token_accuracy": 0.9798438847064972,
"num_tokens": 7529050.0,
"step": 854
},
{
"entropy": 0.9838613867759705,
"epoch": 3.086877828054299,
"grad_norm": 0.6071294546127319,
"learning_rate": 0.00014672244971363768,
"loss": 0.05976680666208267,
"mean_token_accuracy": 0.9791270047426224,
"num_tokens": 7537868.0,
"step": 855
},
{
"entropy": 1.026266947388649,
"epoch": 3.090497737556561,
"grad_norm": 0.8460846543312073,
"learning_rate": 0.00014660229133587653,
"loss": 0.0821947306394577,
"mean_token_accuracy": 0.9793416410684586,
"num_tokens": 7546543.0,
"step": 856
},
{
"entropy": 0.9946709871292114,
"epoch": 3.0941176470588236,
"grad_norm": 0.5959231853485107,
"learning_rate": 0.00014648205473499963,
"loss": 0.13812144100666046,
"mean_token_accuracy": 0.9636797457933426,
"num_tokens": 7556035.0,
"step": 857
},
{
"entropy": 0.9716462343931198,
"epoch": 3.097737556561086,
"grad_norm": 0.30362147092819214,
"learning_rate": 0.00014636174016796583,
"loss": 0.020999953150749207,
"mean_token_accuracy": 0.9924765825271606,
"num_tokens": 7564738.0,
"step": 858
},
{
"entropy": 1.0234932005405426,
"epoch": 3.1013574660633485,
"grad_norm": 0.6290194988250732,
"learning_rate": 0.0001462413478919006,
"loss": 0.056703150272369385,
"mean_token_accuracy": 0.9818439483642578,
"num_tokens": 7573699.0,
"step": 859
},
{
"entropy": 1.0116705000400543,
"epoch": 3.104977375565611,
"grad_norm": 0.5160185098648071,
"learning_rate": 0.00014612087816409533,
"loss": 0.058757197111845016,
"mean_token_accuracy": 0.9875614196062088,
"num_tokens": 7582626.0,
"step": 860
},
{
"entropy": 1.0897808521986008,
"epoch": 3.1085972850678734,
"grad_norm": 0.5133566856384277,
"learning_rate": 0.00014600033124200718,
"loss": 0.08064649254083633,
"mean_token_accuracy": 0.9845390319824219,
"num_tokens": 7591273.0,
"step": 861
},
{
"entropy": 1.081803947687149,
"epoch": 3.112217194570136,
"grad_norm": 1.003861904144287,
"learning_rate": 0.00014587970738325808,
"loss": 0.1418905109167099,
"mean_token_accuracy": 0.9711157530546188,
"num_tokens": 7600159.0,
"step": 862
},
{
"entropy": 1.0890114307403564,
"epoch": 3.1158371040723982,
"grad_norm": 0.441112220287323,
"learning_rate": 0.00014575900684563452,
"loss": 0.05492641404271126,
"mean_token_accuracy": 0.9856597781181335,
"num_tokens": 7608829.0,
"step": 863
},
{
"entropy": 1.0998935103416443,
"epoch": 3.1194570135746607,
"grad_norm": 0.749271035194397,
"learning_rate": 0.0001456382298870868,
"loss": 0.13943280279636383,
"mean_token_accuracy": 0.9622896611690521,
"num_tokens": 7617472.0,
"step": 864
},
{
"entropy": 1.0612820237874985,
"epoch": 3.123076923076923,
"grad_norm": 0.4013805091381073,
"learning_rate": 0.00014551737676572846,
"loss": 0.037314120680093765,
"mean_token_accuracy": 0.9867268055677414,
"num_tokens": 7626380.0,
"step": 865
},
{
"entropy": 1.0484765768051147,
"epoch": 3.1266968325791855,
"grad_norm": 0.4148269593715668,
"learning_rate": 0.00014539644773983599,
"loss": 0.0693223774433136,
"mean_token_accuracy": 0.9800146818161011,
"num_tokens": 7635156.0,
"step": 866
},
{
"entropy": 1.0329798758029938,
"epoch": 3.130316742081448,
"grad_norm": 0.33298373222351074,
"learning_rate": 0.00014527544306784792,
"loss": 0.038426462560892105,
"mean_token_accuracy": 0.9857963621616364,
"num_tokens": 7644301.0,
"step": 867
},
{
"entropy": 1.0733687579631805,
"epoch": 3.1339366515837104,
"grad_norm": 0.5466052889823914,
"learning_rate": 0.0001451543630083646,
"loss": 0.03809817135334015,
"mean_token_accuracy": 0.9909420758485794,
"num_tokens": 7652600.0,
"step": 868
},
{
"entropy": 1.0370618999004364,
"epoch": 3.137556561085973,
"grad_norm": 0.5771836042404175,
"learning_rate": 0.00014503320782014735,
"loss": 0.07035155594348907,
"mean_token_accuracy": 0.9797674417495728,
"num_tokens": 7661663.0,
"step": 869
},
{
"entropy": 1.0356185287237167,
"epoch": 3.1411764705882352,
"grad_norm": 0.4902818202972412,
"learning_rate": 0.0001449119777621181,
"loss": 0.03925105184316635,
"mean_token_accuracy": 0.9836414456367493,
"num_tokens": 7670373.0,
"step": 870
},
{
"entropy": 1.0262203961610794,
"epoch": 3.1447963800904977,
"grad_norm": 0.6004799604415894,
"learning_rate": 0.00014479067309335888,
"loss": 0.055851537734270096,
"mean_token_accuracy": 0.9868045449256897,
"num_tokens": 7679173.0,
"step": 871
},
{
"entropy": 1.0273671448230743,
"epoch": 3.14841628959276,
"grad_norm": 0.4942399263381958,
"learning_rate": 0.00014466929407311102,
"loss": 0.07361710071563721,
"mean_token_accuracy": 0.9777687937021255,
"num_tokens": 7687847.0,
"step": 872
},
{
"entropy": 1.0208537876605988,
"epoch": 3.1520361990950225,
"grad_norm": 0.47028565406799316,
"learning_rate": 0.0001445478409607748,
"loss": 0.06834034621715546,
"mean_token_accuracy": 0.9840661883354187,
"num_tokens": 7696853.0,
"step": 873
},
{
"entropy": 0.9666854441165924,
"epoch": 3.155656108597285,
"grad_norm": 0.4551008939743042,
"learning_rate": 0.00014442631401590889,
"loss": 0.05758052319288254,
"mean_token_accuracy": 0.9800277948379517,
"num_tokens": 7706188.0,
"step": 874
},
{
"entropy": 1.0102877765893936,
"epoch": 3.1592760180995474,
"grad_norm": 0.4480641186237335,
"learning_rate": 0.00014430471349822973,
"loss": 0.04771336168050766,
"mean_token_accuracy": 0.9888424724340439,
"num_tokens": 7714582.0,
"step": 875
},
{
"entropy": 1.0120342820882797,
"epoch": 3.16289592760181,
"grad_norm": 0.4889662563800812,
"learning_rate": 0.00014418303966761095,
"loss": 0.04983747750520706,
"mean_token_accuracy": 0.984560638666153,
"num_tokens": 7723527.0,
"step": 876
},
{
"entropy": 1.0261083096265793,
"epoch": 3.1665158371040723,
"grad_norm": 0.5745841860771179,
"learning_rate": 0.0001440612927840829,
"loss": 0.03039124421775341,
"mean_token_accuracy": 0.99087293446064,
"num_tokens": 7731911.0,
"step": 877
},
{
"entropy": 0.9984003603458405,
"epoch": 3.1701357466063347,
"grad_norm": 0.6179273724555969,
"learning_rate": 0.00014393947310783204,
"loss": 0.06010914221405983,
"mean_token_accuracy": 0.9845625460147858,
"num_tokens": 7740814.0,
"step": 878
},
{
"entropy": 0.9657698571681976,
"epoch": 3.173755656108597,
"grad_norm": 0.6100254654884338,
"learning_rate": 0.00014381758089920037,
"loss": 0.07441750913858414,
"mean_token_accuracy": 0.9785300642251968,
"num_tokens": 7750064.0,
"step": 879
},
{
"entropy": 0.9885086268186569,
"epoch": 3.1773755656108595,
"grad_norm": 0.5612260103225708,
"learning_rate": 0.00014369561641868497,
"loss": 0.04010923579335213,
"mean_token_accuracy": 0.9847719967365265,
"num_tokens": 7758761.0,
"step": 880
},
{
"entropy": 0.9970448017120361,
"epoch": 3.180995475113122,
"grad_norm": 0.741584062576294,
"learning_rate": 0.00014357357992693726,
"loss": 0.22217880189418793,
"mean_token_accuracy": 0.9596049934625626,
"num_tokens": 7768137.0,
"step": 881
},
{
"entropy": 1.0056591778993607,
"epoch": 3.184615384615385,
"grad_norm": 0.6077441573143005,
"learning_rate": 0.0001434514716847627,
"loss": 0.06942348182201385,
"mean_token_accuracy": 0.9832926243543625,
"num_tokens": 7776809.0,
"step": 882
},
{
"entropy": 0.9155485332012177,
"epoch": 3.1882352941176473,
"grad_norm": 0.49946412444114685,
"learning_rate": 0.00014332929195311997,
"loss": 0.05635019764304161,
"mean_token_accuracy": 0.9822289496660233,
"num_tokens": 7786610.0,
"step": 883
},
{
"entropy": 1.0032794624567032,
"epoch": 3.1918552036199097,
"grad_norm": 0.508366584777832,
"learning_rate": 0.00014320704099312053,
"loss": 0.0505295991897583,
"mean_token_accuracy": 0.9848868101835251,
"num_tokens": 7795865.0,
"step": 884
},
{
"entropy": 1.02556973695755,
"epoch": 3.195475113122172,
"grad_norm": 0.5340198278427124,
"learning_rate": 0.00014308471906602824,
"loss": 0.04925612732768059,
"mean_token_accuracy": 0.9854798167943954,
"num_tokens": 7804601.0,
"step": 885
},
{
"entropy": 0.9870988875627518,
"epoch": 3.1990950226244346,
"grad_norm": 0.5888339877128601,
"learning_rate": 0.00014296232643325836,
"loss": 0.0761546716094017,
"mean_token_accuracy": 0.9741184711456299,
"num_tokens": 7813665.0,
"step": 886
},
{
"entropy": 1.003835067152977,
"epoch": 3.202714932126697,
"grad_norm": 0.5220237970352173,
"learning_rate": 0.00014283986335637743,
"loss": 0.07831569761037827,
"mean_token_accuracy": 0.9766727834939957,
"num_tokens": 7822827.0,
"step": 887
},
{
"entropy": 0.994221106171608,
"epoch": 3.2063348416289594,
"grad_norm": 0.38538306951522827,
"learning_rate": 0.00014271733009710245,
"loss": 0.04108966886997223,
"mean_token_accuracy": 0.9864413440227509,
"num_tokens": 7832062.0,
"step": 888
},
{
"entropy": 1.0374914705753326,
"epoch": 3.209954751131222,
"grad_norm": 0.49359604716300964,
"learning_rate": 0.0001425947269173006,
"loss": 0.07969976961612701,
"mean_token_accuracy": 0.9739227145910263,
"num_tokens": 7840683.0,
"step": 889
},
{
"entropy": 1.0366221368312836,
"epoch": 3.2135746606334843,
"grad_norm": 0.6392092704772949,
"learning_rate": 0.00014247205407898813,
"loss": 0.07518687844276428,
"mean_token_accuracy": 0.9690025746822357,
"num_tokens": 7849354.0,
"step": 890
},
{
"entropy": 1.0584574043750763,
"epoch": 3.2171945701357467,
"grad_norm": 0.5655986666679382,
"learning_rate": 0.0001423493118443305,
"loss": 0.05813714116811752,
"mean_token_accuracy": 0.9855156242847443,
"num_tokens": 7857837.0,
"step": 891
},
{
"entropy": 1.0073893517255783,
"epoch": 3.220814479638009,
"grad_norm": 0.5855907201766968,
"learning_rate": 0.00014222650047564128,
"loss": 0.060413941740989685,
"mean_token_accuracy": 0.983014851808548,
"num_tokens": 7866804.0,
"step": 892
},
{
"entropy": 1.0234777629375458,
"epoch": 3.2244343891402716,
"grad_norm": 0.5941819548606873,
"learning_rate": 0.00014210362023538194,
"loss": 0.07761058211326599,
"mean_token_accuracy": 0.977308601140976,
"num_tokens": 7875824.0,
"step": 893
},
{
"entropy": 0.985499694943428,
"epoch": 3.228054298642534,
"grad_norm": 0.3823990523815155,
"learning_rate": 0.00014198067138616096,
"loss": 0.03702878952026367,
"mean_token_accuracy": 0.9904143661260605,
"num_tokens": 7884781.0,
"step": 894
},
{
"entropy": 1.0092906057834625,
"epoch": 3.2316742081447964,
"grad_norm": 0.6509421467781067,
"learning_rate": 0.00014185765419073352,
"loss": 0.07195041328668594,
"mean_token_accuracy": 0.9753110408782959,
"num_tokens": 7894018.0,
"step": 895
},
{
"entropy": 1.0174790173768997,
"epoch": 3.235294117647059,
"grad_norm": 0.5773764848709106,
"learning_rate": 0.00014173456891200097,
"loss": 0.0730120837688446,
"mean_token_accuracy": 0.9764125943183899,
"num_tokens": 7903077.0,
"step": 896
},
{
"entropy": 1.0282800495624542,
"epoch": 3.2389140271493213,
"grad_norm": 0.46587687730789185,
"learning_rate": 0.00014161141581300993,
"loss": 0.022020503878593445,
"mean_token_accuracy": 0.9938695281744003,
"num_tokens": 7911650.0,
"step": 897
},
{
"entropy": 0.9804821610450745,
"epoch": 3.2425339366515837,
"grad_norm": 0.571721076965332,
"learning_rate": 0.00014148819515695226,
"loss": 0.07927205413579941,
"mean_token_accuracy": 0.9796456694602966,
"num_tokens": 7921082.0,
"step": 898
},
{
"entropy": 1.031940370798111,
"epoch": 3.246153846153846,
"grad_norm": 0.49029776453971863,
"learning_rate": 0.0001413649072071639,
"loss": 0.02996247261762619,
"mean_token_accuracy": 0.9895152151584625,
"num_tokens": 7929219.0,
"step": 899
},
{
"entropy": 0.9698035717010498,
"epoch": 3.2497737556561086,
"grad_norm": 0.7558808922767639,
"learning_rate": 0.00014124155222712477,
"loss": 0.07464667409658432,
"mean_token_accuracy": 0.9750736951828003,
"num_tokens": 7938458.0,
"step": 900
},
{
"entropy": 1.0520759522914886,
"epoch": 3.253393665158371,
"grad_norm": 0.6176110506057739,
"learning_rate": 0.00014111813048045804,
"loss": 0.08047696202993393,
"mean_token_accuracy": 0.9749610275030136,
"num_tokens": 7947000.0,
"step": 901
},
{
"entropy": 1.073697492480278,
"epoch": 3.2570135746606335,
"grad_norm": 0.9040740132331848,
"learning_rate": 0.00014099464223092951,
"loss": 0.09678997844457626,
"mean_token_accuracy": 0.9677305668592453,
"num_tokens": 7955392.0,
"step": 902
},
{
"entropy": 0.9997714757919312,
"epoch": 3.260633484162896,
"grad_norm": 0.4019973576068878,
"learning_rate": 0.00014087108774244714,
"loss": 0.05037511885166168,
"mean_token_accuracy": 0.9856746792793274,
"num_tokens": 7963894.0,
"step": 903
},
{
"entropy": 1.0553741455078125,
"epoch": 3.2642533936651583,
"grad_norm": 0.49038296937942505,
"learning_rate": 0.00014074746727906046,
"loss": 0.05171579495072365,
"mean_token_accuracy": 0.9860897809267044,
"num_tokens": 7972712.0,
"step": 904
},
{
"entropy": 0.9860815703868866,
"epoch": 3.2678733031674208,
"grad_norm": 0.4992769658565521,
"learning_rate": 0.00014062378110495989,
"loss": 0.05135425552725792,
"mean_token_accuracy": 0.9861588776111603,
"num_tokens": 7981964.0,
"step": 905
},
{
"entropy": 1.0063879191875458,
"epoch": 3.271493212669683,
"grad_norm": 0.5779732465744019,
"learning_rate": 0.00014050002948447644,
"loss": 0.07176823914051056,
"mean_token_accuracy": 0.9783128350973129,
"num_tokens": 7990478.0,
"step": 906
},
{
"entropy": 1.0536756813526154,
"epoch": 3.2751131221719456,
"grad_norm": 0.35298392176628113,
"learning_rate": 0.00014037621268208093,
"loss": 0.0257625300437212,
"mean_token_accuracy": 0.9923644959926605,
"num_tokens": 7999213.0,
"step": 907
},
{
"entropy": 1.0033773183822632,
"epoch": 3.278733031674208,
"grad_norm": 0.5744296312332153,
"learning_rate": 0.00014025233096238337,
"loss": 0.05902718007564545,
"mean_token_accuracy": 0.9848368018865585,
"num_tokens": 8008151.0,
"step": 908
},
{
"entropy": 1.0164627432823181,
"epoch": 3.2823529411764705,
"grad_norm": 0.625365138053894,
"learning_rate": 0.0001401283845901327,
"loss": 0.05885080248117447,
"mean_token_accuracy": 0.9817720204591751,
"num_tokens": 8016336.0,
"step": 909
},
{
"entropy": 0.9507493227720261,
"epoch": 3.285972850678733,
"grad_norm": 0.5023903250694275,
"learning_rate": 0.00014000437383021586,
"loss": 0.04318719357252121,
"mean_token_accuracy": 0.9899467080831528,
"num_tokens": 8025425.0,
"step": 910
},
{
"entropy": 0.9879760593175888,
"epoch": 3.2895927601809953,
"grad_norm": 0.629055380821228,
"learning_rate": 0.00013988029894765748,
"loss": 0.08269868046045303,
"mean_token_accuracy": 0.9814896881580353,
"num_tokens": 8034580.0,
"step": 911
},
{
"entropy": 1.0588575601577759,
"epoch": 3.2932126696832578,
"grad_norm": 0.6117694973945618,
"learning_rate": 0.00013975616020761922,
"loss": 0.06348450481891632,
"mean_token_accuracy": 0.9816050231456757,
"num_tokens": 8042841.0,
"step": 912
},
{
"entropy": 1.0400816798210144,
"epoch": 3.29683257918552,
"grad_norm": 0.5118631720542908,
"learning_rate": 0.0001396319578753992,
"loss": 0.040266815572977066,
"mean_token_accuracy": 0.9846054464578629,
"num_tokens": 8051443.0,
"step": 913
},
{
"entropy": 1.1316498517990112,
"epoch": 3.3004524886877826,
"grad_norm": 0.7367774248123169,
"learning_rate": 0.0001395076922164314,
"loss": 0.08550389111042023,
"mean_token_accuracy": 0.9712842255830765,
"num_tokens": 8059492.0,
"step": 914
},
{
"entropy": 1.084427922964096,
"epoch": 3.304072398190045,
"grad_norm": 0.7586991786956787,
"learning_rate": 0.00013938336349628524,
"loss": 0.06349455565214157,
"mean_token_accuracy": 0.9829322099685669,
"num_tokens": 8067797.0,
"step": 915
},
{
"entropy": 1.0688078999519348,
"epoch": 3.3076923076923075,
"grad_norm": 0.5953513383865356,
"learning_rate": 0.0001392589719806648,
"loss": 0.0380900502204895,
"mean_token_accuracy": 0.9876203536987305,
"num_tokens": 8075988.0,
"step": 916
},
{
"entropy": 1.0175382196903229,
"epoch": 3.31131221719457,
"grad_norm": 0.5219187140464783,
"learning_rate": 0.00013913451793540844,
"loss": 0.026124723255634308,
"mean_token_accuracy": 0.9945862740278244,
"num_tokens": 8084620.0,
"step": 917
},
{
"entropy": 1.0590780079364777,
"epoch": 3.3149321266968323,
"grad_norm": 0.4491603672504425,
"learning_rate": 0.0001390100016264881,
"loss": 0.051901232451200485,
"mean_token_accuracy": 0.9856987297534943,
"num_tokens": 8093124.0,
"step": 918
},
{
"entropy": 0.9357586652040482,
"epoch": 3.318552036199095,
"grad_norm": 0.5711967945098877,
"learning_rate": 0.00013888542332000882,
"loss": 0.05333176627755165,
"mean_token_accuracy": 0.9833570569753647,
"num_tokens": 8102615.0,
"step": 919
},
{
"entropy": 1.0148942023515701,
"epoch": 3.3221719457013577,
"grad_norm": 0.6172460913658142,
"learning_rate": 0.0001387607832822081,
"loss": 0.07704110443592072,
"mean_token_accuracy": 0.9769087731838226,
"num_tokens": 8111712.0,
"step": 920
},
{
"entropy": 1.005068227648735,
"epoch": 3.32579185520362,
"grad_norm": 0.48708808422088623,
"learning_rate": 0.0001386360817794554,
"loss": 0.057994648814201355,
"mean_token_accuracy": 0.9857281446456909,
"num_tokens": 8120661.0,
"step": 921
},
{
"entropy": 1.0857775509357452,
"epoch": 3.3294117647058825,
"grad_norm": 0.521118700504303,
"learning_rate": 0.00013851131907825152,
"loss": 0.0432361476123333,
"mean_token_accuracy": 0.9871759116649628,
"num_tokens": 8128962.0,
"step": 922
},
{
"entropy": 0.9878742694854736,
"epoch": 3.333031674208145,
"grad_norm": 0.5457652807235718,
"learning_rate": 0.00013838649544522803,
"loss": 0.057160355150699615,
"mean_token_accuracy": 0.9802255481481552,
"num_tokens": 8138079.0,
"step": 923
},
{
"entropy": 0.9153740406036377,
"epoch": 3.3366515837104074,
"grad_norm": 0.6113324761390686,
"learning_rate": 0.00013826161114714682,
"loss": 0.05363360047340393,
"mean_token_accuracy": 0.9866549521684647,
"num_tokens": 8147296.0,
"step": 924
},
{
"entropy": 0.9832455366849899,
"epoch": 3.34027149321267,
"grad_norm": 0.7847388386726379,
"learning_rate": 0.00013813666645089926,
"loss": 0.10442715883255005,
"mean_token_accuracy": 0.9688615947961807,
"num_tokens": 8156139.0,
"step": 925
},
{
"entropy": 1.0013352185487747,
"epoch": 3.3438914027149322,
"grad_norm": 0.4540573060512543,
"learning_rate": 0.0001380116616235059,
"loss": 0.04038259759545326,
"mean_token_accuracy": 0.9876029193401337,
"num_tokens": 8164879.0,
"step": 926
},
{
"entropy": 0.9603820294141769,
"epoch": 3.3475113122171947,
"grad_norm": 0.5983673930168152,
"learning_rate": 0.00013788659693211584,
"loss": 0.08187659829854965,
"mean_token_accuracy": 0.9774649292230606,
"num_tokens": 8174389.0,
"step": 927
},
{
"entropy": 1.0243980884552002,
"epoch": 3.351131221719457,
"grad_norm": 0.6228734850883484,
"learning_rate": 0.000137761472644006,
"loss": 0.048151057213544846,
"mean_token_accuracy": 0.9839256554841995,
"num_tokens": 8182842.0,
"step": 928
},
{
"entropy": 0.9616681337356567,
"epoch": 3.3547511312217195,
"grad_norm": 0.5719891786575317,
"learning_rate": 0.00013763628902658075,
"loss": 0.03624703735113144,
"mean_token_accuracy": 0.9900500476360321,
"num_tokens": 8192132.0,
"step": 929
},
{
"entropy": 1.0459087640047073,
"epoch": 3.358371040723982,
"grad_norm": 0.6618694067001343,
"learning_rate": 0.0001375110463473712,
"loss": 0.06819941848516464,
"mean_token_accuracy": 0.9793660789728165,
"num_tokens": 8200462.0,
"step": 930
},
{
"entropy": 0.9984315633773804,
"epoch": 3.3619909502262444,
"grad_norm": 0.4620678722858429,
"learning_rate": 0.00013738574487403475,
"loss": 0.034341007471084595,
"mean_token_accuracy": 0.9875331819057465,
"num_tokens": 8209284.0,
"step": 931
},
{
"entropy": 1.0406230092048645,
"epoch": 3.365610859728507,
"grad_norm": 0.5373427867889404,
"learning_rate": 0.00013726038487435436,
"loss": 0.043672651052474976,
"mean_token_accuracy": 0.9890551418066025,
"num_tokens": 8217959.0,
"step": 932
},
{
"entropy": 0.9317538440227509,
"epoch": 3.3692307692307693,
"grad_norm": 0.48770636320114136,
"learning_rate": 0.00013713496661623816,
"loss": 0.0499286986887455,
"mean_token_accuracy": 0.9822133630514145,
"num_tokens": 8227583.0,
"step": 933
},
{
"entropy": 0.9775789231061935,
"epoch": 3.3728506787330317,
"grad_norm": 0.5296841263771057,
"learning_rate": 0.00013700949036771874,
"loss": 0.056918881833553314,
"mean_token_accuracy": 0.9848699420690536,
"num_tokens": 8237155.0,
"step": 934
},
{
"entropy": 1.0716052204370499,
"epoch": 3.376470588235294,
"grad_norm": 0.6076134443283081,
"learning_rate": 0.00013688395639695252,
"loss": 0.05968927592039108,
"mean_token_accuracy": 0.9806047230958939,
"num_tokens": 8245583.0,
"step": 935
},
{
"entropy": 1.0416816025972366,
"epoch": 3.3800904977375565,
"grad_norm": 0.45776641368865967,
"learning_rate": 0.00013675836497221953,
"loss": 0.04919914901256561,
"mean_token_accuracy": 0.9881732165813446,
"num_tokens": 8254310.0,
"step": 936
},
{
"entropy": 0.9982000887393951,
"epoch": 3.383710407239819,
"grad_norm": 0.43318697810173035,
"learning_rate": 0.00013663271636192234,
"loss": 0.034604959189891815,
"mean_token_accuracy": 0.9873620271682739,
"num_tokens": 8263199.0,
"step": 937
},
{
"entropy": 1.0052271336317062,
"epoch": 3.3873303167420814,
"grad_norm": 0.44714435935020447,
"learning_rate": 0.00013650701083458585,
"loss": 0.08507149666547775,
"mean_token_accuracy": 0.9845927953720093,
"num_tokens": 8272493.0,
"step": 938
},
{
"entropy": 1.0300282388925552,
"epoch": 3.390950226244344,
"grad_norm": 0.3920552730560303,
"learning_rate": 0.0001363812486588566,
"loss": 0.0559050627052784,
"mean_token_accuracy": 0.9840603917837143,
"num_tokens": 8281216.0,
"step": 939
},
{
"entropy": 1.0108753889799118,
"epoch": 3.3945701357466063,
"grad_norm": 0.5175376534461975,
"learning_rate": 0.0001362554301035021,
"loss": 0.056878913193941116,
"mean_token_accuracy": 0.9844557046890259,
"num_tokens": 8290001.0,
"step": 940
},
{
"entropy": 1.0616537183523178,
"epoch": 3.3981900452488687,
"grad_norm": 0.5512950420379639,
"learning_rate": 0.0001361295554374105,
"loss": 0.05054660886526108,
"mean_token_accuracy": 0.9832082837820053,
"num_tokens": 8298553.0,
"step": 941
},
{
"entropy": 1.1058834791183472,
"epoch": 3.401809954751131,
"grad_norm": 0.4575974941253662,
"learning_rate": 0.00013600362492958976,
"loss": 0.0616161935031414,
"mean_token_accuracy": 0.9793710261583328,
"num_tokens": 8307283.0,
"step": 942
},
{
"entropy": 1.0524688065052032,
"epoch": 3.4054298642533936,
"grad_norm": 0.4527650773525238,
"learning_rate": 0.00013587763884916716,
"loss": 0.045646894723176956,
"mean_token_accuracy": 0.9848221987485886,
"num_tokens": 8316116.0,
"step": 943
},
{
"entropy": 1.0609974563121796,
"epoch": 3.409049773755656,
"grad_norm": 0.6199139952659607,
"learning_rate": 0.0001357515974653888,
"loss": 0.10065983235836029,
"mean_token_accuracy": 0.9717418551445007,
"num_tokens": 8324716.0,
"step": 944
},
{
"entropy": 0.958422839641571,
"epoch": 3.4126696832579184,
"grad_norm": 0.5813620090484619,
"learning_rate": 0.000135625501047619,
"loss": 0.07302285730838776,
"mean_token_accuracy": 0.9771098643541336,
"num_tokens": 8334091.0,
"step": 945
},
{
"entropy": 1.0653769075870514,
"epoch": 3.416289592760181,
"grad_norm": 0.5696032047271729,
"learning_rate": 0.00013549934986533966,
"loss": 0.06528802961111069,
"mean_token_accuracy": 0.9849737137556076,
"num_tokens": 8343075.0,
"step": 946
},
{
"entropy": 1.0103659331798553,
"epoch": 3.4199095022624433,
"grad_norm": 0.36156147718429565,
"learning_rate": 0.0001353731441881496,
"loss": 0.041538454592227936,
"mean_token_accuracy": 0.9860167354345322,
"num_tokens": 8352196.0,
"step": 947
},
{
"entropy": 0.9813169538974762,
"epoch": 3.4235294117647057,
"grad_norm": 0.465168833732605,
"learning_rate": 0.00013524688428576435,
"loss": 0.0380837507545948,
"mean_token_accuracy": 0.9855641424655914,
"num_tokens": 8361142.0,
"step": 948
},
{
"entropy": 0.9958767145872116,
"epoch": 3.427149321266968,
"grad_norm": 0.3552955090999603,
"learning_rate": 0.0001351205704280151,
"loss": 0.026597965508699417,
"mean_token_accuracy": 0.9929200410842896,
"num_tokens": 8370017.0,
"step": 949
},
{
"entropy": 0.970636785030365,
"epoch": 3.430769230769231,
"grad_norm": 0.5098865032196045,
"learning_rate": 0.00013499420288484842,
"loss": 0.03246006369590759,
"mean_token_accuracy": 0.9889640510082245,
"num_tokens": 8379008.0,
"step": 950
},
{
"entropy": 1.012205347418785,
"epoch": 3.4343891402714934,
"grad_norm": 0.7393912672996521,
"learning_rate": 0.00013486778192632574,
"loss": 0.07227301597595215,
"mean_token_accuracy": 0.977458193898201,
"num_tokens": 8387896.0,
"step": 951
},
{
"entropy": 0.9507156610488892,
"epoch": 3.438009049773756,
"grad_norm": 0.6013923287391663,
"learning_rate": 0.0001347413078226224,
"loss": 0.0642247200012207,
"mean_token_accuracy": 0.9781456738710403,
"num_tokens": 8396889.0,
"step": 952
},
{
"entropy": 0.9904819428920746,
"epoch": 3.4416289592760183,
"grad_norm": 0.5121738314628601,
"learning_rate": 0.00013461478084402745,
"loss": 0.04460640996694565,
"mean_token_accuracy": 0.9875061810016632,
"num_tokens": 8405690.0,
"step": 953
},
{
"entropy": 0.9784403592348099,
"epoch": 3.4452488687782807,
"grad_norm": 0.43736669421195984,
"learning_rate": 0.00013448820126094307,
"loss": 0.038092780858278275,
"mean_token_accuracy": 0.9895127415657043,
"num_tokens": 8414500.0,
"step": 954
},
{
"entropy": 0.9568644464015961,
"epoch": 3.448868778280543,
"grad_norm": 0.3654727637767792,
"learning_rate": 0.0001343615693438836,
"loss": 0.029216358438134193,
"mean_token_accuracy": 0.9893685132265091,
"num_tokens": 8423326.0,
"step": 955
},
{
"entropy": 0.927936390042305,
"epoch": 3.4524886877828056,
"grad_norm": 0.4584848880767822,
"learning_rate": 0.0001342348853634754,
"loss": 0.04364006593823433,
"mean_token_accuracy": 0.9881165325641632,
"num_tokens": 8432485.0,
"step": 956
},
{
"entropy": 0.9923946708440781,
"epoch": 3.456108597285068,
"grad_norm": 0.7116801142692566,
"learning_rate": 0.00013410814959045607,
"loss": 0.10070855170488358,
"mean_token_accuracy": 0.9667374640703201,
"num_tokens": 8441344.0,
"step": 957
},
{
"entropy": 0.9566802680492401,
"epoch": 3.4597285067873305,
"grad_norm": 0.5868293046951294,
"learning_rate": 0.00013398136229567383,
"loss": 0.04937519505620003,
"mean_token_accuracy": 0.982713058590889,
"num_tokens": 8450202.0,
"step": 958
},
{
"entropy": 0.9900821000337601,
"epoch": 3.463348416289593,
"grad_norm": 0.4907667338848114,
"learning_rate": 0.00013385452375008704,
"loss": 0.052656762301921844,
"mean_token_accuracy": 0.9915094673633575,
"num_tokens": 8459117.0,
"step": 959
},
{
"entropy": 0.926659345626831,
"epoch": 3.4669683257918553,
"grad_norm": 0.5979401469230652,
"learning_rate": 0.00013372763422476365,
"loss": 0.06121910735964775,
"mean_token_accuracy": 0.9823627024888992,
"num_tokens": 8468190.0,
"step": 960
},
{
"entropy": 0.9869116097688675,
"epoch": 3.4705882352941178,
"grad_norm": 0.47893184423446655,
"learning_rate": 0.00013360069399088044,
"loss": 0.06325532495975494,
"mean_token_accuracy": 0.982891395688057,
"num_tokens": 8477414.0,
"step": 961
},
{
"entropy": 0.9895572513341904,
"epoch": 3.47420814479638,
"grad_norm": 0.6157852411270142,
"learning_rate": 0.00013347370331972272,
"loss": 0.06221451610326767,
"mean_token_accuracy": 0.9826388210058212,
"num_tokens": 8486453.0,
"step": 962
},
{
"entropy": 0.9911331236362457,
"epoch": 3.4778280542986426,
"grad_norm": 0.540799617767334,
"learning_rate": 0.0001333466624826834,
"loss": 0.09207938611507416,
"mean_token_accuracy": 0.9749413877725601,
"num_tokens": 8495741.0,
"step": 963
},
{
"entropy": 0.9546624422073364,
"epoch": 3.481447963800905,
"grad_norm": 0.7074460387229919,
"learning_rate": 0.0001332195717512628,
"loss": 0.05557447671890259,
"mean_token_accuracy": 0.9830146133899689,
"num_tokens": 8505116.0,
"step": 964
},
{
"entropy": 0.980869710445404,
"epoch": 3.4850678733031675,
"grad_norm": 0.49514511227607727,
"learning_rate": 0.00013309243139706772,
"loss": 0.04955790191888809,
"mean_token_accuracy": 0.9866456240415573,
"num_tokens": 8514060.0,
"step": 965
},
{
"entropy": 1.0564606338739395,
"epoch": 3.48868778280543,
"grad_norm": 0.6254670023918152,
"learning_rate": 0.00013296524169181107,
"loss": 0.060992419719696045,
"mean_token_accuracy": 0.9791516810655594,
"num_tokens": 8522527.0,
"step": 966
},
{
"entropy": 1.0110660940408707,
"epoch": 3.4923076923076923,
"grad_norm": 0.49232155084609985,
"learning_rate": 0.00013283800290731114,
"loss": 0.065431147813797,
"mean_token_accuracy": 0.9852920174598694,
"num_tokens": 8531494.0,
"step": 967
},
{
"entropy": 1.02008418738842,
"epoch": 3.4959276018099548,
"grad_norm": 0.5096694827079773,
"learning_rate": 0.0001327107153154913,
"loss": 0.06423597782850266,
"mean_token_accuracy": 0.9796927273273468,
"num_tokens": 8540545.0,
"step": 968
},
{
"entropy": 1.0431571304798126,
"epoch": 3.499547511312217,
"grad_norm": 0.5744512677192688,
"learning_rate": 0.00013258337918837905,
"loss": 0.07912938296794891,
"mean_token_accuracy": 0.980916902422905,
"num_tokens": 8549127.0,
"step": 969
},
{
"entropy": 0.9832931458950043,
"epoch": 3.5031674208144796,
"grad_norm": 0.5376607775688171,
"learning_rate": 0.00013245599479810564,
"loss": 0.05653414875268936,
"mean_token_accuracy": 0.9854852706193924,
"num_tokens": 8558086.0,
"step": 970
},
{
"entropy": 1.053595095872879,
"epoch": 3.506787330316742,
"grad_norm": 0.6880918741226196,
"learning_rate": 0.00013232856241690555,
"loss": 0.10130049288272858,
"mean_token_accuracy": 0.9788567274808884,
"num_tokens": 8566974.0,
"step": 971
},
{
"entropy": 1.090879499912262,
"epoch": 3.5104072398190045,
"grad_norm": 0.5793285965919495,
"learning_rate": 0.0001322010823171158,
"loss": 0.11286645382642746,
"mean_token_accuracy": 0.9682568907737732,
"num_tokens": 8575648.0,
"step": 972
},
{
"entropy": 1.0895802229642868,
"epoch": 3.514027149321267,
"grad_norm": 0.45659247040748596,
"learning_rate": 0.00013207355477117534,
"loss": 0.04402415081858635,
"mean_token_accuracy": 0.9873204827308655,
"num_tokens": 8584145.0,
"step": 973
},
{
"entropy": 1.0633756816387177,
"epoch": 3.5176470588235293,
"grad_norm": 0.35166093707084656,
"learning_rate": 0.00013194598005162447,
"loss": 0.028780082240700722,
"mean_token_accuracy": 0.9902430325746536,
"num_tokens": 8592757.0,
"step": 974
},
{
"entropy": 0.9252981543540955,
"epoch": 3.521266968325792,
"grad_norm": 0.3973873257637024,
"learning_rate": 0.00013181835843110448,
"loss": 0.04340490698814392,
"mean_token_accuracy": 0.9855764210224152,
"num_tokens": 8602304.0,
"step": 975
},
{
"entropy": 0.9904424697160721,
"epoch": 3.524886877828054,
"grad_norm": 0.5329234600067139,
"learning_rate": 0.0001316906901823567,
"loss": 0.07568858563899994,
"mean_token_accuracy": 0.9819598495960236,
"num_tokens": 8611730.0,
"step": 976
},
{
"entropy": 0.9927194565534592,
"epoch": 3.5285067873303166,
"grad_norm": 0.5585746169090271,
"learning_rate": 0.00013156297557822224,
"loss": 0.06918197870254517,
"mean_token_accuracy": 0.9821109473705292,
"num_tokens": 8620786.0,
"step": 977
},
{
"entropy": 0.9288990050554276,
"epoch": 3.532126696832579,
"grad_norm": 0.530066728591919,
"learning_rate": 0.00013143521489164124,
"loss": 0.06363017857074738,
"mean_token_accuracy": 0.9863996803760529,
"num_tokens": 8630041.0,
"step": 978
},
{
"entropy": 0.9934940189123154,
"epoch": 3.5357466063348415,
"grad_norm": 0.6356056928634644,
"learning_rate": 0.00013130740839565228,
"loss": 0.11892964690923691,
"mean_token_accuracy": 0.9646638482809067,
"num_tokens": 8639083.0,
"step": 979
},
{
"entropy": 1.0539715886116028,
"epoch": 3.539366515837104,
"grad_norm": 0.6019116044044495,
"learning_rate": 0.0001311795563633919,
"loss": 0.05613447353243828,
"mean_token_accuracy": 0.981610581278801,
"num_tokens": 8647585.0,
"step": 980
},
{
"entropy": 0.9783657044172287,
"epoch": 3.5429864253393664,
"grad_norm": 0.5022520422935486,
"learning_rate": 0.00013105165906809394,
"loss": 0.06036650761961937,
"mean_token_accuracy": 0.9770310521125793,
"num_tokens": 8656253.0,
"step": 981
},
{
"entropy": 0.9798354804515839,
"epoch": 3.546606334841629,
"grad_norm": 0.45951077342033386,
"learning_rate": 0.00013092371678308896,
"loss": 0.04008646309375763,
"mean_token_accuracy": 0.9884042888879776,
"num_tokens": 8664814.0,
"step": 982
},
{
"entropy": 1.0052898228168488,
"epoch": 3.5502262443438912,
"grad_norm": 0.9089862108230591,
"learning_rate": 0.0001307957297818036,
"loss": 0.07838708907365799,
"mean_token_accuracy": 0.971778929233551,
"num_tokens": 8672936.0,
"step": 983
},
{
"entropy": 0.9371594786643982,
"epoch": 3.5538461538461537,
"grad_norm": 0.517698347568512,
"learning_rate": 0.00013066769833776026,
"loss": 0.052979908883571625,
"mean_token_accuracy": 0.9832516461610794,
"num_tokens": 8682346.0,
"step": 984
},
{
"entropy": 0.9779858440160751,
"epoch": 3.557466063348416,
"grad_norm": 0.47408440709114075,
"learning_rate": 0.00013053962272457613,
"loss": 0.048960305750370026,
"mean_token_accuracy": 0.9822594523429871,
"num_tokens": 8690870.0,
"step": 985
},
{
"entropy": 0.915733277797699,
"epoch": 3.5610859728506785,
"grad_norm": 0.6086940765380859,
"learning_rate": 0.00013041150321596286,
"loss": 0.07182019203901291,
"mean_token_accuracy": 0.9770045280456543,
"num_tokens": 8699907.0,
"step": 986
},
{
"entropy": 0.9649381190538406,
"epoch": 3.564705882352941,
"grad_norm": 0.5936797261238098,
"learning_rate": 0.00013028334008572588,
"loss": 0.06510870158672333,
"mean_token_accuracy": 0.9784034192562103,
"num_tokens": 8708771.0,
"step": 987
},
{
"entropy": 0.9736950397491455,
"epoch": 3.5683257918552034,
"grad_norm": 0.6395031809806824,
"learning_rate": 0.00013015513360776392,
"loss": 0.09703779220581055,
"mean_token_accuracy": 0.9763300269842148,
"num_tokens": 8717543.0,
"step": 988
},
{
"entropy": 0.9976915121078491,
"epoch": 3.571945701357466,
"grad_norm": 0.725874662399292,
"learning_rate": 0.00013002688405606828,
"loss": 0.09122011065483093,
"mean_token_accuracy": 0.9732316583395004,
"num_tokens": 8726064.0,
"step": 989
},
{
"entropy": 0.998377114534378,
"epoch": 3.5755656108597282,
"grad_norm": 0.6217262148857117,
"learning_rate": 0.0001298985917047224,
"loss": 0.06388919055461884,
"mean_token_accuracy": 0.9831690788269043,
"num_tokens": 8734777.0,
"step": 990
},
{
"entropy": 0.9755560904741287,
"epoch": 3.579185520361991,
"grad_norm": 0.5802934169769287,
"learning_rate": 0.000129770256827901,
"loss": 0.0830259919166565,
"mean_token_accuracy": 0.9810828566551208,
"num_tokens": 8743671.0,
"step": 991
},
{
"entropy": 0.9394370019435883,
"epoch": 3.5828054298642535,
"grad_norm": 0.4319647252559662,
"learning_rate": 0.00012964187969986986,
"loss": 0.029521089047193527,
"mean_token_accuracy": 0.9876839071512222,
"num_tokens": 8752588.0,
"step": 992
},
{
"entropy": 1.0086267590522766,
"epoch": 3.586425339366516,
"grad_norm": 0.44192439317703247,
"learning_rate": 0.00012951346059498505,
"loss": 0.044906750321388245,
"mean_token_accuracy": 0.9855604767799377,
"num_tokens": 8761348.0,
"step": 993
},
{
"entropy": 1.0500630140304565,
"epoch": 3.5900452488687784,
"grad_norm": 0.7281487584114075,
"learning_rate": 0.00012938499978769222,
"loss": 0.0938921645283699,
"mean_token_accuracy": 0.9799060225486755,
"num_tokens": 8770331.0,
"step": 994
},
{
"entropy": 1.1058216989040375,
"epoch": 3.593665158371041,
"grad_norm": 0.6030604243278503,
"learning_rate": 0.00012925649755252624,
"loss": 0.07762658596038818,
"mean_token_accuracy": 0.9789818972349167,
"num_tokens": 8778983.0,
"step": 995
},
{
"entropy": 1.0310122072696686,
"epoch": 3.5972850678733033,
"grad_norm": 0.44901198148727417,
"learning_rate": 0.00012912795416411056,
"loss": 0.04441501572728157,
"mean_token_accuracy": 0.9828421622514725,
"num_tokens": 8787524.0,
"step": 996
},
{
"entropy": 1.0446508675813675,
"epoch": 3.6009049773755657,
"grad_norm": 0.5148265957832336,
"learning_rate": 0.0001289993698971564,
"loss": 0.046136148273944855,
"mean_token_accuracy": 0.9823300242424011,
"num_tokens": 8796366.0,
"step": 997
},
{
"entropy": 1.0721295475959778,
"epoch": 3.604524886877828,
"grad_norm": 0.5323315858840942,
"learning_rate": 0.00012887074502646257,
"loss": 0.037365175783634186,
"mean_token_accuracy": 0.988229975104332,
"num_tokens": 8804941.0,
"step": 998
},
{
"entropy": 1.097947746515274,
"epoch": 3.6081447963800906,
"grad_norm": 0.7002111077308655,
"learning_rate": 0.00012874207982691447,
"loss": 0.07946470379829407,
"mean_token_accuracy": 0.9763154089450836,
"num_tokens": 8813676.0,
"step": 999
},
{
"entropy": 1.0364596843719482,
"epoch": 3.611764705882353,
"grad_norm": 0.620373547077179,
"learning_rate": 0.00012861337457348383,
"loss": 0.08317571133375168,
"mean_token_accuracy": 0.9791288524866104,
"num_tokens": 8822412.0,
"step": 1000
},
{
"entropy": 1.0231077075004578,
"epoch": 3.6153846153846154,
"grad_norm": 0.36507073044776917,
"learning_rate": 0.0001284846295412278,
"loss": 0.039691824465990067,
"mean_token_accuracy": 0.9890045672655106,
"num_tokens": 8831109.0,
"step": 1001
},
{
"entropy": 1.0747565776109695,
"epoch": 3.619004524886878,
"grad_norm": 0.5191677808761597,
"learning_rate": 0.00012835584500528875,
"loss": 0.05060931667685509,
"mean_token_accuracy": 0.9855721145868301,
"num_tokens": 8839568.0,
"step": 1002
},
{
"entropy": 1.0877159237861633,
"epoch": 3.6226244343891403,
"grad_norm": 0.5678452849388123,
"learning_rate": 0.00012822702124089337,
"loss": 0.05626006796956062,
"mean_token_accuracy": 0.97838294506073,
"num_tokens": 8848116.0,
"step": 1003
},
{
"entropy": 1.0909876823425293,
"epoch": 3.6262443438914027,
"grad_norm": 0.5667127966880798,
"learning_rate": 0.00012809815852335213,
"loss": 0.061909645795822144,
"mean_token_accuracy": 0.9774574041366577,
"num_tokens": 8856312.0,
"step": 1004
},
{
"entropy": 1.0426032990217209,
"epoch": 3.629864253393665,
"grad_norm": 0.4420037269592285,
"learning_rate": 0.00012796925712805883,
"loss": 0.05646451935172081,
"mean_token_accuracy": 0.9806726723909378,
"num_tokens": 8864934.0,
"step": 1005
},
{
"entropy": 0.9995952397584915,
"epoch": 3.6334841628959276,
"grad_norm": 0.4093227684497833,
"learning_rate": 0.00012784031733048992,
"loss": 0.04071081057190895,
"mean_token_accuracy": 0.9907932132482529,
"num_tokens": 8873915.0,
"step": 1006
},
{
"entropy": 0.9970235526561737,
"epoch": 3.63710407239819,
"grad_norm": 0.5725744962692261,
"learning_rate": 0.0001277113394062039,
"loss": 0.06767860054969788,
"mean_token_accuracy": 0.9846918284893036,
"num_tokens": 8882883.0,
"step": 1007
},
{
"entropy": 0.9805562347173691,
"epoch": 3.6407239819004524,
"grad_norm": 0.4164693057537079,
"learning_rate": 0.0001275823236308408,
"loss": 0.03698574751615524,
"mean_token_accuracy": 0.9875341504812241,
"num_tokens": 8891980.0,
"step": 1008
},
{
"entropy": 1.0244318395853043,
"epoch": 3.644343891402715,
"grad_norm": 0.608647882938385,
"learning_rate": 0.0001274532702801214,
"loss": 0.06296362727880478,
"mean_token_accuracy": 0.9802230894565582,
"num_tokens": 8900866.0,
"step": 1009
},
{
"entropy": 0.9997225403785706,
"epoch": 3.6479638009049773,
"grad_norm": 0.5184244513511658,
"learning_rate": 0.00012732417962984697,
"loss": 0.07126037776470184,
"mean_token_accuracy": 0.982243612408638,
"num_tokens": 8909823.0,
"step": 1010
},
{
"entropy": 1.006606712937355,
"epoch": 3.6515837104072397,
"grad_norm": 0.4548531770706177,
"learning_rate": 0.00012719505195589833,
"loss": 0.04628659039735794,
"mean_token_accuracy": 0.9868118911981583,
"num_tokens": 8918370.0,
"step": 1011
},
{
"entropy": 0.9653205871582031,
"epoch": 3.655203619909502,
"grad_norm": 0.6441347599029541,
"learning_rate": 0.0001270658875342356,
"loss": 0.07759731262922287,
"mean_token_accuracy": 0.9780310839414597,
"num_tokens": 8927424.0,
"step": 1012
},
{
"entropy": 1.0112672001123428,
"epoch": 3.6588235294117646,
"grad_norm": 0.5434349775314331,
"learning_rate": 0.00012693668664089724,
"loss": 0.06205587089061737,
"mean_token_accuracy": 0.9839424788951874,
"num_tokens": 8935733.0,
"step": 1013
},
{
"entropy": 0.9845980405807495,
"epoch": 3.662443438914027,
"grad_norm": 0.39817336201667786,
"learning_rate": 0.00012680744955199976,
"loss": 0.048281848430633545,
"mean_token_accuracy": 0.9853204637765884,
"num_tokens": 8944307.0,
"step": 1014
},
{
"entropy": 1.0732997953891754,
"epoch": 3.6660633484162894,
"grad_norm": 0.5867156982421875,
"learning_rate": 0.00012667817654373704,
"loss": 0.0651404857635498,
"mean_token_accuracy": 0.9803767651319504,
"num_tokens": 8952575.0,
"step": 1015
},
{
"entropy": 1.0137926042079926,
"epoch": 3.669683257918552,
"grad_norm": 0.5951899290084839,
"learning_rate": 0.0001265488678923797,
"loss": 0.09196449816226959,
"mean_token_accuracy": 0.9773082733154297,
"num_tokens": 8961217.0,
"step": 1016
},
{
"entropy": 1.0144722759723663,
"epoch": 3.6733031674208148,
"grad_norm": 0.599597692489624,
"learning_rate": 0.00012641952387427448,
"loss": 0.07395292818546295,
"mean_token_accuracy": 0.9723586440086365,
"num_tokens": 8969507.0,
"step": 1017
},
{
"entropy": 0.9949919432401657,
"epoch": 3.676923076923077,
"grad_norm": 0.5899074673652649,
"learning_rate": 0.0001262901447658438,
"loss": 0.057064566761255264,
"mean_token_accuracy": 0.9813538044691086,
"num_tokens": 8978229.0,
"step": 1018
},
{
"entropy": 0.9935206919908524,
"epoch": 3.6805429864253396,
"grad_norm": 0.5249025225639343,
"learning_rate": 0.000126160730843585,
"loss": 0.03331971541047096,
"mean_token_accuracy": 0.9901821464300156,
"num_tokens": 8987107.0,
"step": 1019
},
{
"entropy": 0.9692054241895676,
"epoch": 3.684162895927602,
"grad_norm": 0.7538760900497437,
"learning_rate": 0.00012603128238406985,
"loss": 0.09024970233440399,
"mean_token_accuracy": 0.9682414084672928,
"num_tokens": 8996471.0,
"step": 1020
},
{
"entropy": 0.9907859861850739,
"epoch": 3.6877828054298645,
"grad_norm": 0.541009783744812,
"learning_rate": 0.00012590179966394388,
"loss": 0.04950612783432007,
"mean_token_accuracy": 0.9824161380529404,
"num_tokens": 9005068.0,
"step": 1021
},
{
"entropy": 1.0388163626194,
"epoch": 3.691402714932127,
"grad_norm": 0.5293216705322266,
"learning_rate": 0.0001257722829599259,
"loss": 0.04979352653026581,
"mean_token_accuracy": 0.9849574863910675,
"num_tokens": 9013415.0,
"step": 1022
},
{
"entropy": 0.9590071588754654,
"epoch": 3.6950226244343893,
"grad_norm": 0.5384345054626465,
"learning_rate": 0.0001256427325488074,
"loss": 0.05108953267335892,
"mean_token_accuracy": 0.9861488491296768,
"num_tokens": 9022908.0,
"step": 1023
},
{
"entropy": 1.0496671795845032,
"epoch": 3.6986425339366518,
"grad_norm": 0.598530113697052,
"learning_rate": 0.00012551314870745174,
"loss": 0.04889511317014694,
"mean_token_accuracy": 0.9841330647468567,
"num_tokens": 9031377.0,
"step": 1024
},
{
"entropy": 1.06731316447258,
"epoch": 3.702262443438914,
"grad_norm": 0.7431966066360474,
"learning_rate": 0.00012538353171279387,
"loss": 0.09418823570013046,
"mean_token_accuracy": 0.9767041355371475,
"num_tokens": 9039580.0,
"step": 1025
},
{
"entropy": 1.0664094239473343,
"epoch": 3.7058823529411766,
"grad_norm": 0.5317267775535583,
"learning_rate": 0.00012525388184183952,
"loss": 0.05456814914941788,
"mean_token_accuracy": 0.9801364839076996,
"num_tokens": 9047994.0,
"step": 1026
},
{
"entropy": 0.9594382792711258,
"epoch": 3.709502262443439,
"grad_norm": 0.32988351583480835,
"learning_rate": 0.00012512419937166474,
"loss": 0.03360046446323395,
"mean_token_accuracy": 0.9874261766672134,
"num_tokens": 9057249.0,
"step": 1027
},
{
"entropy": 1.0023006796836853,
"epoch": 3.7131221719457015,
"grad_norm": 0.47979024052619934,
"learning_rate": 0.0001249944845794151,
"loss": 0.060380980372428894,
"mean_token_accuracy": 0.98314069211483,
"num_tokens": 9065777.0,
"step": 1028
},
{
"entropy": 0.9864871054887772,
"epoch": 3.716742081447964,
"grad_norm": 0.5115134716033936,
"learning_rate": 0.00012486473774230548,
"loss": 0.05596606805920601,
"mean_token_accuracy": 0.9851608425378799,
"num_tokens": 9075135.0,
"step": 1029
},
{
"entropy": 1.0391730964183807,
"epoch": 3.7203619909502263,
"grad_norm": 0.867382287979126,
"learning_rate": 0.00012473495913761906,
"loss": 0.15137547254562378,
"mean_token_accuracy": 0.9702001363039017,
"num_tokens": 9083899.0,
"step": 1030
},
{
"entropy": 1.0278279185295105,
"epoch": 3.723981900452489,
"grad_norm": 0.6137107014656067,
"learning_rate": 0.00012460514904270696,
"loss": 0.04550578072667122,
"mean_token_accuracy": 0.9855063706636429,
"num_tokens": 9092262.0,
"step": 1031
},
{
"entropy": 0.9615179747343063,
"epoch": 3.727601809954751,
"grad_norm": 0.5605074763298035,
"learning_rate": 0.00012447530773498764,
"loss": 0.05297612026333809,
"mean_token_accuracy": 0.9862687736749649,
"num_tokens": 9101021.0,
"step": 1032
},
{
"entropy": 1.0526714771986008,
"epoch": 3.7312217194570136,
"grad_norm": 0.5493430495262146,
"learning_rate": 0.0001243454354919462,
"loss": 0.048210758715867996,
"mean_token_accuracy": 0.9809322506189346,
"num_tokens": 9109829.0,
"step": 1033
},
{
"entropy": 1.0210031270980835,
"epoch": 3.734841628959276,
"grad_norm": 0.6273694634437561,
"learning_rate": 0.00012421553259113393,
"loss": 0.07619710266590118,
"mean_token_accuracy": 0.9756554067134857,
"num_tokens": 9118711.0,
"step": 1034
},
{
"entropy": 0.999250665307045,
"epoch": 3.7384615384615385,
"grad_norm": 0.5953567028045654,
"learning_rate": 0.00012408559931016753,
"loss": 0.03722090646624565,
"mean_token_accuracy": 0.9891637414693832,
"num_tokens": 9127649.0,
"step": 1035
},
{
"entropy": 1.0046712160110474,
"epoch": 3.742081447963801,
"grad_norm": 0.5995718836784363,
"learning_rate": 0.0001239556359267287,
"loss": 0.07123475521802902,
"mean_token_accuracy": 0.9800655543804169,
"num_tokens": 9136631.0,
"step": 1036
},
{
"entropy": 0.9638098627328873,
"epoch": 3.7457013574660634,
"grad_norm": 0.35988542437553406,
"learning_rate": 0.0001238256427185635,
"loss": 0.028186630457639694,
"mean_token_accuracy": 0.9930652678012848,
"num_tokens": 9145889.0,
"step": 1037
},
{
"entropy": 0.9974884688854218,
"epoch": 3.749321266968326,
"grad_norm": 0.6581419110298157,
"learning_rate": 0.0001236956199634817,
"loss": 0.07928231358528137,
"mean_token_accuracy": 0.977179154753685,
"num_tokens": 9155071.0,
"step": 1038
},
{
"entropy": 0.9659110605716705,
"epoch": 3.7529411764705882,
"grad_norm": 0.5204628705978394,
"learning_rate": 0.00012356556793935615,
"loss": 0.07529903948307037,
"mean_token_accuracy": 0.9794208407402039,
"num_tokens": 9164329.0,
"step": 1039
},
{
"entropy": 0.9618893265724182,
"epoch": 3.7565610859728507,
"grad_norm": 0.43765532970428467,
"learning_rate": 0.00012343548692412233,
"loss": 0.04690020531415939,
"mean_token_accuracy": 0.986578032374382,
"num_tokens": 9173806.0,
"step": 1040
},
{
"entropy": 1.0338812470436096,
"epoch": 3.760180995475113,
"grad_norm": 0.4304349422454834,
"learning_rate": 0.00012330537719577766,
"loss": 0.03645741939544678,
"mean_token_accuracy": 0.9854725003242493,
"num_tokens": 9182477.0,
"step": 1041
},
{
"entropy": 1.0175089985132217,
"epoch": 3.7638009049773755,
"grad_norm": 0.4084160029888153,
"learning_rate": 0.00012317523903238094,
"loss": 0.03105458803474903,
"mean_token_accuracy": 0.9912384748458862,
"num_tokens": 9191539.0,
"step": 1042
},
{
"entropy": 1.0527340471744537,
"epoch": 3.767420814479638,
"grad_norm": 0.5655209422111511,
"learning_rate": 0.00012304507271205167,
"loss": 0.05545002967119217,
"mean_token_accuracy": 0.9856874346733093,
"num_tokens": 9200288.0,
"step": 1043
},
{
"entropy": 1.0532978475093842,
"epoch": 3.7710407239819004,
"grad_norm": 0.4896293878555298,
"learning_rate": 0.00012291487851296955,
"loss": 0.03723525255918503,
"mean_token_accuracy": 0.9878070503473282,
"num_tokens": 9208798.0,
"step": 1044
},
{
"entropy": 0.9996777772903442,
"epoch": 3.774660633484163,
"grad_norm": 0.2994062900543213,
"learning_rate": 0.00012278465671337394,
"loss": 0.021810417994856834,
"mean_token_accuracy": 0.9923857599496841,
"num_tokens": 9217755.0,
"step": 1045
},
{
"entropy": 1.0102759897708893,
"epoch": 3.7782805429864252,
"grad_norm": 0.7357338070869446,
"learning_rate": 0.0001226544075915631,
"loss": 0.08538160473108292,
"mean_token_accuracy": 0.9771738797426224,
"num_tokens": 9226530.0,
"step": 1046
},
{
"entropy": 1.0522404909133911,
"epoch": 3.7819004524886877,
"grad_norm": 0.6169455647468567,
"learning_rate": 0.0001225241314258937,
"loss": 0.054077863693237305,
"mean_token_accuracy": 0.9810755997896194,
"num_tokens": 9235491.0,
"step": 1047
},
{
"entropy": 0.9845469892024994,
"epoch": 3.78552036199095,
"grad_norm": 0.6194866895675659,
"learning_rate": 0.00012239382849478026,
"loss": 0.09550972282886505,
"mean_token_accuracy": 0.9748479872941971,
"num_tokens": 9244320.0,
"step": 1048
},
{
"entropy": 0.9689999371767044,
"epoch": 3.7891402714932125,
"grad_norm": 0.599198043346405,
"learning_rate": 0.0001222634990766944,
"loss": 0.06929125636816025,
"mean_token_accuracy": 0.9795756936073303,
"num_tokens": 9253358.0,
"step": 1049
},
{
"entropy": 0.9611286520957947,
"epoch": 3.792760180995475,
"grad_norm": 0.5120612978935242,
"learning_rate": 0.00012213314345016434,
"loss": 0.04920945689082146,
"mean_token_accuracy": 0.987091675400734,
"num_tokens": 9262305.0,
"step": 1050
},
{
"entropy": 0.9123319238424301,
"epoch": 3.7963800904977374,
"grad_norm": 0.47379791736602783,
"learning_rate": 0.00012200276189377449,
"loss": 0.04485338181257248,
"mean_token_accuracy": 0.9874735176563263,
"num_tokens": 9271327.0,
"step": 1051
},
{
"entropy": 0.9740808606147766,
"epoch": 3.8,
"grad_norm": 0.6042353510856628,
"learning_rate": 0.00012187235468616449,
"loss": 0.06215674430131912,
"mean_token_accuracy": 0.9854099005460739,
"num_tokens": 9279720.0,
"step": 1052
},
{
"entropy": 1.0416322499513626,
"epoch": 3.8036199095022623,
"grad_norm": 0.5319172739982605,
"learning_rate": 0.00012174192210602886,
"loss": 0.03299910947680473,
"mean_token_accuracy": 0.990405261516571,
"num_tokens": 9287790.0,
"step": 1053
},
{
"entropy": 0.9842112064361572,
"epoch": 3.8072398190045247,
"grad_norm": 0.4761062264442444,
"learning_rate": 0.00012161146443211635,
"loss": 0.04322975501418114,
"mean_token_accuracy": 0.9890616089105606,
"num_tokens": 9296546.0,
"step": 1054
},
{
"entropy": 0.9370725750923157,
"epoch": 3.810859728506787,
"grad_norm": 0.6730228662490845,
"learning_rate": 0.00012148098194322936,
"loss": 0.13308462500572205,
"mean_token_accuracy": 0.9631438553333282,
"num_tokens": 9305702.0,
"step": 1055
},
{
"entropy": 0.9374968558549881,
"epoch": 3.8144796380090495,
"grad_norm": 0.48976925015449524,
"learning_rate": 0.00012135047491822329,
"loss": 0.04608523100614548,
"mean_token_accuracy": 0.9850935637950897,
"num_tokens": 9314649.0,
"step": 1056
},
{
"entropy": 1.0022027492523193,
"epoch": 3.818099547511312,
"grad_norm": 0.5045378804206848,
"learning_rate": 0.00012121994363600593,
"loss": 0.053614161908626556,
"mean_token_accuracy": 0.9847323000431061,
"num_tokens": 9323366.0,
"step": 1057
},
{
"entropy": 1.0107389986515045,
"epoch": 3.8217194570135744,
"grad_norm": 0.8898943066596985,
"learning_rate": 0.00012108938837553703,
"loss": 0.23248156905174255,
"mean_token_accuracy": 0.9641059786081314,
"num_tokens": 9332078.0,
"step": 1058
},
{
"entropy": 1.0229197144508362,
"epoch": 3.825339366515837,
"grad_norm": 0.6454104781150818,
"learning_rate": 0.00012095880941582744,
"loss": 0.07357359677553177,
"mean_token_accuracy": 0.9791290163993835,
"num_tokens": 9340736.0,
"step": 1059
},
{
"entropy": 1.0136512219905853,
"epoch": 3.8289592760180997,
"grad_norm": 0.4896777272224426,
"learning_rate": 0.00012082820703593885,
"loss": 0.056676387786865234,
"mean_token_accuracy": 0.9834884107112885,
"num_tokens": 9349618.0,
"step": 1060
},
{
"entropy": 0.9840548485517502,
"epoch": 3.832579185520362,
"grad_norm": 0.7807687520980835,
"learning_rate": 0.00012069758151498279,
"loss": 0.08798709511756897,
"mean_token_accuracy": 0.9721374362707138,
"num_tokens": 9358452.0,
"step": 1061
},
{
"entropy": 1.0566797703504562,
"epoch": 3.8361990950226246,
"grad_norm": 0.9508253335952759,
"learning_rate": 0.0001205669331321204,
"loss": 0.08302375674247742,
"mean_token_accuracy": 0.9773988276720047,
"num_tokens": 9367052.0,
"step": 1062
},
{
"entropy": 1.0889401733875275,
"epoch": 3.839819004524887,
"grad_norm": 0.3949955999851227,
"learning_rate": 0.00012043626216656154,
"loss": 0.023118160665035248,
"mean_token_accuracy": 0.9940158426761627,
"num_tokens": 9375475.0,
"step": 1063
},
{
"entropy": 1.0559107959270477,
"epoch": 3.8434389140271494,
"grad_norm": 0.9540901780128479,
"learning_rate": 0.00012030556889756451,
"loss": 0.09784621000289917,
"mean_token_accuracy": 0.9844755232334137,
"num_tokens": 9384455.0,
"step": 1064
},
{
"entropy": 1.0997906029224396,
"epoch": 3.847058823529412,
"grad_norm": 0.7329124212265015,
"learning_rate": 0.00012017485360443512,
"loss": 0.05017324537038803,
"mean_token_accuracy": 0.9844279885292053,
"num_tokens": 9392971.0,
"step": 1065
},
{
"entropy": 1.0585920810699463,
"epoch": 3.8506787330316743,
"grad_norm": 0.35517770051956177,
"learning_rate": 0.00012004411656652629,
"loss": 0.033513545989990234,
"mean_token_accuracy": 0.9896632134914398,
"num_tokens": 9401493.0,
"step": 1066
},
{
"entropy": 0.9919628500938416,
"epoch": 3.8542986425339367,
"grad_norm": 0.4272564649581909,
"learning_rate": 0.00011991335806323751,
"loss": 0.05677922070026398,
"mean_token_accuracy": 0.9791074395179749,
"num_tokens": 9410389.0,
"step": 1067
},
{
"entropy": 1.0546822249889374,
"epoch": 3.857918552036199,
"grad_norm": 0.5334388017654419,
"learning_rate": 0.00011978257837401396,
"loss": 0.05315824970602989,
"mean_token_accuracy": 0.9821736663579941,
"num_tokens": 9419227.0,
"step": 1068
},
{
"entropy": 1.041745737195015,
"epoch": 3.8615384615384616,
"grad_norm": 0.534310519695282,
"learning_rate": 0.00011965177777834627,
"loss": 0.07340014725923538,
"mean_token_accuracy": 0.9823083132505417,
"num_tokens": 9428116.0,
"step": 1069
},
{
"entropy": 0.9816903918981552,
"epoch": 3.865158371040724,
"grad_norm": 0.5933845639228821,
"learning_rate": 0.0001195209565557696,
"loss": 0.07548420131206512,
"mean_token_accuracy": 0.9744589179754257,
"num_tokens": 9437254.0,
"step": 1070
},
{
"entropy": 1.0700978338718414,
"epoch": 3.8687782805429864,
"grad_norm": 0.606353223323822,
"learning_rate": 0.00011939011498586333,
"loss": 0.039710596203804016,
"mean_token_accuracy": 0.9877680093050003,
"num_tokens": 9445784.0,
"step": 1071
},
{
"entropy": 1.0325657278299332,
"epoch": 3.872398190045249,
"grad_norm": 0.8289651870727539,
"learning_rate": 0.00011925925334825026,
"loss": 0.05568333715200424,
"mean_token_accuracy": 0.9832515269517899,
"num_tokens": 9454906.0,
"step": 1072
},
{
"entropy": 0.9403438866138458,
"epoch": 3.8760180995475113,
"grad_norm": 0.5779698491096497,
"learning_rate": 0.00011912837192259605,
"loss": 0.06453107297420502,
"mean_token_accuracy": 0.9814022779464722,
"num_tokens": 9463771.0,
"step": 1073
},
{
"entropy": 1.06302210688591,
"epoch": 3.8796380090497737,
"grad_norm": 0.6681597232818604,
"learning_rate": 0.0001189974709886087,
"loss": 0.07519081979990005,
"mean_token_accuracy": 0.9822188168764114,
"num_tokens": 9472487.0,
"step": 1074
},
{
"entropy": 0.9794299155473709,
"epoch": 3.883257918552036,
"grad_norm": 0.6864436864852905,
"learning_rate": 0.00011886655082603784,
"loss": 0.06897931545972824,
"mean_token_accuracy": 0.9769734293222427,
"num_tokens": 9481545.0,
"step": 1075
},
{
"entropy": 0.9836923331022263,
"epoch": 3.8868778280542986,
"grad_norm": 0.470438688993454,
"learning_rate": 0.00011873561171467428,
"loss": 0.05008576065301895,
"mean_token_accuracy": 0.9836294203996658,
"num_tokens": 9491031.0,
"step": 1076
},
{
"entropy": 1.048751562833786,
"epoch": 3.890497737556561,
"grad_norm": 0.36586275696754456,
"learning_rate": 0.0001186046539343493,
"loss": 0.032275643199682236,
"mean_token_accuracy": 0.9896851181983948,
"num_tokens": 9499811.0,
"step": 1077
},
{
"entropy": 1.0775998830795288,
"epoch": 3.8941176470588235,
"grad_norm": 0.4112566113471985,
"learning_rate": 0.00011847367776493398,
"loss": 0.04865328222513199,
"mean_token_accuracy": 0.9793375581502914,
"num_tokens": 9508692.0,
"step": 1078
},
{
"entropy": 1.0528927445411682,
"epoch": 3.897737556561086,
"grad_norm": 0.5616442561149597,
"learning_rate": 0.00011834268348633883,
"loss": 0.07484862208366394,
"mean_token_accuracy": 0.9722632467746735,
"num_tokens": 9517646.0,
"step": 1079
},
{
"entropy": 1.1230517029762268,
"epoch": 3.9013574660633483,
"grad_norm": 0.5908562541007996,
"learning_rate": 0.00011821167137851299,
"loss": 0.07327218353748322,
"mean_token_accuracy": 0.9811713099479675,
"num_tokens": 9526143.0,
"step": 1080
},
{
"entropy": 1.0355236679315567,
"epoch": 3.9049773755656108,
"grad_norm": 0.3904285728931427,
"learning_rate": 0.0001180806417214437,
"loss": 0.03152093291282654,
"mean_token_accuracy": 0.9888577461242676,
"num_tokens": 9535160.0,
"step": 1081
},
{
"entropy": 1.0083343833684921,
"epoch": 3.908597285067873,
"grad_norm": 0.5973058938980103,
"learning_rate": 0.00011794959479515577,
"loss": 0.05609864741563797,
"mean_token_accuracy": 0.9796222299337387,
"num_tokens": 9544285.0,
"step": 1082
},
{
"entropy": 1.0868518203496933,
"epoch": 3.9122171945701356,
"grad_norm": 0.5905822515487671,
"learning_rate": 0.00011781853087971087,
"loss": 0.09653709828853607,
"mean_token_accuracy": 0.97333624958992,
"num_tokens": 9553206.0,
"step": 1083
},
{
"entropy": 1.0268581211566925,
"epoch": 3.915837104072398,
"grad_norm": 0.5431144833564758,
"learning_rate": 0.00011768745025520694,
"loss": 0.03400646895170212,
"mean_token_accuracy": 0.9925975948572159,
"num_tokens": 9561982.0,
"step": 1084
},
{
"entropy": 0.9988400340080261,
"epoch": 3.9194570135746605,
"grad_norm": 0.5526976585388184,
"learning_rate": 0.00011755635320177765,
"loss": 0.06047695130109787,
"mean_token_accuracy": 0.9864342510700226,
"num_tokens": 9570908.0,
"step": 1085
},
{
"entropy": 1.0497360080480576,
"epoch": 3.9230769230769234,
"grad_norm": 0.48714399337768555,
"learning_rate": 0.00011742523999959189,
"loss": 0.05991292744874954,
"mean_token_accuracy": 0.9843471348285675,
"num_tokens": 9579709.0,
"step": 1086
},
{
"entropy": 0.9997294843196869,
"epoch": 3.926696832579186,
"grad_norm": 0.40273338556289673,
"learning_rate": 0.0001172941109288529,
"loss": 0.0249432772397995,
"mean_token_accuracy": 0.9916731268167496,
"num_tokens": 9587982.0,
"step": 1087
},
{
"entropy": 1.071354240179062,
"epoch": 3.930316742081448,
"grad_norm": 0.5073276162147522,
"learning_rate": 0.00011716296626979789,
"loss": 0.052621759474277496,
"mean_token_accuracy": 0.9854983687400818,
"num_tokens": 9596660.0,
"step": 1088
},
{
"entropy": 1.0524671822786331,
"epoch": 3.9339366515837106,
"grad_norm": 0.44608592987060547,
"learning_rate": 0.00011703180630269743,
"loss": 0.0402885302901268,
"mean_token_accuracy": 0.9871297627687454,
"num_tokens": 9605415.0,
"step": 1089
},
{
"entropy": 0.9869890660047531,
"epoch": 3.937556561085973,
"grad_norm": 0.5331403613090515,
"learning_rate": 0.00011690063130785478,
"loss": 0.05267741531133652,
"mean_token_accuracy": 0.9868313521146774,
"num_tokens": 9614049.0,
"step": 1090
},
{
"entropy": 1.0029538869857788,
"epoch": 3.9411764705882355,
"grad_norm": 0.40601280331611633,
"learning_rate": 0.00011676944156560532,
"loss": 0.03916563838720322,
"mean_token_accuracy": 0.9879664182662964,
"num_tokens": 9623099.0,
"step": 1091
},
{
"entropy": 1.0587478280067444,
"epoch": 3.944796380090498,
"grad_norm": 0.6124465465545654,
"learning_rate": 0.00011663823735631585,
"loss": 0.06757491081953049,
"mean_token_accuracy": 0.9832141548395157,
"num_tokens": 9631984.0,
"step": 1092
},
{
"entropy": 1.0624504685401917,
"epoch": 3.9484162895927604,
"grad_norm": 0.5510776042938232,
"learning_rate": 0.00011650701896038428,
"loss": 0.04322856664657593,
"mean_token_accuracy": 0.9892711043357849,
"num_tokens": 9640701.0,
"step": 1093
},
{
"entropy": 1.0204610973596573,
"epoch": 3.952036199095023,
"grad_norm": 0.5441774725914001,
"learning_rate": 0.00011637578665823865,
"loss": 0.2228085845708847,
"mean_token_accuracy": 0.9644595384597778,
"num_tokens": 9649828.0,
"step": 1094
},
{
"entropy": 0.9337645173072815,
"epoch": 3.9556561085972852,
"grad_norm": 0.5637805461883545,
"learning_rate": 0.00011624454073033686,
"loss": 0.06843124330043793,
"mean_token_accuracy": 0.9804229438304901,
"num_tokens": 9659279.0,
"step": 1095
},
{
"entropy": 0.9486549347639084,
"epoch": 3.9592760180995477,
"grad_norm": 0.5789593458175659,
"learning_rate": 0.00011611328145716582,
"loss": 0.05753622204065323,
"mean_token_accuracy": 0.9850260466337204,
"num_tokens": 9668201.0,
"step": 1096
},
{
"entropy": 0.9881956428289413,
"epoch": 3.96289592760181,
"grad_norm": 0.4987047612667084,
"learning_rate": 0.00011598200911924104,
"loss": 0.029422685503959656,
"mean_token_accuracy": 0.989275798201561,
"num_tokens": 9677075.0,
"step": 1097
},
{
"entropy": 0.9573028832674026,
"epoch": 3.9665158371040725,
"grad_norm": 0.7386200428009033,
"learning_rate": 0.00011585072399710588,
"loss": 0.03801329806447029,
"mean_token_accuracy": 0.9843745678663254,
"num_tokens": 9686251.0,
"step": 1098
},
{
"entropy": 0.9610146731138229,
"epoch": 3.970135746606335,
"grad_norm": 0.4404725134372711,
"learning_rate": 0.00011571942637133115,
"loss": 0.057003386318683624,
"mean_token_accuracy": 0.9834477603435516,
"num_tokens": 9695626.0,
"step": 1099
},
{
"entropy": 0.9653383642435074,
"epoch": 3.9737556561085974,
"grad_norm": 0.8406949043273926,
"learning_rate": 0.00011558811652251422,
"loss": 0.11011351644992828,
"mean_token_accuracy": 0.9752728492021561,
"num_tokens": 9704667.0,
"step": 1100
},
{
"entropy": 1.060383379459381,
"epoch": 3.97737556561086,
"grad_norm": 0.4521214962005615,
"learning_rate": 0.00011545679473127864,
"loss": 0.050808414816856384,
"mean_token_accuracy": 0.9801691472530365,
"num_tokens": 9713336.0,
"step": 1101
},
{
"entropy": 0.9874586910009384,
"epoch": 3.9809954751131222,
"grad_norm": 0.6479194760322571,
"learning_rate": 0.00011532546127827355,
"loss": 0.05640149861574173,
"mean_token_accuracy": 0.978395476937294,
"num_tokens": 9722165.0,
"step": 1102
},
{
"entropy": 1.0500174909830093,
"epoch": 3.9846153846153847,
"grad_norm": 0.4811345934867859,
"learning_rate": 0.00011519411644417296,
"loss": 0.03901681676506996,
"mean_token_accuracy": 0.9876944124698639,
"num_tokens": 9730697.0,
"step": 1103
},
{
"entropy": 0.9794940203428268,
"epoch": 3.988235294117647,
"grad_norm": 0.5242920517921448,
"learning_rate": 0.00011506276050967518,
"loss": 0.08005990833044052,
"mean_token_accuracy": 0.9715511202812195,
"num_tokens": 9740258.0,
"step": 1104
},
{
"entropy": 1.0328656435012817,
"epoch": 3.9918552036199095,
"grad_norm": 0.4445230960845947,
"learning_rate": 0.00011493139375550222,
"loss": 0.0437643937766552,
"mean_token_accuracy": 0.9823936223983765,
"num_tokens": 9748823.0,
"step": 1105
},
{
"entropy": 1.0510858297348022,
"epoch": 3.995475113122172,
"grad_norm": 0.848091185092926,
"learning_rate": 0.00011480001646239935,
"loss": 0.10476335883140564,
"mean_token_accuracy": 0.9740482717752457,
"num_tokens": 9757407.0,
"step": 1106
},
{
"entropy": 1.046503335237503,
"epoch": 3.9990950226244344,
"grad_norm": 0.5196622014045715,
"learning_rate": 0.00011466862891113424,
"loss": 0.03807297721505165,
"mean_token_accuracy": 0.9873451888561249,
"num_tokens": 9766046.0,
"step": 1107
},
{
"entropy": 0.8666110038757324,
"epoch": 4.0,
"grad_norm": 0.6595801115036011,
"learning_rate": 0.00011453723138249647,
"loss": 0.02082827128469944,
"mean_token_accuracy": 0.9935275316238403,
"num_tokens": 9766900.0,
"step": 1108
},
{
"epoch": 4.0,
"eval_entropy": 1.0280046855531089,
"eval_loss": 0.11439266055822372,
"eval_mean_token_accuracy": 0.9721979295335165,
"eval_num_tokens": 9766900.0,
"eval_runtime": 31.7938,
"eval_samples_per_second": 11.606,
"eval_steps_per_second": 3.869,
"step": 1108
},
{
"entropy": 1.0500973612070084,
"epoch": 4.003619909502262,
"grad_norm": 0.5530663132667542,
"learning_rate": 0.00011440582415729704,
"loss": 0.04492343217134476,
"mean_token_accuracy": 0.985475093126297,
"num_tokens": 9775335.0,
"step": 1109
},
{
"entropy": 0.9695511311292648,
"epoch": 4.007239819004525,
"grad_norm": 0.41322094202041626,
"learning_rate": 0.0001142744075163676,
"loss": 0.03918066620826721,
"mean_token_accuracy": 0.9863087087869644,
"num_tokens": 9784958.0,
"step": 1110
},
{
"entropy": 1.0054823160171509,
"epoch": 4.010859728506787,
"grad_norm": 0.29584264755249023,
"learning_rate": 0.0001141429817405599,
"loss": 0.024759791791439056,
"mean_token_accuracy": 0.9934576153755188,
"num_tokens": 9793859.0,
"step": 1111
},
{
"entropy": 0.9745224118232727,
"epoch": 4.01447963800905,
"grad_norm": 0.4821024239063263,
"learning_rate": 0.00011401154711074536,
"loss": 0.03338223323225975,
"mean_token_accuracy": 0.9919255524873734,
"num_tokens": 9803033.0,
"step": 1112
},
{
"entropy": 0.9960066974163055,
"epoch": 4.018099547511312,
"grad_norm": 0.5458202958106995,
"learning_rate": 0.00011388010390781412,
"loss": 0.046578116714954376,
"mean_token_accuracy": 0.9831000417470932,
"num_tokens": 9812187.0,
"step": 1113
},
{
"entropy": 1.008404940366745,
"epoch": 4.021719457013575,
"grad_norm": 0.45678576827049255,
"learning_rate": 0.00011374865241267478,
"loss": 0.024580247700214386,
"mean_token_accuracy": 0.9908605217933655,
"num_tokens": 9820761.0,
"step": 1114
},
{
"entropy": 0.9565981924533844,
"epoch": 4.025339366515837,
"grad_norm": 0.5080620050430298,
"learning_rate": 0.00011361719290625359,
"loss": 0.06319691240787506,
"mean_token_accuracy": 0.9872463345527649,
"num_tokens": 9829985.0,
"step": 1115
},
{
"entropy": 0.9249087870121002,
"epoch": 4.0289592760180994,
"grad_norm": 0.47282564640045166,
"learning_rate": 0.000113485725669494,
"loss": 0.034620530903339386,
"mean_token_accuracy": 0.9901333600282669,
"num_tokens": 9839520.0,
"step": 1116
},
{
"entropy": 0.9898876994848251,
"epoch": 4.032579185520362,
"grad_norm": 0.35816535353660583,
"learning_rate": 0.0001133542509833559,
"loss": 0.0199007298797369,
"mean_token_accuracy": 0.994959831237793,
"num_tokens": 9848010.0,
"step": 1117
},
{
"entropy": 0.9201329201459885,
"epoch": 4.036199095022624,
"grad_norm": 0.36862438917160034,
"learning_rate": 0.00011322276912881509,
"loss": 0.033990710973739624,
"mean_token_accuracy": 0.987580731511116,
"num_tokens": 9857231.0,
"step": 1118
},
{
"entropy": 0.8616881370544434,
"epoch": 4.039819004524887,
"grad_norm": 0.4026683568954468,
"learning_rate": 0.00011309128038686278,
"loss": 0.02125810645520687,
"mean_token_accuracy": 0.990529865026474,
"num_tokens": 9866285.0,
"step": 1119
},
{
"entropy": 0.9119723290205002,
"epoch": 4.043438914027149,
"grad_norm": 0.5761182904243469,
"learning_rate": 0.00011295978503850487,
"loss": 0.04496622830629349,
"mean_token_accuracy": 0.9852714240550995,
"num_tokens": 9874724.0,
"step": 1120
},
{
"entropy": 0.8586753159761429,
"epoch": 4.047058823529412,
"grad_norm": 0.5358923673629761,
"learning_rate": 0.00011282828336476134,
"loss": 0.04419999569654465,
"mean_token_accuracy": 0.9864743649959564,
"num_tokens": 9884023.0,
"step": 1121
},
{
"entropy": 0.8618190735578537,
"epoch": 4.050678733031674,
"grad_norm": 0.6738160848617554,
"learning_rate": 0.00011269677564666565,
"loss": 0.051054857671260834,
"mean_token_accuracy": 0.9872888177633286,
"num_tokens": 9892904.0,
"step": 1122
},
{
"entropy": 0.8987855166196823,
"epoch": 4.0542986425339365,
"grad_norm": 0.828499972820282,
"learning_rate": 0.00011256526216526433,
"loss": 0.03101392835378647,
"mean_token_accuracy": 0.9909973591566086,
"num_tokens": 9901345.0,
"step": 1123
},
{
"entropy": 0.8801412582397461,
"epoch": 4.057918552036199,
"grad_norm": 0.6591492891311646,
"learning_rate": 0.00011243374320161607,
"loss": 0.04960804432630539,
"mean_token_accuracy": 0.9818640649318695,
"num_tokens": 9909853.0,
"step": 1124
},
{
"entropy": 0.8607528507709503,
"epoch": 4.061538461538461,
"grad_norm": 0.884670615196228,
"learning_rate": 0.0001123022190367913,
"loss": 0.11029860377311707,
"mean_token_accuracy": 0.9751772731542587,
"num_tokens": 9918983.0,
"step": 1125
},
{
"entropy": 0.864204928278923,
"epoch": 4.065158371040724,
"grad_norm": 0.6966027021408081,
"learning_rate": 0.00011217068995187172,
"loss": 0.054947629570961,
"mean_token_accuracy": 0.9846376031637192,
"num_tokens": 9927814.0,
"step": 1126
},
{
"entropy": 0.8841895759105682,
"epoch": 4.068778280542986,
"grad_norm": 0.49447473883628845,
"learning_rate": 0.00011203915622794934,
"loss": 0.04461928457021713,
"mean_token_accuracy": 0.9833347350358963,
"num_tokens": 9936735.0,
"step": 1127
},
{
"entropy": 0.8424310237169266,
"epoch": 4.072398190045249,
"grad_norm": 0.3998212516307831,
"learning_rate": 0.00011190761814612616,
"loss": 0.022949082776904106,
"mean_token_accuracy": 0.9910575300455093,
"num_tokens": 9945746.0,
"step": 1128
},
{
"entropy": 0.9149628132581711,
"epoch": 4.076018099547511,
"grad_norm": 0.48935163021087646,
"learning_rate": 0.00011177607598751354,
"loss": 0.0449216291308403,
"mean_token_accuracy": 0.9876349568367004,
"num_tokens": 9954786.0,
"step": 1129
},
{
"entropy": 0.9436136782169342,
"epoch": 4.0796380090497735,
"grad_norm": 0.6684723496437073,
"learning_rate": 0.00011164453003323152,
"loss": 0.02876024693250656,
"mean_token_accuracy": 0.9874544590711594,
"num_tokens": 9962930.0,
"step": 1130
},
{
"entropy": 0.9349515736103058,
"epoch": 4.083257918552036,
"grad_norm": 0.3718799352645874,
"learning_rate": 0.00011151298056440825,
"loss": 0.022371353581547737,
"mean_token_accuracy": 0.9954051822423935,
"num_tokens": 9971056.0,
"step": 1131
},
{
"entropy": 0.8744916170835495,
"epoch": 4.086877828054298,
"grad_norm": 0.5116303563117981,
"learning_rate": 0.00011138142786217936,
"loss": 0.021263940259814262,
"mean_token_accuracy": 0.9941851943731308,
"num_tokens": 9980049.0,
"step": 1132
},
{
"entropy": 0.8928166627883911,
"epoch": 4.090497737556561,
"grad_norm": 0.5083812475204468,
"learning_rate": 0.00011124987220768743,
"loss": 0.050615012645721436,
"mean_token_accuracy": 0.9832881093025208,
"num_tokens": 9989156.0,
"step": 1133
},
{
"entropy": 0.9249687194824219,
"epoch": 4.094117647058823,
"grad_norm": 0.39031726121902466,
"learning_rate": 0.00011111831388208138,
"loss": 0.04024628549814224,
"mean_token_accuracy": 0.9921209067106247,
"num_tokens": 9998097.0,
"step": 1134
},
{
"entropy": 0.922693282365799,
"epoch": 4.097737556561086,
"grad_norm": 0.692787766456604,
"learning_rate": 0.00011098675316651576,
"loss": 0.03948502615094185,
"mean_token_accuracy": 0.9865765869617462,
"num_tokens": 10006658.0,
"step": 1135
},
{
"entropy": 0.8548087179660797,
"epoch": 4.101357466063348,
"grad_norm": 0.6769980788230896,
"learning_rate": 0.00011085519034215027,
"loss": 0.04489884525537491,
"mean_token_accuracy": 0.9869794398546219,
"num_tokens": 10016035.0,
"step": 1136
},
{
"entropy": 0.9093173742294312,
"epoch": 4.1049773755656105,
"grad_norm": 0.44418567419052124,
"learning_rate": 0.0001107236256901491,
"loss": 0.04798293486237526,
"mean_token_accuracy": 0.9824085086584091,
"num_tokens": 10024945.0,
"step": 1137
},
{
"entropy": 0.9510129541158676,
"epoch": 4.108597285067873,
"grad_norm": 0.5759614706039429,
"learning_rate": 0.00011059205949168037,
"loss": 0.027157757431268692,
"mean_token_accuracy": 0.9911233633756638,
"num_tokens": 10033483.0,
"step": 1138
},
{
"entropy": 0.9277460277080536,
"epoch": 4.112217194570135,
"grad_norm": 0.5908122658729553,
"learning_rate": 0.00011046049202791553,
"loss": 0.06614906340837479,
"mean_token_accuracy": 0.9815282225608826,
"num_tokens": 10042459.0,
"step": 1139
},
{
"entropy": 0.887006476521492,
"epoch": 4.115837104072398,
"grad_norm": 0.35706913471221924,
"learning_rate": 0.00011032892358002862,
"loss": 0.018396304920315742,
"mean_token_accuracy": 0.9945356100797653,
"num_tokens": 10051689.0,
"step": 1140
},
{
"entropy": 0.9180538654327393,
"epoch": 4.11945701357466,
"grad_norm": 0.5394801497459412,
"learning_rate": 0.00011019735442919594,
"loss": 0.07490956783294678,
"mean_token_accuracy": 0.9806875884532928,
"num_tokens": 10060478.0,
"step": 1141
},
{
"entropy": 0.9495862722396851,
"epoch": 4.123076923076923,
"grad_norm": 0.44942763447761536,
"learning_rate": 0.00011006578485659513,
"loss": 0.044375285506248474,
"mean_token_accuracy": 0.9899759888648987,
"num_tokens": 10069076.0,
"step": 1142
},
{
"entropy": 0.925600215792656,
"epoch": 4.126696832579185,
"grad_norm": 0.42881152033805847,
"learning_rate": 0.00010993421514340489,
"loss": 0.031960733234882355,
"mean_token_accuracy": 0.9871046096086502,
"num_tokens": 10078094.0,
"step": 1143
},
{
"entropy": 0.9153067022562027,
"epoch": 4.130316742081448,
"grad_norm": 0.38486459851264954,
"learning_rate": 0.0001098026455708041,
"loss": 0.026029404252767563,
"mean_token_accuracy": 0.9911711812019348,
"num_tokens": 10086945.0,
"step": 1144
},
{
"entropy": 0.9598971456289291,
"epoch": 4.133936651583711,
"grad_norm": 0.43179431557655334,
"learning_rate": 0.00010967107641997141,
"loss": 0.03247709199786186,
"mean_token_accuracy": 0.9906739294528961,
"num_tokens": 10095794.0,
"step": 1145
},
{
"entropy": 0.934249758720398,
"epoch": 4.137556561085973,
"grad_norm": 0.6291822195053101,
"learning_rate": 0.0001095395079720845,
"loss": 0.056933820247650146,
"mean_token_accuracy": 0.986266016960144,
"num_tokens": 10104405.0,
"step": 1146
},
{
"entropy": 0.9094070792198181,
"epoch": 4.141176470588236,
"grad_norm": 0.5577878952026367,
"learning_rate": 0.00010940794050831964,
"loss": 0.057201556861400604,
"mean_token_accuracy": 0.9824305176734924,
"num_tokens": 10113779.0,
"step": 1147
},
{
"entropy": 0.8751973658800125,
"epoch": 4.144796380090498,
"grad_norm": 0.40218469500541687,
"learning_rate": 0.00010927637430985091,
"loss": 0.026242714375257492,
"mean_token_accuracy": 0.9944493323564529,
"num_tokens": 10122831.0,
"step": 1148
},
{
"entropy": 0.8808578848838806,
"epoch": 4.1484162895927605,
"grad_norm": 0.32130417227745056,
"learning_rate": 0.00010914480965784974,
"loss": 0.022959765046834946,
"mean_token_accuracy": 0.9945127964019775,
"num_tokens": 10132207.0,
"step": 1149
},
{
"entropy": 0.8300377726554871,
"epoch": 4.152036199095023,
"grad_norm": 0.4078406095504761,
"learning_rate": 0.00010901324683348428,
"loss": 0.026690851897001266,
"mean_token_accuracy": 0.9955218881368637,
"num_tokens": 10141567.0,
"step": 1150
},
{
"entropy": 0.8891526609659195,
"epoch": 4.155656108597285,
"grad_norm": 0.47017818689346313,
"learning_rate": 0.00010888168611791864,
"loss": 0.022021235898137093,
"mean_token_accuracy": 0.98915895819664,
"num_tokens": 10150452.0,
"step": 1151
},
{
"entropy": 0.860780343413353,
"epoch": 4.159276018099548,
"grad_norm": 0.6516409516334534,
"learning_rate": 0.0001087501277923126,
"loss": 0.04971027001738548,
"mean_token_accuracy": 0.9832734167575836,
"num_tokens": 10159499.0,
"step": 1152
},
{
"entropy": 0.8994930535554886,
"epoch": 4.16289592760181,
"grad_norm": 0.5181270837783813,
"learning_rate": 0.00010861857213782068,
"loss": 0.03901517018675804,
"mean_token_accuracy": 0.9868257790803909,
"num_tokens": 10168426.0,
"step": 1153
},
{
"entropy": 0.8511227667331696,
"epoch": 4.166515837104073,
"grad_norm": 0.4299660921096802,
"learning_rate": 0.00010848701943559176,
"loss": 0.02564258500933647,
"mean_token_accuracy": 0.9918291866779327,
"num_tokens": 10177416.0,
"step": 1154
},
{
"entropy": 0.8692123293876648,
"epoch": 4.170135746606335,
"grad_norm": 0.7815737128257751,
"learning_rate": 0.00010835546996676848,
"loss": 0.05228308588266373,
"mean_token_accuracy": 0.9848518073558807,
"num_tokens": 10186241.0,
"step": 1155
},
{
"entropy": 0.8547987043857574,
"epoch": 4.173755656108598,
"grad_norm": 0.6081969738006592,
"learning_rate": 0.00010822392401248649,
"loss": 0.044268108904361725,
"mean_token_accuracy": 0.9888571053743362,
"num_tokens": 10195462.0,
"step": 1156
},
{
"entropy": 0.8557311743497849,
"epoch": 4.17737556561086,
"grad_norm": 0.4504448175430298,
"learning_rate": 0.00010809238185387389,
"loss": 0.047527655959129333,
"mean_token_accuracy": 0.984959602355957,
"num_tokens": 10204724.0,
"step": 1157
},
{
"entropy": 0.8731201887130737,
"epoch": 4.180995475113122,
"grad_norm": 0.6010822653770447,
"learning_rate": 0.00010796084377205071,
"loss": 0.05222795158624649,
"mean_token_accuracy": 0.9812868386507034,
"num_tokens": 10213426.0,
"step": 1158
},
{
"entropy": 0.8757798075675964,
"epoch": 4.184615384615385,
"grad_norm": 0.655741274356842,
"learning_rate": 0.00010782931004812831,
"loss": 0.03696342930197716,
"mean_token_accuracy": 0.988820269703865,
"num_tokens": 10221951.0,
"step": 1159
},
{
"entropy": 0.8456969112157822,
"epoch": 4.188235294117647,
"grad_norm": 0.7178743481636047,
"learning_rate": 0.00010769778096320873,
"loss": 0.03984824940562248,
"mean_token_accuracy": 0.9859680682420731,
"num_tokens": 10231000.0,
"step": 1160
},
{
"entropy": 0.8414007723331451,
"epoch": 4.19185520361991,
"grad_norm": 0.4650433361530304,
"learning_rate": 0.00010756625679838397,
"loss": 0.027776649221777916,
"mean_token_accuracy": 0.9927218854427338,
"num_tokens": 10240349.0,
"step": 1161
},
{
"entropy": 0.8852977901697159,
"epoch": 4.195475113122172,
"grad_norm": 0.7676152586936951,
"learning_rate": 0.0001074347378347357,
"loss": 0.04361552372574806,
"mean_token_accuracy": 0.9907232075929642,
"num_tokens": 10248879.0,
"step": 1162
},
{
"entropy": 0.8889047503471375,
"epoch": 4.199095022624435,
"grad_norm": 0.4866149127483368,
"learning_rate": 0.00010730322435333433,
"loss": 0.033081792294979095,
"mean_token_accuracy": 0.9893742352724075,
"num_tokens": 10257389.0,
"step": 1163
},
{
"entropy": 0.8337071388959885,
"epoch": 4.202714932126697,
"grad_norm": 0.3786505460739136,
"learning_rate": 0.00010717171663523871,
"loss": 0.032933011651039124,
"mean_token_accuracy": 0.989232674241066,
"num_tokens": 10266161.0,
"step": 1164
},
{
"entropy": 0.856635645031929,
"epoch": 4.206334841628959,
"grad_norm": 0.42627274990081787,
"learning_rate": 0.00010704021496149517,
"loss": 0.03942575678229332,
"mean_token_accuracy": 0.9888184368610382,
"num_tokens": 10275036.0,
"step": 1165
},
{
"entropy": 0.8601881414651871,
"epoch": 4.209954751131222,
"grad_norm": 0.44472524523735046,
"learning_rate": 0.00010690871961313724,
"loss": 0.03517714887857437,
"mean_token_accuracy": 0.9896524995565414,
"num_tokens": 10283967.0,
"step": 1166
},
{
"entropy": 0.8576315343379974,
"epoch": 4.213574660633484,
"grad_norm": 0.641677975654602,
"learning_rate": 0.00010677723087118495,
"loss": 0.055644311010837555,
"mean_token_accuracy": 0.9859830141067505,
"num_tokens": 10293047.0,
"step": 1167
},
{
"entropy": 0.8803037852048874,
"epoch": 4.217194570135747,
"grad_norm": 0.45178845524787903,
"learning_rate": 0.00010664574901664415,
"loss": 0.02904437854886055,
"mean_token_accuracy": 0.9915114343166351,
"num_tokens": 10301802.0,
"step": 1168
},
{
"entropy": 0.8520867824554443,
"epoch": 4.220814479638009,
"grad_norm": 0.5569225549697876,
"learning_rate": 0.00010651427433050603,
"loss": 0.037966180592775345,
"mean_token_accuracy": 0.9883747845888138,
"num_tokens": 10310649.0,
"step": 1169
},
{
"entropy": 0.9215613752603531,
"epoch": 4.224434389140272,
"grad_norm": 0.34931108355522156,
"learning_rate": 0.00010638280709374642,
"loss": 0.0191643126308918,
"mean_token_accuracy": 0.9933111071586609,
"num_tokens": 10319077.0,
"step": 1170
},
{
"entropy": 0.8855588883161545,
"epoch": 4.228054298642534,
"grad_norm": 0.4663868844509125,
"learning_rate": 0.00010625134758732527,
"loss": 0.019490336999297142,
"mean_token_accuracy": 0.9924804866313934,
"num_tokens": 10327433.0,
"step": 1171
},
{
"entropy": 0.8989104330539703,
"epoch": 4.2316742081447964,
"grad_norm": 0.32792529463768005,
"learning_rate": 0.0001061198960921859,
"loss": 0.023120509460568428,
"mean_token_accuracy": 0.9922090172767639,
"num_tokens": 10335992.0,
"step": 1172
},
{
"entropy": 0.9072947949171066,
"epoch": 4.235294117647059,
"grad_norm": 0.6342188715934753,
"learning_rate": 0.00010598845288925465,
"loss": 0.057832568883895874,
"mean_token_accuracy": 0.9827272593975067,
"num_tokens": 10344173.0,
"step": 1173
},
{
"entropy": 0.8618551194667816,
"epoch": 4.238914027149321,
"grad_norm": 0.6048193573951721,
"learning_rate": 0.0001058570182594401,
"loss": 0.045300260186195374,
"mean_token_accuracy": 0.9866802990436554,
"num_tokens": 10353441.0,
"step": 1174
},
{
"entropy": 0.8884298205375671,
"epoch": 4.242533936651584,
"grad_norm": 0.6075233221054077,
"learning_rate": 0.00010572559248363244,
"loss": 0.043454039841890335,
"mean_token_accuracy": 0.983539417386055,
"num_tokens": 10362718.0,
"step": 1175
},
{
"entropy": 0.8603847771883011,
"epoch": 4.246153846153846,
"grad_norm": 0.4278615117073059,
"learning_rate": 0.00010559417584270297,
"loss": 0.03194679319858551,
"mean_token_accuracy": 0.9888970702886581,
"num_tokens": 10371727.0,
"step": 1176
},
{
"entropy": 0.8933188319206238,
"epoch": 4.249773755656109,
"grad_norm": 0.7898560166358948,
"learning_rate": 0.00010546276861750355,
"loss": 0.041754692792892456,
"mean_token_accuracy": 0.9864661991596222,
"num_tokens": 10380718.0,
"step": 1177
},
{
"entropy": 0.8819432407617569,
"epoch": 4.253393665158371,
"grad_norm": 0.6854934096336365,
"learning_rate": 0.0001053313710888658,
"loss": 0.032602183520793915,
"mean_token_accuracy": 0.9920495897531509,
"num_tokens": 10389630.0,
"step": 1178
},
{
"entropy": 0.9068233966827393,
"epoch": 4.2570135746606335,
"grad_norm": 0.5194956064224243,
"learning_rate": 0.00010519998353760068,
"loss": 0.04578553885221481,
"mean_token_accuracy": 0.9855708330869675,
"num_tokens": 10398459.0,
"step": 1179
},
{
"entropy": 0.8744179904460907,
"epoch": 4.260633484162896,
"grad_norm": 0.402622789144516,
"learning_rate": 0.00010506860624449779,
"loss": 0.028466589748859406,
"mean_token_accuracy": 0.9951285868883133,
"num_tokens": 10407731.0,
"step": 1180
},
{
"entropy": 0.8742282688617706,
"epoch": 4.264253393665158,
"grad_norm": 0.3886919319629669,
"learning_rate": 0.00010493723949032486,
"loss": 0.026117544621229172,
"mean_token_accuracy": 0.9926921725273132,
"num_tokens": 10416677.0,
"step": 1181
},
{
"entropy": 0.8899593651294708,
"epoch": 4.267873303167421,
"grad_norm": 0.4906803369522095,
"learning_rate": 0.00010480588355582708,
"loss": 0.040936604142189026,
"mean_token_accuracy": 0.9897563010454178,
"num_tokens": 10425954.0,
"step": 1182
},
{
"entropy": 0.9250012636184692,
"epoch": 4.271493212669683,
"grad_norm": 0.5353180170059204,
"learning_rate": 0.00010467453872172646,
"loss": 0.04668327420949936,
"mean_token_accuracy": 0.9911708980798721,
"num_tokens": 10434599.0,
"step": 1183
},
{
"entropy": 0.9150962978601456,
"epoch": 4.275113122171946,
"grad_norm": 0.9419394731521606,
"learning_rate": 0.00010454320526872139,
"loss": 0.04198862612247467,
"mean_token_accuracy": 0.9864400774240494,
"num_tokens": 10443597.0,
"step": 1184
},
{
"entropy": 0.8760367184877396,
"epoch": 4.278733031674208,
"grad_norm": 0.5297932624816895,
"learning_rate": 0.00010441188347748583,
"loss": 0.044471338391304016,
"mean_token_accuracy": 0.9881382882595062,
"num_tokens": 10452692.0,
"step": 1185
},
{
"entropy": 0.8790275156497955,
"epoch": 4.2823529411764705,
"grad_norm": 0.46820127964019775,
"learning_rate": 0.00010428057362866888,
"loss": 0.040998734533786774,
"mean_token_accuracy": 0.9851799458265305,
"num_tokens": 10461733.0,
"step": 1186
},
{
"entropy": 0.897658497095108,
"epoch": 4.285972850678733,
"grad_norm": 0.42992568016052246,
"learning_rate": 0.00010414927600289412,
"loss": 0.027044817805290222,
"mean_token_accuracy": 0.9944102466106415,
"num_tokens": 10470816.0,
"step": 1187
},
{
"entropy": 0.9087939113378525,
"epoch": 4.289592760180995,
"grad_norm": 0.32280153036117554,
"learning_rate": 0.00010401799088075899,
"loss": 0.012480968609452248,
"mean_token_accuracy": 0.995869055390358,
"num_tokens": 10479089.0,
"step": 1188
},
{
"entropy": 0.8632544577121735,
"epoch": 4.293212669683258,
"grad_norm": 0.540331244468689,
"learning_rate": 0.0001038867185428342,
"loss": 0.023278292268514633,
"mean_token_accuracy": 0.9912919998168945,
"num_tokens": 10487988.0,
"step": 1189
},
{
"entropy": 0.9213965982198715,
"epoch": 4.29683257918552,
"grad_norm": 0.6128994226455688,
"learning_rate": 0.00010375545926966316,
"loss": 0.029522467404603958,
"mean_token_accuracy": 0.9888563305139542,
"num_tokens": 10496518.0,
"step": 1190
},
{
"entropy": 0.9032467007637024,
"epoch": 4.300452488687783,
"grad_norm": 0.4757763147354126,
"learning_rate": 0.00010362421334176138,
"loss": 0.01943778805434704,
"mean_token_accuracy": 0.9960848391056061,
"num_tokens": 10504869.0,
"step": 1191
},
{
"entropy": 0.8697908222675323,
"epoch": 4.304072398190045,
"grad_norm": 0.6480269432067871,
"learning_rate": 0.00010349298103961577,
"loss": 0.06841661036014557,
"mean_token_accuracy": 0.9833658635616302,
"num_tokens": 10513783.0,
"step": 1192
},
{
"entropy": 0.9298106580972672,
"epoch": 4.3076923076923075,
"grad_norm": 0.8510603904724121,
"learning_rate": 0.00010336176264368418,
"loss": 0.07194619625806808,
"mean_token_accuracy": 0.9774241000413895,
"num_tokens": 10522312.0,
"step": 1193
},
{
"entropy": 0.8464011400938034,
"epoch": 4.31131221719457,
"grad_norm": 0.634172260761261,
"learning_rate": 0.00010323055843439473,
"loss": 0.07087381184101105,
"mean_token_accuracy": 0.9855861663818359,
"num_tokens": 10531488.0,
"step": 1194
},
{
"entropy": 0.8965292721986771,
"epoch": 4.314932126696832,
"grad_norm": 0.654795229434967,
"learning_rate": 0.00010309936869214525,
"loss": 0.048451248556375504,
"mean_token_accuracy": 0.9868658185005188,
"num_tokens": 10539668.0,
"step": 1195
},
{
"entropy": 0.8605607151985168,
"epoch": 4.318552036199095,
"grad_norm": 0.5905312299728394,
"learning_rate": 0.00010296819369730258,
"loss": 0.05599949508905411,
"mean_token_accuracy": 0.981212854385376,
"num_tokens": 10548226.0,
"step": 1196
},
{
"entropy": 0.8900521546602249,
"epoch": 4.322171945701357,
"grad_norm": 0.7872176766395569,
"learning_rate": 0.00010283703373020212,
"loss": 0.04480139538645744,
"mean_token_accuracy": 0.9872453063726425,
"num_tokens": 10557015.0,
"step": 1197
},
{
"entropy": 0.8230308294296265,
"epoch": 4.32579185520362,
"grad_norm": 0.94143146276474,
"learning_rate": 0.00010270588907114716,
"loss": 0.04959043487906456,
"mean_token_accuracy": 0.9870394170284271,
"num_tokens": 10566241.0,
"step": 1198
},
{
"entropy": 0.8108661025762558,
"epoch": 4.329411764705882,
"grad_norm": 0.5073636770248413,
"learning_rate": 0.00010257476000040816,
"loss": 0.039761241525411606,
"mean_token_accuracy": 0.9890642464160919,
"num_tokens": 10575524.0,
"step": 1199
},
{
"entropy": 0.8538337796926498,
"epoch": 4.3330316742081445,
"grad_norm": 0.7470809817314148,
"learning_rate": 0.00010244364679822238,
"loss": 0.07473456114530563,
"mean_token_accuracy": 0.9846348017454147,
"num_tokens": 10584153.0,
"step": 1200
},
{
"entropy": 0.8373128026723862,
"epoch": 4.336651583710407,
"grad_norm": 0.3188524544239044,
"learning_rate": 0.00010231254974479312,
"loss": 0.011771505698561668,
"mean_token_accuracy": 0.9948063492774963,
"num_tokens": 10593069.0,
"step": 1201
},
{
"entropy": 0.8809863477945328,
"epoch": 4.340271493212669,
"grad_norm": 0.805793821811676,
"learning_rate": 0.00010218146912028917,
"loss": 0.23618434369564056,
"mean_token_accuracy": 0.9603950381278992,
"num_tokens": 10602057.0,
"step": 1202
},
{
"entropy": 0.8320661336183548,
"epoch": 4.343891402714932,
"grad_norm": 0.5631644129753113,
"learning_rate": 0.00010205040520484423,
"loss": 0.0406966507434845,
"mean_token_accuracy": 0.9823737889528275,
"num_tokens": 10611082.0,
"step": 1203
},
{
"entropy": 0.9149579256772995,
"epoch": 4.347511312217194,
"grad_norm": 0.6743116974830627,
"learning_rate": 0.0001019193582785563,
"loss": 0.07820634543895721,
"mean_token_accuracy": 0.9759713411331177,
"num_tokens": 10619854.0,
"step": 1204
},
{
"entropy": 0.859554186463356,
"epoch": 4.351131221719457,
"grad_norm": 0.6592840552330017,
"learning_rate": 0.00010178832862148706,
"loss": 0.05549190193414688,
"mean_token_accuracy": 0.986658051609993,
"num_tokens": 10628992.0,
"step": 1205
},
{
"entropy": 0.9278796315193176,
"epoch": 4.354751131221719,
"grad_norm": 0.7901132106781006,
"learning_rate": 0.00010165731651366122,
"loss": 0.057204730808734894,
"mean_token_accuracy": 0.9778600037097931,
"num_tokens": 10637257.0,
"step": 1206
},
{
"entropy": 0.907976433634758,
"epoch": 4.3583710407239815,
"grad_norm": 0.6733710169792175,
"learning_rate": 0.00010152632223506604,
"loss": 0.02807607874274254,
"mean_token_accuracy": 0.9882861822843552,
"num_tokens": 10646088.0,
"step": 1207
},
{
"entropy": 0.8498242944478989,
"epoch": 4.361990950226244,
"grad_norm": 0.44677144289016724,
"learning_rate": 0.00010139534606565073,
"loss": 0.042324379086494446,
"mean_token_accuracy": 0.9891783744096756,
"num_tokens": 10655709.0,
"step": 1208
},
{
"entropy": 0.9032723009586334,
"epoch": 4.365610859728506,
"grad_norm": 0.5025045871734619,
"learning_rate": 0.00010126438828532571,
"loss": 0.03960299491882324,
"mean_token_accuracy": 0.9883267283439636,
"num_tokens": 10664403.0,
"step": 1209
},
{
"entropy": 0.9668902158737183,
"epoch": 4.36923076923077,
"grad_norm": 0.39445021748542786,
"learning_rate": 0.00010113344917396215,
"loss": 0.02176782116293907,
"mean_token_accuracy": 0.9942312687635422,
"num_tokens": 10672986.0,
"step": 1210
},
{
"entropy": 0.9814326167106628,
"epoch": 4.372850678733032,
"grad_norm": 0.6776084303855896,
"learning_rate": 0.00010100252901139131,
"loss": 0.04824589565396309,
"mean_token_accuracy": 0.9848027527332306,
"num_tokens": 10681148.0,
"step": 1211
},
{
"entropy": 0.9044711589813232,
"epoch": 4.376470588235295,
"grad_norm": 0.42135122418403625,
"learning_rate": 0.00010087162807740397,
"loss": 0.024921081960201263,
"mean_token_accuracy": 0.9918465316295624,
"num_tokens": 10690147.0,
"step": 1212
},
{
"entropy": 0.9244499206542969,
"epoch": 4.380090497737557,
"grad_norm": 0.49353837966918945,
"learning_rate": 0.00010074074665174977,
"loss": 0.042720977216959,
"mean_token_accuracy": 0.9891730397939682,
"num_tokens": 10698852.0,
"step": 1213
},
{
"entropy": 0.847070038318634,
"epoch": 4.383710407239819,
"grad_norm": 0.3503934144973755,
"learning_rate": 0.00010060988501413668,
"loss": 0.019927293062210083,
"mean_token_accuracy": 0.9944724142551422,
"num_tokens": 10708150.0,
"step": 1214
},
{
"entropy": 0.8821483105421066,
"epoch": 4.387330316742082,
"grad_norm": 0.5833554267883301,
"learning_rate": 0.00010047904344423043,
"loss": 0.05499357730150223,
"mean_token_accuracy": 0.9791678488254547,
"num_tokens": 10716975.0,
"step": 1215
},
{
"entropy": 0.8406474888324738,
"epoch": 4.390950226244344,
"grad_norm": 0.5742695927619934,
"learning_rate": 0.00010034822222165377,
"loss": 0.038775935769081116,
"mean_token_accuracy": 0.986319050192833,
"num_tokens": 10725858.0,
"step": 1216
},
{
"entropy": 0.857618436217308,
"epoch": 4.394570135746607,
"grad_norm": 0.5021587014198303,
"learning_rate": 0.00010021742162598606,
"loss": 0.024894721806049347,
"mean_token_accuracy": 0.99241703748703,
"num_tokens": 10734716.0,
"step": 1217
},
{
"entropy": 0.9438434839248657,
"epoch": 4.398190045248869,
"grad_norm": 0.5450843572616577,
"learning_rate": 0.00010008664193676251,
"loss": 0.0339815691113472,
"mean_token_accuracy": 0.9880622923374176,
"num_tokens": 10742902.0,
"step": 1218
},
{
"entropy": 0.9149988889694214,
"epoch": 4.401809954751132,
"grad_norm": 0.5017070770263672,
"learning_rate": 9.995588343347373e-05,
"loss": 0.03535865992307663,
"mean_token_accuracy": 0.9894069284200668,
"num_tokens": 10751684.0,
"step": 1219
},
{
"entropy": 0.8433951437473297,
"epoch": 4.405429864253394,
"grad_norm": 0.4098859131336212,
"learning_rate": 9.98251463955649e-05,
"loss": 0.03204619884490967,
"mean_token_accuracy": 0.9892712533473969,
"num_tokens": 10760889.0,
"step": 1220
},
{
"entropy": 0.9143005758523941,
"epoch": 4.409049773755656,
"grad_norm": 0.8866249918937683,
"learning_rate": 9.96944311024355e-05,
"loss": 0.08796840161085129,
"mean_token_accuracy": 0.9761585295200348,
"num_tokens": 10769436.0,
"step": 1221
},
{
"entropy": 0.8927360326051712,
"epoch": 4.412669683257919,
"grad_norm": 0.5494005680084229,
"learning_rate": 9.956373783343847e-05,
"loss": 0.03506309166550636,
"mean_token_accuracy": 0.9899781793355942,
"num_tokens": 10778203.0,
"step": 1222
},
{
"entropy": 0.8397981226444244,
"epoch": 4.416289592760181,
"grad_norm": 0.47664108872413635,
"learning_rate": 9.943306686787964e-05,
"loss": 0.04049497842788696,
"mean_token_accuracy": 0.9860450029373169,
"num_tokens": 10787495.0,
"step": 1223
},
{
"entropy": 0.87860107421875,
"epoch": 4.419909502262444,
"grad_norm": 0.860580563545227,
"learning_rate": 9.930241848501722e-05,
"loss": 0.17442655563354492,
"mean_token_accuracy": 0.9721402823925018,
"num_tokens": 10796694.0,
"step": 1224
},
{
"entropy": 0.8964706212282181,
"epoch": 4.423529411764706,
"grad_norm": 0.5525467395782471,
"learning_rate": 9.917179296406116e-05,
"loss": 0.045697472989559174,
"mean_token_accuracy": 0.9850039780139923,
"num_tokens": 10805314.0,
"step": 1225
},
{
"entropy": 0.9075468629598618,
"epoch": 4.427149321266969,
"grad_norm": 1.0463852882385254,
"learning_rate": 9.904119058417256e-05,
"loss": 0.19723130762577057,
"mean_token_accuracy": 0.964007630944252,
"num_tokens": 10814188.0,
"step": 1226
},
{
"entropy": 0.892121896147728,
"epoch": 4.430769230769231,
"grad_norm": 0.8913745880126953,
"learning_rate": 9.891061162446302e-05,
"loss": 0.05357573926448822,
"mean_token_accuracy": 0.9836710542440414,
"num_tokens": 10822890.0,
"step": 1227
},
{
"entropy": 0.8689533025026321,
"epoch": 4.4343891402714934,
"grad_norm": 0.4191250503063202,
"learning_rate": 9.87800563639941e-05,
"loss": 0.027562851086258888,
"mean_token_accuracy": 0.9921068102121353,
"num_tokens": 10832375.0,
"step": 1228
},
{
"entropy": 0.9156316965818405,
"epoch": 4.438009049773756,
"grad_norm": 0.40549466013908386,
"learning_rate": 9.864952508177673e-05,
"loss": 0.026554280892014503,
"mean_token_accuracy": 0.990155890583992,
"num_tokens": 10840600.0,
"step": 1229
},
{
"entropy": 0.8978532254695892,
"epoch": 4.441628959276018,
"grad_norm": 0.6001290678977966,
"learning_rate": 9.851901805677066e-05,
"loss": 0.044659726321697235,
"mean_token_accuracy": 0.9852373898029327,
"num_tokens": 10849243.0,
"step": 1230
},
{
"entropy": 0.8671956062316895,
"epoch": 4.445248868778281,
"grad_norm": 0.46831727027893066,
"learning_rate": 9.838853556788366e-05,
"loss": 0.024291303008794785,
"mean_token_accuracy": 0.9942016154527664,
"num_tokens": 10858006.0,
"step": 1231
},
{
"entropy": 0.8727803975343704,
"epoch": 4.448868778280543,
"grad_norm": 0.5648714303970337,
"learning_rate": 9.825807789397115e-05,
"loss": 0.03640315309166908,
"mean_token_accuracy": 0.9874684065580368,
"num_tokens": 10867043.0,
"step": 1232
},
{
"entropy": 0.889243483543396,
"epoch": 4.452488687782806,
"grad_norm": 0.5327542424201965,
"learning_rate": 9.812764531383556e-05,
"loss": 0.0371791273355484,
"mean_token_accuracy": 0.9900572001934052,
"num_tokens": 10875412.0,
"step": 1233
},
{
"entropy": 0.9031431376934052,
"epoch": 4.456108597285068,
"grad_norm": 0.4150716960430145,
"learning_rate": 9.799723810622552e-05,
"loss": 0.023472465574741364,
"mean_token_accuracy": 0.9918784946203232,
"num_tokens": 10883763.0,
"step": 1234
},
{
"entropy": 0.85833740234375,
"epoch": 4.4597285067873305,
"grad_norm": 0.7519460916519165,
"learning_rate": 9.786685654983567e-05,
"loss": 0.04321755841374397,
"mean_token_accuracy": 0.9835457056760788,
"num_tokens": 10892993.0,
"step": 1235
},
{
"entropy": 0.8594792932271957,
"epoch": 4.463348416289593,
"grad_norm": 0.4252811670303345,
"learning_rate": 9.773650092330566e-05,
"loss": 0.027196653187274933,
"mean_token_accuracy": 0.9954462945461273,
"num_tokens": 10901708.0,
"step": 1236
},
{
"entropy": 0.9015035331249237,
"epoch": 4.466968325791855,
"grad_norm": 0.5715810656547546,
"learning_rate": 9.760617150521976e-05,
"loss": 0.07570506632328033,
"mean_token_accuracy": 0.9812192022800446,
"num_tokens": 10910501.0,
"step": 1237
},
{
"entropy": 0.8576754629611969,
"epoch": 4.470588235294118,
"grad_norm": 0.6568597555160522,
"learning_rate": 9.747586857410629e-05,
"loss": 0.04479028284549713,
"mean_token_accuracy": 0.9829374700784683,
"num_tokens": 10919679.0,
"step": 1238
},
{
"entropy": 0.9022691249847412,
"epoch": 4.47420814479638,
"grad_norm": 0.8238912224769592,
"learning_rate": 9.73455924084369e-05,
"loss": 0.05337506905198097,
"mean_token_accuracy": 0.9812590926885605,
"num_tokens": 10928585.0,
"step": 1239
},
{
"entropy": 0.926213338971138,
"epoch": 4.477828054298643,
"grad_norm": 0.632087767124176,
"learning_rate": 9.721534328662609e-05,
"loss": 0.0570443719625473,
"mean_token_accuracy": 0.9859164953231812,
"num_tokens": 10937371.0,
"step": 1240
},
{
"entropy": 0.8500941395759583,
"epoch": 4.481447963800905,
"grad_norm": 0.43256041407585144,
"learning_rate": 9.708512148703049e-05,
"loss": 0.03233124688267708,
"mean_token_accuracy": 0.9893523305654526,
"num_tokens": 10946660.0,
"step": 1241
},
{
"entropy": 0.9078550934791565,
"epoch": 4.4850678733031675,
"grad_norm": 0.4536542296409607,
"learning_rate": 9.695492728794837e-05,
"loss": 0.028730809688568115,
"mean_token_accuracy": 0.9890875816345215,
"num_tokens": 10955396.0,
"step": 1242
},
{
"entropy": 0.9219816625118256,
"epoch": 4.48868778280543,
"grad_norm": 0.6403781771659851,
"learning_rate": 9.682476096761907e-05,
"loss": 0.05681582912802696,
"mean_token_accuracy": 0.9841127395629883,
"num_tokens": 10964137.0,
"step": 1243
},
{
"entropy": 0.8394899666309357,
"epoch": 4.492307692307692,
"grad_norm": 0.4959585666656494,
"learning_rate": 9.669462280422234e-05,
"loss": 0.04324223846197128,
"mean_token_accuracy": 0.9881971031427383,
"num_tokens": 10973576.0,
"step": 1244
},
{
"entropy": 0.9000880867242813,
"epoch": 4.495927601809955,
"grad_norm": 0.5821248292922974,
"learning_rate": 9.656451307587769e-05,
"loss": 0.034078195691108704,
"mean_token_accuracy": 0.9908082634210587,
"num_tokens": 10982332.0,
"step": 1245
},
{
"entropy": 0.9280275702476501,
"epoch": 4.499547511312217,
"grad_norm": 0.3630794286727905,
"learning_rate": 9.643443206064386e-05,
"loss": 0.020034782588481903,
"mean_token_accuracy": 0.9927627146244049,
"num_tokens": 10990693.0,
"step": 1246
},
{
"entropy": 0.8500865697860718,
"epoch": 4.50316742081448,
"grad_norm": 0.6605433225631714,
"learning_rate": 9.630438003651833e-05,
"loss": 0.049847669899463654,
"mean_token_accuracy": 0.9830765873193741,
"num_tokens": 10999984.0,
"step": 1247
},
{
"entropy": 0.8788967728614807,
"epoch": 4.506787330316742,
"grad_norm": 0.49596118927001953,
"learning_rate": 9.617435728143654e-05,
"loss": 0.040307819843292236,
"mean_token_accuracy": 0.9881236255168915,
"num_tokens": 11008928.0,
"step": 1248
},
{
"entropy": 0.8518171012401581,
"epoch": 4.5104072398190045,
"grad_norm": 0.48991096019744873,
"learning_rate": 9.60443640732713e-05,
"loss": 0.03139914199709892,
"mean_token_accuracy": 0.9922763705253601,
"num_tokens": 11018014.0,
"step": 1249
},
{
"entropy": 0.8158316314220428,
"epoch": 4.514027149321267,
"grad_norm": 0.5165399312973022,
"learning_rate": 9.59144006898325e-05,
"loss": 0.048440560698509216,
"mean_token_accuracy": 0.9869976490736008,
"num_tokens": 11027583.0,
"step": 1250
},
{
"entropy": 0.9327925741672516,
"epoch": 4.517647058823529,
"grad_norm": 0.667120635509491,
"learning_rate": 9.57844674088661e-05,
"loss": 0.03646295145153999,
"mean_token_accuracy": 0.9886159151792526,
"num_tokens": 11035928.0,
"step": 1251
},
{
"entropy": 0.8431118130683899,
"epoch": 4.521266968325792,
"grad_norm": 0.5622652173042297,
"learning_rate": 9.565456450805382e-05,
"loss": 0.029620612040162086,
"mean_token_accuracy": 0.9908718019723892,
"num_tokens": 11045121.0,
"step": 1252
},
{
"entropy": 0.9268823117017746,
"epoch": 4.524886877828054,
"grad_norm": 0.46910595893859863,
"learning_rate": 9.552469226501237e-05,
"loss": 0.026674820110201836,
"mean_token_accuracy": 0.9921789020299911,
"num_tokens": 11053733.0,
"step": 1253
},
{
"entropy": 0.8750782012939453,
"epoch": 4.528506787330317,
"grad_norm": 0.5680973529815674,
"learning_rate": 9.539485095729308e-05,
"loss": 0.050825320184230804,
"mean_token_accuracy": 0.985125944018364,
"num_tokens": 11062542.0,
"step": 1254
},
{
"entropy": 0.889702245593071,
"epoch": 4.532126696832579,
"grad_norm": 0.5325557589530945,
"learning_rate": 9.526504086238097e-05,
"loss": 0.04129321873188019,
"mean_token_accuracy": 0.9894774854183197,
"num_tokens": 11071184.0,
"step": 1255
},
{
"entropy": 0.8572105765342712,
"epoch": 4.5357466063348415,
"grad_norm": 0.5291478037834167,
"learning_rate": 9.513526225769454e-05,
"loss": 0.028147444128990173,
"mean_token_accuracy": 0.9894004464149475,
"num_tokens": 11080303.0,
"step": 1256
},
{
"entropy": 0.9068948477506638,
"epoch": 4.539366515837104,
"grad_norm": 0.5786257982254028,
"learning_rate": 9.500551542058492e-05,
"loss": 0.03714841976761818,
"mean_token_accuracy": 0.9861379265785217,
"num_tokens": 11088917.0,
"step": 1257
},
{
"entropy": 0.8949933052062988,
"epoch": 4.542986425339366,
"grad_norm": 0.5275850892066956,
"learning_rate": 9.487580062833532e-05,
"loss": 0.04033740609884262,
"mean_token_accuracy": 0.9893816113471985,
"num_tokens": 11097661.0,
"step": 1258
},
{
"entropy": 0.9183568209409714,
"epoch": 4.546606334841629,
"grad_norm": 0.6421916484832764,
"learning_rate": 9.474611815816048e-05,
"loss": 0.06847433000802994,
"mean_token_accuracy": 0.9772931635379791,
"num_tokens": 11106229.0,
"step": 1259
},
{
"entropy": 0.9245471358299255,
"epoch": 4.550226244343891,
"grad_norm": 0.5704196691513062,
"learning_rate": 9.461646828720616e-05,
"loss": 0.0404348149895668,
"mean_token_accuracy": 0.9885217696428299,
"num_tokens": 11115179.0,
"step": 1260
},
{
"entropy": 0.9546661376953125,
"epoch": 4.553846153846154,
"grad_norm": 0.8367648720741272,
"learning_rate": 9.448685129254828e-05,
"loss": 0.07936983555555344,
"mean_token_accuracy": 0.9769112765789032,
"num_tokens": 11123860.0,
"step": 1261
},
{
"entropy": 0.8901357203722,
"epoch": 4.557466063348416,
"grad_norm": 0.48454558849334717,
"learning_rate": 9.435726745119264e-05,
"loss": 0.029340846464037895,
"mean_token_accuracy": 0.9888796657323837,
"num_tokens": 11132821.0,
"step": 1262
},
{
"entropy": 0.8747525364160538,
"epoch": 4.5610859728506785,
"grad_norm": 0.47339335083961487,
"learning_rate": 9.422771704007409e-05,
"loss": 0.03331891447305679,
"mean_token_accuracy": 0.9901967644691467,
"num_tokens": 11141742.0,
"step": 1263
},
{
"entropy": 0.8973748683929443,
"epoch": 4.564705882352941,
"grad_norm": 0.5341124534606934,
"learning_rate": 9.409820033605614e-05,
"loss": 0.03468211740255356,
"mean_token_accuracy": 0.990693673491478,
"num_tokens": 11150411.0,
"step": 1264
},
{
"entropy": 0.8835297226905823,
"epoch": 4.568325791855203,
"grad_norm": 0.5115938186645508,
"learning_rate": 9.39687176159302e-05,
"loss": 0.037047356367111206,
"mean_token_accuracy": 0.9864104092121124,
"num_tokens": 11159328.0,
"step": 1265
},
{
"entropy": 0.8623639047145844,
"epoch": 4.571945701357466,
"grad_norm": 0.5846521258354187,
"learning_rate": 9.3839269156415e-05,
"loss": 0.04693792760372162,
"mean_token_accuracy": 0.9854484647512436,
"num_tokens": 11168251.0,
"step": 1266
},
{
"entropy": 0.9362972676753998,
"epoch": 4.575565610859728,
"grad_norm": 0.26710987091064453,
"learning_rate": 9.370985523415623e-05,
"loss": 0.014222146943211555,
"mean_token_accuracy": 0.9938410818576813,
"num_tokens": 11176539.0,
"step": 1267
},
{
"entropy": 0.9218808859586716,
"epoch": 4.579185520361991,
"grad_norm": 0.594578742980957,
"learning_rate": 9.358047612572554e-05,
"loss": 0.04387912154197693,
"mean_token_accuracy": 0.9871724396944046,
"num_tokens": 11184908.0,
"step": 1268
},
{
"entropy": 0.9472506046295166,
"epoch": 4.582805429864253,
"grad_norm": 0.7246670722961426,
"learning_rate": 9.345113210762033e-05,
"loss": 0.060271404683589935,
"mean_token_accuracy": 0.9830828458070755,
"num_tokens": 11193370.0,
"step": 1269
},
{
"entropy": 0.8755769431591034,
"epoch": 4.5864253393665155,
"grad_norm": 0.4499802887439728,
"learning_rate": 9.332182345626297e-05,
"loss": 0.019561700522899628,
"mean_token_accuracy": 0.9953616410493851,
"num_tokens": 11202005.0,
"step": 1270
},
{
"entropy": 0.9018907248973846,
"epoch": 4.590045248868778,
"grad_norm": 0.4629576504230499,
"learning_rate": 9.319255044800026e-05,
"loss": 0.018598034977912903,
"mean_token_accuracy": 0.9917204529047012,
"num_tokens": 11210617.0,
"step": 1271
},
{
"entropy": 0.9013173431158066,
"epoch": 4.59366515837104,
"grad_norm": 0.5423282980918884,
"learning_rate": 9.306331335910279e-05,
"loss": 0.03331900015473366,
"mean_token_accuracy": 0.9889495521783829,
"num_tokens": 11219357.0,
"step": 1272
},
{
"entropy": 0.9359396398067474,
"epoch": 4.597285067873303,
"grad_norm": 0.37270912528038025,
"learning_rate": 9.293411246576442e-05,
"loss": 0.01753406971693039,
"mean_token_accuracy": 0.995793953537941,
"num_tokens": 11227697.0,
"step": 1273
},
{
"entropy": 0.8669909536838531,
"epoch": 4.600904977375565,
"grad_norm": 0.7340614795684814,
"learning_rate": 9.280494804410167e-05,
"loss": 0.067268967628479,
"mean_token_accuracy": 0.9793154299259186,
"num_tokens": 11236232.0,
"step": 1274
},
{
"entropy": 0.923119992017746,
"epoch": 4.604524886877828,
"grad_norm": 1.0877807140350342,
"learning_rate": 9.267582037015308e-05,
"loss": 0.05310777574777603,
"mean_token_accuracy": 0.9892044812440872,
"num_tokens": 11244814.0,
"step": 1275
},
{
"entropy": 0.8926683962345123,
"epoch": 4.60814479638009,
"grad_norm": 0.4921732246875763,
"learning_rate": 9.254672971987863e-05,
"loss": 0.027801400050520897,
"mean_token_accuracy": 0.9915051609277725,
"num_tokens": 11253577.0,
"step": 1276
},
{
"entropy": 0.9701418429613113,
"epoch": 4.6117647058823525,
"grad_norm": 0.7067182064056396,
"learning_rate": 9.241767636915923e-05,
"loss": 0.04861636832356453,
"mean_token_accuracy": 0.9905703663825989,
"num_tokens": 11261550.0,
"step": 1277
},
{
"entropy": 0.9237975776195526,
"epoch": 4.615384615384615,
"grad_norm": 0.45785027742385864,
"learning_rate": 9.22886605937961e-05,
"loss": 0.031087510287761688,
"mean_token_accuracy": 0.9908934086561203,
"num_tokens": 11270148.0,
"step": 1278
},
{
"entropy": 0.9537828266620636,
"epoch": 4.619004524886877,
"grad_norm": 1.2928509712219238,
"learning_rate": 9.21596826695101e-05,
"loss": 0.03970994055271149,
"mean_token_accuracy": 0.9891022890806198,
"num_tokens": 11278610.0,
"step": 1279
},
{
"entropy": 0.8938749879598618,
"epoch": 4.62262443438914,
"grad_norm": 0.3910481929779053,
"learning_rate": 9.203074287194118e-05,
"loss": 0.03463595360517502,
"mean_token_accuracy": 0.9896118193864822,
"num_tokens": 11287694.0,
"step": 1280
},
{
"entropy": 0.9483572989702225,
"epoch": 4.626244343891402,
"grad_norm": 0.48045435547828674,
"learning_rate": 9.190184147664791e-05,
"loss": 0.0231014396995306,
"mean_token_accuracy": 0.9937128722667694,
"num_tokens": 11296155.0,
"step": 1281
},
{
"entropy": 0.9255064278841019,
"epoch": 4.629864253393665,
"grad_norm": 0.39855605363845825,
"learning_rate": 9.177297875910667e-05,
"loss": 0.021291583776474,
"mean_token_accuracy": 0.9956228137016296,
"num_tokens": 11304810.0,
"step": 1282
},
{
"entropy": 0.9462402909994125,
"epoch": 4.633484162895927,
"grad_norm": 0.4094499349594116,
"learning_rate": 9.164415499471126e-05,
"loss": 0.032552555203437805,
"mean_token_accuracy": 0.9914036691188812,
"num_tokens": 11313680.0,
"step": 1283
},
{
"entropy": 0.951962873339653,
"epoch": 4.63710407239819,
"grad_norm": 0.7086257934570312,
"learning_rate": 9.151537045877221e-05,
"loss": 0.04020824283361435,
"mean_token_accuracy": 0.9873267412185669,
"num_tokens": 11322162.0,
"step": 1284
},
{
"entropy": 0.9462797939777374,
"epoch": 4.640723981900453,
"grad_norm": 0.6588366627693176,
"learning_rate": 9.138662542651621e-05,
"loss": 0.047957099974155426,
"mean_token_accuracy": 0.9874707162380219,
"num_tokens": 11331228.0,
"step": 1285
},
{
"entropy": 0.9177692830562592,
"epoch": 4.644343891402715,
"grad_norm": 0.39822492003440857,
"learning_rate": 9.125792017308553e-05,
"loss": 0.03488968685269356,
"mean_token_accuracy": 0.9879388362169266,
"num_tokens": 11340479.0,
"step": 1286
},
{
"entropy": 1.017251044511795,
"epoch": 4.647963800904978,
"grad_norm": 0.6734071373939514,
"learning_rate": 9.112925497353746e-05,
"loss": 0.0549258328974247,
"mean_token_accuracy": 0.9842551499605179,
"num_tokens": 11349185.0,
"step": 1287
},
{
"entropy": 0.9144134521484375,
"epoch": 4.65158371040724,
"grad_norm": 0.6358972787857056,
"learning_rate": 9.100063010284366e-05,
"loss": 0.0614742636680603,
"mean_token_accuracy": 0.9769433587789536,
"num_tokens": 11358288.0,
"step": 1288
},
{
"entropy": 0.9544530212879181,
"epoch": 4.655203619909503,
"grad_norm": 0.6581546068191528,
"learning_rate": 9.087204583588951e-05,
"loss": 0.05966397002339363,
"mean_token_accuracy": 0.9798359274864197,
"num_tokens": 11367016.0,
"step": 1289
},
{
"entropy": 0.9357064366340637,
"epoch": 4.658823529411765,
"grad_norm": 0.7771779298782349,
"learning_rate": 9.074350244747379e-05,
"loss": 0.0511971078813076,
"mean_token_accuracy": 0.981827974319458,
"num_tokens": 11376242.0,
"step": 1290
},
{
"entropy": 0.9112809002399445,
"epoch": 4.6624434389140275,
"grad_norm": 0.5351385474205017,
"learning_rate": 9.061500021230782e-05,
"loss": 0.033410944044589996,
"mean_token_accuracy": 0.9908107221126556,
"num_tokens": 11385110.0,
"step": 1291
},
{
"entropy": 0.9920302629470825,
"epoch": 4.66606334841629,
"grad_norm": 0.7133076190948486,
"learning_rate": 9.048653940501499e-05,
"loss": 0.07016268372535706,
"mean_token_accuracy": 0.9680485278367996,
"num_tokens": 11393510.0,
"step": 1292
},
{
"entropy": 0.9926072955131531,
"epoch": 4.669683257918552,
"grad_norm": 0.39922699332237244,
"learning_rate": 9.035812030013013e-05,
"loss": 0.028499318286776543,
"mean_token_accuracy": 0.9919241964817047,
"num_tokens": 11402001.0,
"step": 1293
},
{
"entropy": 0.9475626051425934,
"epoch": 4.673303167420815,
"grad_norm": 0.29215705394744873,
"learning_rate": 9.022974317209902e-05,
"loss": 0.021906405687332153,
"mean_token_accuracy": 0.9935397207736969,
"num_tokens": 11410775.0,
"step": 1294
},
{
"entropy": 0.9720835387706757,
"epoch": 4.676923076923077,
"grad_norm": 0.5957951545715332,
"learning_rate": 9.010140829527767e-05,
"loss": 0.03795255348086357,
"mean_token_accuracy": 0.9885151833295822,
"num_tokens": 11419599.0,
"step": 1295
},
{
"entropy": 0.9771716296672821,
"epoch": 4.68054298642534,
"grad_norm": 0.340427041053772,
"learning_rate": 8.997311594393172e-05,
"loss": 0.027071382850408554,
"mean_token_accuracy": 0.9911787509918213,
"num_tokens": 11428528.0,
"step": 1296
},
{
"entropy": 0.9611377120018005,
"epoch": 4.684162895927602,
"grad_norm": 0.5193417072296143,
"learning_rate": 8.98448663922361e-05,
"loss": 0.05718943476676941,
"mean_token_accuracy": 0.9795946478843689,
"num_tokens": 11437338.0,
"step": 1297
},
{
"entropy": 0.9352683573961258,
"epoch": 4.6877828054298645,
"grad_norm": 0.5943431854248047,
"learning_rate": 8.971665991427414e-05,
"loss": 0.10998387634754181,
"mean_token_accuracy": 0.9796771854162216,
"num_tokens": 11446290.0,
"step": 1298
},
{
"entropy": 0.9648092687129974,
"epoch": 4.691402714932127,
"grad_norm": 0.49834129214286804,
"learning_rate": 8.958849678403716e-05,
"loss": 0.023775417357683182,
"mean_token_accuracy": 0.9925644546747208,
"num_tokens": 11454628.0,
"step": 1299
},
{
"entropy": 1.0155327767133713,
"epoch": 4.695022624434389,
"grad_norm": 0.7852084636688232,
"learning_rate": 8.946037727542389e-05,
"loss": 0.044226959347724915,
"mean_token_accuracy": 0.9850529134273529,
"num_tokens": 11462704.0,
"step": 1300
},
{
"entropy": 0.926691085100174,
"epoch": 4.698642533936652,
"grad_norm": 0.4994286298751831,
"learning_rate": 8.933230166223973e-05,
"loss": 0.039819322526454926,
"mean_token_accuracy": 0.9850803911685944,
"num_tokens": 11471404.0,
"step": 1301
},
{
"entropy": 0.9265129566192627,
"epoch": 4.702262443438914,
"grad_norm": 0.42576301097869873,
"learning_rate": 8.920427021819642e-05,
"loss": 0.02052636444568634,
"mean_token_accuracy": 0.9918248355388641,
"num_tokens": 11480092.0,
"step": 1302
},
{
"entropy": 0.8638421446084976,
"epoch": 4.705882352941177,
"grad_norm": 0.48810967803001404,
"learning_rate": 8.90762832169111e-05,
"loss": 0.025864070281386375,
"mean_token_accuracy": 0.9912404119968414,
"num_tokens": 11489596.0,
"step": 1303
},
{
"entropy": 0.8895996809005737,
"epoch": 4.709502262443439,
"grad_norm": 0.6966197490692139,
"learning_rate": 8.89483409319061e-05,
"loss": 0.05462773144245148,
"mean_token_accuracy": 0.9865398108959198,
"num_tokens": 11499041.0,
"step": 1304
},
{
"entropy": 0.9096980839967728,
"epoch": 4.7131221719457015,
"grad_norm": 0.5785058736801147,
"learning_rate": 8.882044363660813e-05,
"loss": 0.041293296962976456,
"mean_token_accuracy": 0.9886120110750198,
"num_tokens": 11508044.0,
"step": 1305
},
{
"entropy": 0.9218699336051941,
"epoch": 4.716742081447964,
"grad_norm": 0.7000371217727661,
"learning_rate": 8.869259160434776e-05,
"loss": 0.04127487540245056,
"mean_token_accuracy": 0.9850119203329086,
"num_tokens": 11516859.0,
"step": 1306
},
{
"entropy": 0.8685460537672043,
"epoch": 4.720361990950226,
"grad_norm": 0.6293456554412842,
"learning_rate": 8.856478510835878e-05,
"loss": 0.0625356063246727,
"mean_token_accuracy": 0.9840066283941269,
"num_tokens": 11526405.0,
"step": 1307
},
{
"entropy": 0.9357750713825226,
"epoch": 4.723981900452489,
"grad_norm": 0.5446988344192505,
"learning_rate": 8.843702442177777e-05,
"loss": 0.06257246434688568,
"mean_token_accuracy": 0.982016310095787,
"num_tokens": 11535367.0,
"step": 1308
},
{
"entropy": 0.9088067710399628,
"epoch": 4.727601809954751,
"grad_norm": 0.8511834740638733,
"learning_rate": 8.830930981764331e-05,
"loss": 0.07266386598348618,
"mean_token_accuracy": 0.9781930446624756,
"num_tokens": 11544182.0,
"step": 1309
},
{
"entropy": 0.8685061484575272,
"epoch": 4.731221719457014,
"grad_norm": 0.41793859004974365,
"learning_rate": 8.818164156889557e-05,
"loss": 0.01888015680015087,
"mean_token_accuracy": 0.9903381317853928,
"num_tokens": 11553094.0,
"step": 1310
},
{
"entropy": 0.9195333868265152,
"epoch": 4.734841628959276,
"grad_norm": 0.45836082100868225,
"learning_rate": 8.805401994837552e-05,
"loss": 0.04215538874268532,
"mean_token_accuracy": 0.988913893699646,
"num_tokens": 11561681.0,
"step": 1311
},
{
"entropy": 0.8929650038480759,
"epoch": 4.7384615384615385,
"grad_norm": 0.5067728757858276,
"learning_rate": 8.79264452288247e-05,
"loss": 0.05104423314332962,
"mean_token_accuracy": 0.9887937307357788,
"num_tokens": 11570799.0,
"step": 1312
},
{
"entropy": 0.9310666769742966,
"epoch": 4.742081447963801,
"grad_norm": 0.5647673606872559,
"learning_rate": 8.77989176828842e-05,
"loss": 0.044227782636880875,
"mean_token_accuracy": 0.9847547262907028,
"num_tokens": 11579550.0,
"step": 1313
},
{
"entropy": 0.866066038608551,
"epoch": 4.745701357466063,
"grad_norm": 0.7207821607589722,
"learning_rate": 8.767143758309441e-05,
"loss": 0.05543539673089981,
"mean_token_accuracy": 0.9844647198915482,
"num_tokens": 11589395.0,
"step": 1314
},
{
"entropy": 0.8902018964290619,
"epoch": 4.749321266968326,
"grad_norm": 0.6174284219741821,
"learning_rate": 8.754400520189434e-05,
"loss": 0.05488777905702591,
"mean_token_accuracy": 0.9817168563604355,
"num_tokens": 11598532.0,
"step": 1315
},
{
"entropy": 0.9263211190700531,
"epoch": 4.752941176470588,
"grad_norm": 0.5040092468261719,
"learning_rate": 8.741662081162101e-05,
"loss": 0.03482822701334953,
"mean_token_accuracy": 0.9891321510076523,
"num_tokens": 11607186.0,
"step": 1316
},
{
"entropy": 0.8952204138040543,
"epoch": 4.756561085972851,
"grad_norm": 0.581134021282196,
"learning_rate": 8.728928468450872e-05,
"loss": 0.039117198437452316,
"mean_token_accuracy": 0.9872443825006485,
"num_tokens": 11616135.0,
"step": 1317
},
{
"entropy": 0.903697595000267,
"epoch": 4.760180995475113,
"grad_norm": 0.5381007790565491,
"learning_rate": 8.716199709268888e-05,
"loss": 0.044438499957323074,
"mean_token_accuracy": 0.9856991767883301,
"num_tokens": 11624827.0,
"step": 1318
},
{
"entropy": 0.8752525150775909,
"epoch": 4.7638009049773755,
"grad_norm": 0.3217228353023529,
"learning_rate": 8.703475830818897e-05,
"loss": 0.02365241013467312,
"mean_token_accuracy": 0.9894928485155106,
"num_tokens": 11633905.0,
"step": 1319
},
{
"entropy": 0.8906976282596588,
"epoch": 4.767420814479638,
"grad_norm": 0.49503424763679504,
"learning_rate": 8.690756860293228e-05,
"loss": 0.05426553636789322,
"mean_token_accuracy": 0.9833587259054184,
"num_tokens": 11642732.0,
"step": 1320
},
{
"entropy": 0.8395461142063141,
"epoch": 4.7710407239819,
"grad_norm": 0.5068437457084656,
"learning_rate": 8.678042824873718e-05,
"loss": 0.037847042083740234,
"mean_token_accuracy": 0.9877204298973083,
"num_tokens": 11651835.0,
"step": 1321
},
{
"entropy": 0.833812415599823,
"epoch": 4.774660633484163,
"grad_norm": 0.33258169889450073,
"learning_rate": 8.665333751731657e-05,
"loss": 0.023194529116153717,
"mean_token_accuracy": 0.9938502311706543,
"num_tokens": 11661353.0,
"step": 1322
},
{
"entropy": 0.8494678735733032,
"epoch": 4.778280542986425,
"grad_norm": 0.41458430886268616,
"learning_rate": 8.652629668027731e-05,
"loss": 0.02174549549818039,
"mean_token_accuracy": 0.9927671700716019,
"num_tokens": 11670581.0,
"step": 1323
},
{
"entropy": 0.9297997653484344,
"epoch": 4.781900452488688,
"grad_norm": 0.6218132972717285,
"learning_rate": 8.639930600911958e-05,
"loss": 0.03217285871505737,
"mean_token_accuracy": 0.9903489202260971,
"num_tokens": 11679163.0,
"step": 1324
},
{
"entropy": 0.9243068993091583,
"epoch": 4.78552036199095,
"grad_norm": 0.6559213399887085,
"learning_rate": 8.627236577523638e-05,
"loss": 0.0579834058880806,
"mean_token_accuracy": 0.9795654565095901,
"num_tokens": 11687686.0,
"step": 1325
},
{
"entropy": 0.8774077147245407,
"epoch": 4.7891402714932125,
"grad_norm": 0.3600568175315857,
"learning_rate": 8.614547624991298e-05,
"loss": 0.03028794750571251,
"mean_token_accuracy": 0.9934851080179214,
"num_tokens": 11696611.0,
"step": 1326
},
{
"entropy": 0.892438679933548,
"epoch": 4.792760180995475,
"grad_norm": 0.4619959592819214,
"learning_rate": 8.601863770432621e-05,
"loss": 0.025009188801050186,
"mean_token_accuracy": 0.9917362928390503,
"num_tokens": 11705104.0,
"step": 1327
},
{
"entropy": 0.8198121190071106,
"epoch": 4.796380090497737,
"grad_norm": 0.3183024227619171,
"learning_rate": 8.589185040954397e-05,
"loss": 0.01881255768239498,
"mean_token_accuracy": 0.9933484643697739,
"num_tokens": 11714363.0,
"step": 1328
},
{
"entropy": 0.8623606115579605,
"epoch": 4.8,
"grad_norm": 0.4741048216819763,
"learning_rate": 8.576511463652459e-05,
"loss": 0.03326078876852989,
"mean_token_accuracy": 0.9923247247934341,
"num_tokens": 11723196.0,
"step": 1329
},
{
"entropy": 0.8758032768964767,
"epoch": 4.803619909502262,
"grad_norm": 0.4637037515640259,
"learning_rate": 8.563843065611644e-05,
"loss": 0.016431959345936775,
"mean_token_accuracy": 0.9943708181381226,
"num_tokens": 11731543.0,
"step": 1330
},
{
"entropy": 0.8611963838338852,
"epoch": 4.807239819004525,
"grad_norm": 0.32876336574554443,
"learning_rate": 8.551179873905695e-05,
"loss": 0.011545347981154919,
"mean_token_accuracy": 0.9968069344758987,
"num_tokens": 11740082.0,
"step": 1331
},
{
"entropy": 0.878911018371582,
"epoch": 4.810859728506787,
"grad_norm": 0.5621935129165649,
"learning_rate": 8.538521915597255e-05,
"loss": 0.03741168975830078,
"mean_token_accuracy": 0.9872355908155441,
"num_tokens": 11748731.0,
"step": 1332
},
{
"entropy": 0.8687689304351807,
"epoch": 4.8144796380090495,
"grad_norm": 0.6842139959335327,
"learning_rate": 8.525869217737765e-05,
"loss": 0.04332878440618515,
"mean_token_accuracy": 0.9868139326572418,
"num_tokens": 11757407.0,
"step": 1333
},
{
"entropy": 0.8257188647985458,
"epoch": 4.818099547511312,
"grad_norm": 0.6337531805038452,
"learning_rate": 8.513221807367431e-05,
"loss": 0.05870246887207031,
"mean_token_accuracy": 0.9833265393972397,
"num_tokens": 11766726.0,
"step": 1334
},
{
"entropy": 0.8069233894348145,
"epoch": 4.821719457013574,
"grad_norm": 0.5104876160621643,
"learning_rate": 8.500579711515157e-05,
"loss": 0.060887325555086136,
"mean_token_accuracy": 0.9869293421506882,
"num_tokens": 11776135.0,
"step": 1335
},
{
"entropy": 0.8201400488615036,
"epoch": 4.825339366515837,
"grad_norm": 0.5308789610862732,
"learning_rate": 8.487942957198494e-05,
"loss": 0.0418221578001976,
"mean_token_accuracy": 0.9881918877363205,
"num_tokens": 11785483.0,
"step": 1336
},
{
"entropy": 0.7880836576223373,
"epoch": 4.828959276018099,
"grad_norm": 0.7861600518226624,
"learning_rate": 8.47531157142357e-05,
"loss": 0.05443998798727989,
"mean_token_accuracy": 0.9877728223800659,
"num_tokens": 11794707.0,
"step": 1337
},
{
"entropy": 0.8260870426893234,
"epoch": 4.832579185520362,
"grad_norm": 0.487807959318161,
"learning_rate": 8.462685581185041e-05,
"loss": 0.048258986324071884,
"mean_token_accuracy": 0.9844470620155334,
"num_tokens": 11803756.0,
"step": 1338
},
{
"entropy": 0.8445771187543869,
"epoch": 4.836199095022624,
"grad_norm": 0.6421375274658203,
"learning_rate": 8.450065013466038e-05,
"loss": 0.06861534714698792,
"mean_token_accuracy": 0.980402871966362,
"num_tokens": 11812898.0,
"step": 1339
},
{
"entropy": 0.8794773668050766,
"epoch": 4.839819004524887,
"grad_norm": 0.6781204342842102,
"learning_rate": 8.437449895238103e-05,
"loss": 0.03358523175120354,
"mean_token_accuracy": 0.9903728663921356,
"num_tokens": 11821573.0,
"step": 1340
},
{
"entropy": 0.8457778543233871,
"epoch": 4.843438914027149,
"grad_norm": 0.44379106163978577,
"learning_rate": 8.424840253461122e-05,
"loss": 0.029661916196346283,
"mean_token_accuracy": 0.9889777451753616,
"num_tokens": 11830502.0,
"step": 1341
},
{
"entropy": 0.8524041026830673,
"epoch": 4.847058823529411,
"grad_norm": 0.5640258193016052,
"learning_rate": 8.412236115083285e-05,
"loss": 0.038459956645965576,
"mean_token_accuracy": 0.9902327507734299,
"num_tokens": 11839647.0,
"step": 1342
},
{
"entropy": 0.8478478789329529,
"epoch": 4.850678733031674,
"grad_norm": 0.5391163229942322,
"learning_rate": 8.399637507041029e-05,
"loss": 0.044210609048604965,
"mean_token_accuracy": 0.982999712228775,
"num_tokens": 11848488.0,
"step": 1343
},
{
"entropy": 0.8328516036272049,
"epoch": 4.854298642533936,
"grad_norm": 0.635456383228302,
"learning_rate": 8.387044456258952e-05,
"loss": 0.030223991721868515,
"mean_token_accuracy": 0.9904019236564636,
"num_tokens": 11857067.0,
"step": 1344
},
{
"entropy": 0.869571715593338,
"epoch": 4.857918552036199,
"grad_norm": 0.6396393775939941,
"learning_rate": 8.37445698964979e-05,
"loss": 0.04591453820466995,
"mean_token_accuracy": 0.9870909750461578,
"num_tokens": 11865616.0,
"step": 1345
},
{
"entropy": 0.8614618182182312,
"epoch": 4.861538461538462,
"grad_norm": 0.4452335238456726,
"learning_rate": 8.361875134114343e-05,
"loss": 0.033750128000974655,
"mean_token_accuracy": 0.9905308783054352,
"num_tokens": 11874492.0,
"step": 1346
},
{
"entropy": 0.8712886422872543,
"epoch": 4.8651583710407245,
"grad_norm": 0.698449969291687,
"learning_rate": 8.349298916541415e-05,
"loss": 0.048250485211610794,
"mean_token_accuracy": 0.9835482239723206,
"num_tokens": 11883017.0,
"step": 1347
},
{
"entropy": 0.8200095295906067,
"epoch": 4.868778280542987,
"grad_norm": 0.47245922684669495,
"learning_rate": 8.336728363807767e-05,
"loss": 0.036576300859451294,
"mean_token_accuracy": 0.9883123934268951,
"num_tokens": 11891712.0,
"step": 1348
},
{
"entropy": 0.9147795736789703,
"epoch": 4.872398190045249,
"grad_norm": 0.5961781144142151,
"learning_rate": 8.324163502778048e-05,
"loss": 0.02409369871020317,
"mean_token_accuracy": 0.9936271756887436,
"num_tokens": 11899916.0,
"step": 1349
},
{
"entropy": 0.8785306960344315,
"epoch": 4.876018099547512,
"grad_norm": 0.5444777607917786,
"learning_rate": 8.31160436030475e-05,
"loss": 0.09001006931066513,
"mean_token_accuracy": 0.9791886657476425,
"num_tokens": 11908640.0,
"step": 1350
},
{
"entropy": 0.8604621440172195,
"epoch": 4.879638009049774,
"grad_norm": 0.58026123046875,
"learning_rate": 8.299050963228133e-05,
"loss": 0.034584179520606995,
"mean_token_accuracy": 0.9857950210571289,
"num_tokens": 11917480.0,
"step": 1351
},
{
"entropy": 0.8894483149051666,
"epoch": 4.883257918552037,
"grad_norm": 0.6459085941314697,
"learning_rate": 8.286503338376186e-05,
"loss": 0.05245271325111389,
"mean_token_accuracy": 0.9862827807664871,
"num_tokens": 11925818.0,
"step": 1352
},
{
"entropy": 0.8818574398756027,
"epoch": 4.886877828054299,
"grad_norm": 0.36797812581062317,
"learning_rate": 8.273961512564566e-05,
"loss": 0.026651078835129738,
"mean_token_accuracy": 0.9902130663394928,
"num_tokens": 11934907.0,
"step": 1353
},
{
"entropy": 0.8988694995641708,
"epoch": 4.8904977375565615,
"grad_norm": 0.43273940682411194,
"learning_rate": 8.261425512596525e-05,
"loss": 0.03141481429338455,
"mean_token_accuracy": 0.9927571415901184,
"num_tokens": 11943693.0,
"step": 1354
},
{
"entropy": 0.9035229533910751,
"epoch": 4.894117647058824,
"grad_norm": 0.5842182636260986,
"learning_rate": 8.24889536526288e-05,
"loss": 0.06542620062828064,
"mean_token_accuracy": 0.9820240437984467,
"num_tokens": 11952674.0,
"step": 1355
},
{
"entropy": 0.882953867316246,
"epoch": 4.897737556561086,
"grad_norm": 0.6475455164909363,
"learning_rate": 8.236371097341925e-05,
"loss": 0.03121659904718399,
"mean_token_accuracy": 0.9919475317001343,
"num_tokens": 11961391.0,
"step": 1356
},
{
"entropy": 0.8515945225954056,
"epoch": 4.901357466063349,
"grad_norm": 0.4392819106578827,
"learning_rate": 8.223852735599402e-05,
"loss": 0.029448354616761208,
"mean_token_accuracy": 0.9892574101686478,
"num_tokens": 11970576.0,
"step": 1357
},
{
"entropy": 0.967382162809372,
"epoch": 4.904977375565611,
"grad_norm": 0.6789311170578003,
"learning_rate": 8.21134030678842e-05,
"loss": 0.03611960634589195,
"mean_token_accuracy": 0.9896086901426315,
"num_tokens": 11978784.0,
"step": 1358
},
{
"entropy": 0.9994252473115921,
"epoch": 4.908597285067874,
"grad_norm": 0.5801035761833191,
"learning_rate": 8.198833837649412e-05,
"loss": 0.03757525607943535,
"mean_token_accuracy": 0.9898461997509003,
"num_tokens": 11986702.0,
"step": 1359
},
{
"entropy": 0.951177105307579,
"epoch": 4.912217194570136,
"grad_norm": 0.5682723522186279,
"learning_rate": 8.186333354910076e-05,
"loss": 0.030282404273748398,
"mean_token_accuracy": 0.9920378625392914,
"num_tokens": 11995195.0,
"step": 1360
},
{
"entropy": 0.9082824736833572,
"epoch": 4.9158371040723985,
"grad_norm": 0.4542546272277832,
"learning_rate": 8.17383888528532e-05,
"loss": 0.03241301700472832,
"mean_token_accuracy": 0.9934439212083817,
"num_tokens": 12003988.0,
"step": 1361
},
{
"entropy": 0.8864389061927795,
"epoch": 4.919457013574661,
"grad_norm": 0.5183593034744263,
"learning_rate": 8.161350455477197e-05,
"loss": 0.03903008624911308,
"mean_token_accuracy": 0.9880173355340958,
"num_tokens": 12012979.0,
"step": 1362
},
{
"entropy": 0.9383509159088135,
"epoch": 4.923076923076923,
"grad_norm": 0.4837839603424072,
"learning_rate": 8.14886809217485e-05,
"loss": 0.041375696659088135,
"mean_token_accuracy": 0.9865945130586624,
"num_tokens": 12021717.0,
"step": 1363
},
{
"entropy": 0.9083155989646912,
"epoch": 4.926696832579186,
"grad_norm": 0.3800016939640045,
"learning_rate": 8.136391822054466e-05,
"loss": 0.019948210567235947,
"mean_token_accuracy": 0.991382360458374,
"num_tokens": 12030384.0,
"step": 1364
},
{
"entropy": 0.8765196651220322,
"epoch": 4.930316742081448,
"grad_norm": 0.5050578713417053,
"learning_rate": 8.123921671779193e-05,
"loss": 0.029966101050376892,
"mean_token_accuracy": 0.9913382828235626,
"num_tokens": 12039517.0,
"step": 1365
},
{
"entropy": 0.8959863632917404,
"epoch": 4.933936651583711,
"grad_norm": 0.5598185658454895,
"learning_rate": 8.111457667999123e-05,
"loss": 0.041590698063373566,
"mean_token_accuracy": 0.9848756641149521,
"num_tokens": 12048401.0,
"step": 1366
},
{
"entropy": 0.9154993742704391,
"epoch": 4.937556561085973,
"grad_norm": 0.7497878670692444,
"learning_rate": 8.098999837351193e-05,
"loss": 0.060688458383083344,
"mean_token_accuracy": 0.9851639270782471,
"num_tokens": 12057061.0,
"step": 1367
},
{
"entropy": 0.8572700619697571,
"epoch": 4.9411764705882355,
"grad_norm": 0.39258310198783875,
"learning_rate": 8.086548206459157e-05,
"loss": 0.03950543329119682,
"mean_token_accuracy": 0.9885773807764053,
"num_tokens": 12066074.0,
"step": 1368
},
{
"entropy": 0.9203635454177856,
"epoch": 4.944796380090498,
"grad_norm": 0.4106742739677429,
"learning_rate": 8.07410280193352e-05,
"loss": 0.028075195848941803,
"mean_token_accuracy": 0.9931609779596329,
"num_tokens": 12074134.0,
"step": 1369
},
{
"entropy": 0.9020792841911316,
"epoch": 4.94841628959276,
"grad_norm": 0.627805233001709,
"learning_rate": 8.061663650371478e-05,
"loss": 0.06281647086143494,
"mean_token_accuracy": 0.9771750569343567,
"num_tokens": 12083039.0,
"step": 1370
},
{
"entropy": 0.8852731287479401,
"epoch": 4.952036199095023,
"grad_norm": 0.657303512096405,
"learning_rate": 8.049230778356864e-05,
"loss": 0.029552282765507698,
"mean_token_accuracy": 0.991500198841095,
"num_tokens": 12091688.0,
"step": 1371
},
{
"entropy": 0.8907477855682373,
"epoch": 4.955656108597285,
"grad_norm": 0.5756560564041138,
"learning_rate": 8.036804212460085e-05,
"loss": 0.06554236263036728,
"mean_token_accuracy": 0.9861236363649368,
"num_tokens": 12101040.0,
"step": 1372
},
{
"entropy": 0.901018038392067,
"epoch": 4.959276018099548,
"grad_norm": 0.6850928664207458,
"learning_rate": 8.024383979238082e-05,
"loss": 0.028915666043758392,
"mean_token_accuracy": 0.9911025762557983,
"num_tokens": 12109748.0,
"step": 1373
},
{
"entropy": 0.8686677515506744,
"epoch": 4.96289592760181,
"grad_norm": 0.4919429123401642,
"learning_rate": 8.011970105234254e-05,
"loss": 0.04680994898080826,
"mean_token_accuracy": 0.9891356080770493,
"num_tokens": 12118627.0,
"step": 1374
},
{
"entropy": 0.9104600250720978,
"epoch": 4.9665158371040725,
"grad_norm": 0.4508710205554962,
"learning_rate": 7.999562616978418e-05,
"loss": 0.022277040407061577,
"mean_token_accuracy": 0.9955233782529831,
"num_tokens": 12126969.0,
"step": 1375
},
{
"entropy": 0.9048454165458679,
"epoch": 4.970135746606335,
"grad_norm": 0.5170649290084839,
"learning_rate": 7.987161540986733e-05,
"loss": 0.02180103212594986,
"mean_token_accuracy": 0.9945137053728104,
"num_tokens": 12135468.0,
"step": 1376
},
{
"entropy": 0.8686342835426331,
"epoch": 4.973755656108597,
"grad_norm": 0.8190595507621765,
"learning_rate": 7.974766903761663e-05,
"loss": 0.05652451515197754,
"mean_token_accuracy": 0.9849314540624619,
"num_tokens": 12144295.0,
"step": 1377
},
{
"entropy": 0.862045481801033,
"epoch": 4.97737556561086,
"grad_norm": 0.6442601084709167,
"learning_rate": 7.962378731791913e-05,
"loss": 0.0695543959736824,
"mean_token_accuracy": 0.9838263392448425,
"num_tokens": 12153149.0,
"step": 1378
},
{
"entropy": 0.8685291558504105,
"epoch": 4.980995475113122,
"grad_norm": 0.4403861463069916,
"learning_rate": 7.949997051552358e-05,
"loss": 0.02654072642326355,
"mean_token_accuracy": 0.9921073764562607,
"num_tokens": 12162005.0,
"step": 1379
},
{
"entropy": 0.8518020212650299,
"epoch": 4.984615384615385,
"grad_norm": 0.5914137959480286,
"learning_rate": 7.937621889504015e-05,
"loss": 0.028411295264959335,
"mean_token_accuracy": 0.9918225407600403,
"num_tokens": 12171200.0,
"step": 1380
},
{
"entropy": 0.8667004555463791,
"epoch": 4.988235294117647,
"grad_norm": 0.4217661917209625,
"learning_rate": 7.925253272093959e-05,
"loss": 0.03126171976327896,
"mean_token_accuracy": 0.990281954407692,
"num_tokens": 12180238.0,
"step": 1381
},
{
"entropy": 0.8675567209720612,
"epoch": 4.9918552036199095,
"grad_norm": 0.5172854065895081,
"learning_rate": 7.912891225755288e-05,
"loss": 0.04897421598434448,
"mean_token_accuracy": 0.9878519624471664,
"num_tokens": 12189686.0,
"step": 1382
},
{
"entropy": 0.8342727571725845,
"epoch": 4.995475113122172,
"grad_norm": 0.6539183259010315,
"learning_rate": 7.900535776907049e-05,
"loss": 0.11599453538656235,
"mean_token_accuracy": 0.9718467444181442,
"num_tokens": 12199160.0,
"step": 1383
},
{
"entropy": 0.8954996764659882,
"epoch": 4.999095022624434,
"grad_norm": 0.43926727771759033,
"learning_rate": 7.888186951954197e-05,
"loss": 0.022596832364797592,
"mean_token_accuracy": 0.9941199272871017,
"num_tokens": 12207819.0,
"step": 1384
},
{
"entropy": 0.7847009301185608,
"epoch": 5.0,
"grad_norm": 3.054668426513672,
"learning_rate": 7.875844777287526e-05,
"loss": 0.07187020778656006,
"mean_token_accuracy": 0.9810426831245422,
"num_tokens": 12208625.0,
"step": 1385
},
{
"epoch": 5.0,
"eval_entropy": 0.8913698419322812,
"eval_loss": 0.11407072842121124,
"eval_mean_token_accuracy": 0.9732460108229785,
"eval_num_tokens": 12208625.0,
"eval_runtime": 31.7957,
"eval_samples_per_second": 11.605,
"eval_steps_per_second": 3.868,
"step": 1385
}
],
"logging_steps": 1,
"max_steps": 2216,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 9.473572203031882e+18,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}