hal9000-adapter / checkpoint-460 /trainer_state.json
michaelwaves's picture
Add files using upload-large-folder tool
0a4972f verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 460,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.991020917892456,
"epoch": 0.002177463255307567,
"grad_norm": 1.7277425527572632,
"learning_rate": 0.0,
"loss": 2.1366,
"mean_token_accuracy": 0.628267303109169,
"num_tokens": 3878.0,
"step": 1
},
{
"entropy": 0.957662433385849,
"epoch": 0.004354926510615134,
"grad_norm": 2.072942018508911,
"learning_rate": 1.4285714285714285e-05,
"loss": 2.1489,
"mean_token_accuracy": 0.6321403831243515,
"num_tokens": 7754.0,
"step": 2
},
{
"entropy": 0.9688078463077545,
"epoch": 0.0065323897659227,
"grad_norm": 1.712500810623169,
"learning_rate": 2.857142857142857e-05,
"loss": 2.1006,
"mean_token_accuracy": 0.6395954489707947,
"num_tokens": 11724.0,
"step": 3
},
{
"entropy": 0.9446051567792892,
"epoch": 0.008709853021230268,
"grad_norm": 1.6249885559082031,
"learning_rate": 4.2857142857142856e-05,
"loss": 1.8636,
"mean_token_accuracy": 0.6592330932617188,
"num_tokens": 15998.0,
"step": 4
},
{
"entropy": 0.9482486844062805,
"epoch": 0.010887316276537834,
"grad_norm": 1.2645702362060547,
"learning_rate": 5.714285714285714e-05,
"loss": 1.6381,
"mean_token_accuracy": 0.6715894490480423,
"num_tokens": 20034.0,
"step": 5
},
{
"entropy": 0.8820638656616211,
"epoch": 0.0130647795318454,
"grad_norm": 0.9111854434013367,
"learning_rate": 7.142857142857143e-05,
"loss": 1.256,
"mean_token_accuracy": 0.7338996976613998,
"num_tokens": 24592.0,
"step": 6
},
{
"entropy": 0.866950273513794,
"epoch": 0.015242242787152967,
"grad_norm": 0.6964920163154602,
"learning_rate": 8.571428571428571e-05,
"loss": 1.0385,
"mean_token_accuracy": 0.7606792002916336,
"num_tokens": 29247.0,
"step": 7
},
{
"entropy": 0.9245865046977997,
"epoch": 0.017419706042460535,
"grad_norm": 0.6615565419197083,
"learning_rate": 0.0001,
"loss": 0.9594,
"mean_token_accuracy": 0.7808533608913422,
"num_tokens": 33561.0,
"step": 8
},
{
"entropy": 0.8866463452577591,
"epoch": 0.0195971692977681,
"grad_norm": 0.5024364590644836,
"learning_rate": 0.00011428571428571428,
"loss": 0.8709,
"mean_token_accuracy": 0.7967555373907089,
"num_tokens": 37956.0,
"step": 9
},
{
"entropy": 0.8838604241609573,
"epoch": 0.021774632553075667,
"grad_norm": 0.637697696685791,
"learning_rate": 0.00012857142857142858,
"loss": 0.8448,
"mean_token_accuracy": 0.7953355461359024,
"num_tokens": 41607.0,
"step": 10
},
{
"entropy": 0.8180341869592667,
"epoch": 0.023952095808383235,
"grad_norm": 0.5411834120750427,
"learning_rate": 0.00014285714285714287,
"loss": 0.7641,
"mean_token_accuracy": 0.8057558983564377,
"num_tokens": 45872.0,
"step": 11
},
{
"entropy": 0.6423389464616776,
"epoch": 0.0261295590636908,
"grad_norm": 0.5807392597198486,
"learning_rate": 0.00015714285714285716,
"loss": 0.6364,
"mean_token_accuracy": 0.8353168815374374,
"num_tokens": 50197.0,
"step": 12
},
{
"entropy": 0.7770279943943024,
"epoch": 0.028307022318998367,
"grad_norm": 0.602966845035553,
"learning_rate": 0.00017142857142857143,
"loss": 0.911,
"mean_token_accuracy": 0.8115980476140976,
"num_tokens": 55436.0,
"step": 13
},
{
"entropy": 0.6030550897121429,
"epoch": 0.030484485574305935,
"grad_norm": 0.471264511346817,
"learning_rate": 0.00018571428571428572,
"loss": 0.6506,
"mean_token_accuracy": 0.8220222592353821,
"num_tokens": 59509.0,
"step": 14
},
{
"entropy": 0.5797188133001328,
"epoch": 0.0326619488296135,
"grad_norm": 0.3981204628944397,
"learning_rate": 0.0002,
"loss": 0.6439,
"mean_token_accuracy": 0.8296175897121429,
"num_tokens": 63811.0,
"step": 15
},
{
"entropy": 0.5227785632014275,
"epoch": 0.03483941208492107,
"grad_norm": 0.3803451955318451,
"learning_rate": 0.00019999776724666853,
"loss": 0.5614,
"mean_token_accuracy": 0.8536529093980789,
"num_tokens": 67933.0,
"step": 16
},
{
"entropy": 0.5339454486966133,
"epoch": 0.037016875340228635,
"grad_norm": 0.4023122489452362,
"learning_rate": 0.00019999106909745614,
"loss": 0.5768,
"mean_token_accuracy": 0.8468181490898132,
"num_tokens": 71929.0,
"step": 17
},
{
"entropy": 0.5080433636903763,
"epoch": 0.0391943385955362,
"grad_norm": 0.359109103679657,
"learning_rate": 0.0001999799058847031,
"loss": 0.5158,
"mean_token_accuracy": 0.8626691251993179,
"num_tokens": 76116.0,
"step": 18
},
{
"entropy": 0.49260225892066956,
"epoch": 0.04137180185084377,
"grad_norm": 0.34172919392585754,
"learning_rate": 0.00019996427816229171,
"loss": 0.5121,
"mean_token_accuracy": 0.8724553287029266,
"num_tokens": 80000.0,
"step": 19
},
{
"entropy": 0.5065008923411369,
"epoch": 0.043549265106151334,
"grad_norm": 0.4033750295639038,
"learning_rate": 0.00019994418670561857,
"loss": 0.5636,
"mean_token_accuracy": 0.8592322468757629,
"num_tokens": 83682.0,
"step": 20
},
{
"entropy": 0.54892348498106,
"epoch": 0.0457267283614589,
"grad_norm": 0.41379520297050476,
"learning_rate": 0.00019991963251155627,
"loss": 0.5693,
"mean_token_accuracy": 0.8495212495326996,
"num_tokens": 87684.0,
"step": 21
},
{
"entropy": 0.4928950071334839,
"epoch": 0.04790419161676647,
"grad_norm": 0.3717893362045288,
"learning_rate": 0.00019989061679840392,
"loss": 0.523,
"mean_token_accuracy": 0.8606368601322174,
"num_tokens": 91550.0,
"step": 22
},
{
"entropy": 0.5253347381949425,
"epoch": 0.050081654872074034,
"grad_norm": 0.3741125166416168,
"learning_rate": 0.0001998571410058266,
"loss": 0.5433,
"mean_token_accuracy": 0.8630485236644745,
"num_tokens": 95625.0,
"step": 23
},
{
"entropy": 0.5028375387191772,
"epoch": 0.0522591181273816,
"grad_norm": 1.8555870056152344,
"learning_rate": 0.00019981920679478407,
"loss": 0.5296,
"mean_token_accuracy": 0.8609876334667206,
"num_tokens": 99517.0,
"step": 24
},
{
"entropy": 0.5414893701672554,
"epoch": 0.05443658138268917,
"grad_norm": 0.44715237617492676,
"learning_rate": 0.00019977681604744824,
"loss": 0.5782,
"mean_token_accuracy": 0.8441034108400345,
"num_tokens": 103204.0,
"step": 25
},
{
"entropy": 0.48021427541971207,
"epoch": 0.056614044637996734,
"grad_norm": 0.31098225712776184,
"learning_rate": 0.0001997299708671098,
"loss": 0.4932,
"mean_token_accuracy": 0.8744789958000183,
"num_tokens": 107327.0,
"step": 26
},
{
"entropy": 0.46857017278671265,
"epoch": 0.0587915078933043,
"grad_norm": 0.3036307692527771,
"learning_rate": 0.00019967867357807391,
"loss": 0.4791,
"mean_token_accuracy": 0.8786111921072006,
"num_tokens": 111453.0,
"step": 27
},
{
"entropy": 0.49031493067741394,
"epoch": 0.06096897114861187,
"grad_norm": 0.3337958753108978,
"learning_rate": 0.00019962292672554493,
"loss": 0.5018,
"mean_token_accuracy": 0.8619510382413864,
"num_tokens": 115266.0,
"step": 28
},
{
"entropy": 0.4807809889316559,
"epoch": 0.06314643440391943,
"grad_norm": 0.35365355014801025,
"learning_rate": 0.00019956273307549988,
"loss": 0.4877,
"mean_token_accuracy": 0.8618622571229935,
"num_tokens": 118928.0,
"step": 29
},
{
"entropy": 0.40949247032403946,
"epoch": 0.065323897659227,
"grad_norm": 0.3226538300514221,
"learning_rate": 0.00019949809561455156,
"loss": 0.4133,
"mean_token_accuracy": 0.8882981538772583,
"num_tokens": 122893.0,
"step": 30
},
{
"entropy": 0.49030745029449463,
"epoch": 0.06750136091453457,
"grad_norm": 0.33420825004577637,
"learning_rate": 0.0001994290175498001,
"loss": 0.503,
"mean_token_accuracy": 0.8634953200817108,
"num_tokens": 127132.0,
"step": 31
},
{
"entropy": 0.49527500569820404,
"epoch": 0.06967882416984214,
"grad_norm": 0.4112333655357361,
"learning_rate": 0.00019935550230867392,
"loss": 0.5067,
"mean_token_accuracy": 0.8607686161994934,
"num_tokens": 131100.0,
"step": 32
},
{
"entropy": 0.5203969404101372,
"epoch": 0.0718562874251497,
"grad_norm": 1.3927068710327148,
"learning_rate": 0.00019927755353875965,
"loss": 0.5942,
"mean_token_accuracy": 0.8566101640462875,
"num_tokens": 135503.0,
"step": 33
},
{
"entropy": 0.5023058727383614,
"epoch": 0.07403375068045727,
"grad_norm": 0.4079550802707672,
"learning_rate": 0.00019919517510762124,
"loss": 0.4961,
"mean_token_accuracy": 0.8630523085594177,
"num_tokens": 139771.0,
"step": 34
},
{
"entropy": 0.4864235520362854,
"epoch": 0.07621121393576484,
"grad_norm": 0.39264485239982605,
"learning_rate": 0.0001991083711026079,
"loss": 0.5009,
"mean_token_accuracy": 0.871365949511528,
"num_tokens": 143980.0,
"step": 35
},
{
"entropy": 0.525127612054348,
"epoch": 0.0783886771910724,
"grad_norm": 0.4305553734302521,
"learning_rate": 0.00019901714583065152,
"loss": 0.4872,
"mean_token_accuracy": 0.8670255392789841,
"num_tokens": 148059.0,
"step": 36
},
{
"entropy": 0.5707878470420837,
"epoch": 0.08056614044637997,
"grad_norm": 0.44129130244255066,
"learning_rate": 0.00019892150381805267,
"loss": 0.581,
"mean_token_accuracy": 0.844414696097374,
"num_tokens": 152230.0,
"step": 37
},
{
"entropy": 0.5001106485724449,
"epoch": 0.08274360370168754,
"grad_norm": 0.5918931365013123,
"learning_rate": 0.00019882144981025633,
"loss": 0.4751,
"mean_token_accuracy": 0.8649907559156418,
"num_tokens": 156252.0,
"step": 38
},
{
"entropy": 0.5312293991446495,
"epoch": 0.0849210669569951,
"grad_norm": 0.4835371971130371,
"learning_rate": 0.00019871698877161627,
"loss": 0.5091,
"mean_token_accuracy": 0.871647521853447,
"num_tokens": 160171.0,
"step": 39
},
{
"entropy": 0.4701843932271004,
"epoch": 0.08709853021230267,
"grad_norm": 0.48571643233299255,
"learning_rate": 0.0001986081258851487,
"loss": 0.4495,
"mean_token_accuracy": 0.8771228045225143,
"num_tokens": 163975.0,
"step": 40
},
{
"entropy": 0.4694196283817291,
"epoch": 0.08927599346761024,
"grad_norm": 0.4216046929359436,
"learning_rate": 0.00019849486655227532,
"loss": 0.4158,
"mean_token_accuracy": 0.8735549598932266,
"num_tokens": 168421.0,
"step": 41
},
{
"entropy": 0.4697120413184166,
"epoch": 0.0914534567229178,
"grad_norm": 0.3664827346801758,
"learning_rate": 0.000198377216392555,
"loss": 0.4231,
"mean_token_accuracy": 0.8784957528114319,
"num_tokens": 172395.0,
"step": 42
},
{
"entropy": 0.44586674869060516,
"epoch": 0.09363091997822537,
"grad_norm": 0.39455050230026245,
"learning_rate": 0.00019825518124340529,
"loss": 0.4166,
"mean_token_accuracy": 0.8799059689044952,
"num_tokens": 175967.0,
"step": 43
},
{
"entropy": 0.4293370470404625,
"epoch": 0.09580838323353294,
"grad_norm": 0.39706796407699585,
"learning_rate": 0.00019812876715981248,
"loss": 0.4522,
"mean_token_accuracy": 0.8723510503768921,
"num_tokens": 180153.0,
"step": 44
},
{
"entropy": 0.41641899943351746,
"epoch": 0.0979858464888405,
"grad_norm": 0.41735970973968506,
"learning_rate": 0.00019799798041403137,
"loss": 0.4436,
"mean_token_accuracy": 0.8725763112306595,
"num_tokens": 184161.0,
"step": 45
},
{
"entropy": 0.40215710550546646,
"epoch": 0.10016330974414807,
"grad_norm": 0.44639289379119873,
"learning_rate": 0.00019786282749527406,
"loss": 0.4289,
"mean_token_accuracy": 0.8803199082612991,
"num_tokens": 187869.0,
"step": 46
},
{
"entropy": 0.48222628980875015,
"epoch": 0.10234077299945564,
"grad_norm": 0.4197250306606293,
"learning_rate": 0.00019772331510938782,
"loss": 0.4861,
"mean_token_accuracy": 0.8618861585855484,
"num_tokens": 192020.0,
"step": 47
},
{
"entropy": 0.49629800766706467,
"epoch": 0.1045182362547632,
"grad_norm": 0.5031387209892273,
"learning_rate": 0.00019757945017852258,
"loss": 0.4775,
"mean_token_accuracy": 0.8681423515081406,
"num_tokens": 195514.0,
"step": 48
},
{
"entropy": 0.3977178856730461,
"epoch": 0.10669569951007077,
"grad_norm": 0.4578983783721924,
"learning_rate": 0.0001974312398407873,
"loss": 0.3673,
"mean_token_accuracy": 0.8914825022220612,
"num_tokens": 199234.0,
"step": 49
},
{
"entropy": 0.3965229466557503,
"epoch": 0.10887316276537834,
"grad_norm": 0.37602174282073975,
"learning_rate": 0.0001972786914498958,
"loss": 0.3953,
"mean_token_accuracy": 0.8783656060695648,
"num_tokens": 203760.0,
"step": 50
},
{
"entropy": 0.42161373794078827,
"epoch": 0.1110506260206859,
"grad_norm": 0.3125810921192169,
"learning_rate": 0.00019712181257480212,
"loss": 0.3754,
"mean_token_accuracy": 0.8832796663045883,
"num_tokens": 207439.0,
"step": 51
},
{
"entropy": 0.4191659912467003,
"epoch": 0.11322808927599347,
"grad_norm": 0.32242998480796814,
"learning_rate": 0.00019696061099932471,
"loss": 0.3861,
"mean_token_accuracy": 0.8820012956857681,
"num_tokens": 211708.0,
"step": 52
},
{
"entropy": 0.464703693985939,
"epoch": 0.11540555253130104,
"grad_norm": 0.4021685719490051,
"learning_rate": 0.00019679509472176032,
"loss": 0.4384,
"mean_token_accuracy": 0.8743875622749329,
"num_tokens": 215763.0,
"step": 53
},
{
"entropy": 0.4165603965520859,
"epoch": 0.1175830157866086,
"grad_norm": 0.3444255590438843,
"learning_rate": 0.00019662527195448722,
"loss": 0.3991,
"mean_token_accuracy": 0.88118776679039,
"num_tokens": 220090.0,
"step": 54
},
{
"entropy": 0.4068721905350685,
"epoch": 0.11976047904191617,
"grad_norm": 0.3705560564994812,
"learning_rate": 0.00019645115112355754,
"loss": 0.3707,
"mean_token_accuracy": 0.882274329662323,
"num_tokens": 223672.0,
"step": 55
},
{
"entropy": 0.3627975210547447,
"epoch": 0.12193794229722374,
"grad_norm": 0.37365177273750305,
"learning_rate": 0.00019627274086827948,
"loss": 0.36,
"mean_token_accuracy": 0.8874702304601669,
"num_tokens": 227497.0,
"step": 56
},
{
"entropy": 0.40359440445899963,
"epoch": 0.1241154055525313,
"grad_norm": 0.33996060490608215,
"learning_rate": 0.00019609005004078838,
"loss": 0.4253,
"mean_token_accuracy": 0.8732311725616455,
"num_tokens": 231293.0,
"step": 57
},
{
"entropy": 0.36641839146614075,
"epoch": 0.12629286880783885,
"grad_norm": 0.2762836515903473,
"learning_rate": 0.00019590308770560763,
"loss": 0.3485,
"mean_token_accuracy": 0.8926344960927963,
"num_tokens": 236001.0,
"step": 58
},
{
"entropy": 0.4077141284942627,
"epoch": 0.12847033206314643,
"grad_norm": 0.2915239930152893,
"learning_rate": 0.00019571186313919895,
"loss": 0.3942,
"mean_token_accuracy": 0.8783977180719376,
"num_tokens": 240264.0,
"step": 59
},
{
"entropy": 0.4022030830383301,
"epoch": 0.130647795318454,
"grad_norm": 0.3684654235839844,
"learning_rate": 0.00019551638582950213,
"loss": 0.412,
"mean_token_accuracy": 0.8735997825860977,
"num_tokens": 243854.0,
"step": 60
},
{
"entropy": 0.41812095791101456,
"epoch": 0.13282525857376157,
"grad_norm": 0.3383813500404358,
"learning_rate": 0.00019531666547546403,
"loss": 0.4302,
"mean_token_accuracy": 0.8795482665300369,
"num_tokens": 247268.0,
"step": 61
},
{
"entropy": 0.38665496557950974,
"epoch": 0.13500272182906914,
"grad_norm": 0.31561279296875,
"learning_rate": 0.0001951127119865578,
"loss": 0.3844,
"mean_token_accuracy": 0.8816228210926056,
"num_tokens": 251256.0,
"step": 62
},
{
"entropy": 0.4358583614230156,
"epoch": 0.1371801850843767,
"grad_norm": 0.3552601933479309,
"learning_rate": 0.00019490453548229075,
"loss": 0.4193,
"mean_token_accuracy": 0.8728261440992355,
"num_tokens": 255350.0,
"step": 63
},
{
"entropy": 0.40031400322914124,
"epoch": 0.13935764833968428,
"grad_norm": 0.30350831151008606,
"learning_rate": 0.00019469214629170246,
"loss": 0.4005,
"mean_token_accuracy": 0.8818740844726562,
"num_tokens": 259391.0,
"step": 64
},
{
"entropy": 0.3782212808728218,
"epoch": 0.14153511159499182,
"grad_norm": 0.2870739996433258,
"learning_rate": 0.00019447555495285247,
"loss": 0.3396,
"mean_token_accuracy": 0.8948279619216919,
"num_tokens": 263599.0,
"step": 65
},
{
"entropy": 0.41549866646528244,
"epoch": 0.1437125748502994,
"grad_norm": 0.2995204031467438,
"learning_rate": 0.00019425477221229694,
"loss": 0.394,
"mean_token_accuracy": 0.8853535056114197,
"num_tokens": 267514.0,
"step": 66
},
{
"entropy": 0.40607404708862305,
"epoch": 0.14589003810560697,
"grad_norm": 0.3016026020050049,
"learning_rate": 0.00019402980902455592,
"loss": 0.4006,
"mean_token_accuracy": 0.8783000707626343,
"num_tokens": 271156.0,
"step": 67
},
{
"entropy": 0.3719393089413643,
"epoch": 0.14806750136091454,
"grad_norm": 0.26128438115119934,
"learning_rate": 0.00019380067655156956,
"loss": 0.3537,
"mean_token_accuracy": 0.8965920209884644,
"num_tokens": 275317.0,
"step": 68
},
{
"entropy": 0.42157839983701706,
"epoch": 0.1502449646162221,
"grad_norm": 0.3250483572483063,
"learning_rate": 0.00019356738616214435,
"loss": 0.4115,
"mean_token_accuracy": 0.8846541047096252,
"num_tokens": 279424.0,
"step": 69
},
{
"entropy": 0.4183052033185959,
"epoch": 0.15242242787152968,
"grad_norm": 0.315361887216568,
"learning_rate": 0.00019332994943138906,
"loss": 0.4148,
"mean_token_accuracy": 0.8700041323900223,
"num_tokens": 283564.0,
"step": 70
},
{
"entropy": 0.40483858436346054,
"epoch": 0.15459989112683722,
"grad_norm": 0.31096142530441284,
"learning_rate": 0.00019308837814014038,
"loss": 0.3835,
"mean_token_accuracy": 0.8849562704563141,
"num_tokens": 287357.0,
"step": 71
},
{
"entropy": 0.39035435765981674,
"epoch": 0.1567773543821448,
"grad_norm": 0.3067997097969055,
"learning_rate": 0.0001928426842743784,
"loss": 0.3846,
"mean_token_accuracy": 0.8829791098833084,
"num_tokens": 291390.0,
"step": 72
},
{
"entropy": 0.3541962653398514,
"epoch": 0.15895481763745237,
"grad_norm": 0.27743661403656006,
"learning_rate": 0.000192592880024632,
"loss": 0.3279,
"mean_token_accuracy": 0.8986150324344635,
"num_tokens": 295446.0,
"step": 73
},
{
"entropy": 0.4067593812942505,
"epoch": 0.16113228089275994,
"grad_norm": 0.2917785346508026,
"learning_rate": 0.00019233897778537387,
"loss": 0.4056,
"mean_token_accuracy": 0.8775222897529602,
"num_tokens": 299884.0,
"step": 74
},
{
"entropy": 0.3865869492292404,
"epoch": 0.1633097441480675,
"grad_norm": 0.3175944685935974,
"learning_rate": 0.00019208099015440553,
"loss": 0.3947,
"mean_token_accuracy": 0.8831316977739334,
"num_tokens": 303679.0,
"step": 75
},
{
"entropy": 0.42061641067266464,
"epoch": 0.16548720740337508,
"grad_norm": 0.29020923376083374,
"learning_rate": 0.00019181892993223241,
"loss": 0.424,
"mean_token_accuracy": 0.8717161864042282,
"num_tokens": 308028.0,
"step": 76
},
{
"entropy": 0.3790237084031105,
"epoch": 0.16766467065868262,
"grad_norm": 0.28459441661834717,
"learning_rate": 0.00019155281012142857,
"loss": 0.3669,
"mean_token_accuracy": 0.8902580589056015,
"num_tokens": 312280.0,
"step": 77
},
{
"entropy": 0.4007532522082329,
"epoch": 0.1698421339139902,
"grad_norm": 0.2907998263835907,
"learning_rate": 0.00019128264392599166,
"loss": 0.421,
"mean_token_accuracy": 0.8734158575534821,
"num_tokens": 316050.0,
"step": 78
},
{
"entropy": 0.38431502133607864,
"epoch": 0.17201959716929777,
"grad_norm": 0.2705579102039337,
"learning_rate": 0.00019100844475068777,
"loss": 0.3687,
"mean_token_accuracy": 0.8934948295354843,
"num_tokens": 319866.0,
"step": 79
},
{
"entropy": 0.4128147065639496,
"epoch": 0.17419706042460534,
"grad_norm": 0.3151399493217468,
"learning_rate": 0.0001907302262003863,
"loss": 0.3829,
"mean_token_accuracy": 0.8834633827209473,
"num_tokens": 323982.0,
"step": 80
},
{
"entropy": 0.4086031913757324,
"epoch": 0.1763745236799129,
"grad_norm": 0.3054238557815552,
"learning_rate": 0.00019044800207938483,
"loss": 0.3987,
"mean_token_accuracy": 0.8847066015005112,
"num_tokens": 327984.0,
"step": 81
},
{
"entropy": 0.3883258253335953,
"epoch": 0.17855198693522048,
"grad_norm": 0.29092952609062195,
"learning_rate": 0.00019016178639072448,
"loss": 0.3799,
"mean_token_accuracy": 0.8958835899829865,
"num_tokens": 331502.0,
"step": 82
},
{
"entropy": 0.41453375667333603,
"epoch": 0.18072945019052802,
"grad_norm": 0.279079407453537,
"learning_rate": 0.0001898715933354948,
"loss": 0.4303,
"mean_token_accuracy": 0.879971370100975,
"num_tokens": 335369.0,
"step": 83
},
{
"entropy": 0.395871065557003,
"epoch": 0.1829069134458356,
"grad_norm": 0.2992061972618103,
"learning_rate": 0.0001895774373121294,
"loss": 0.3933,
"mean_token_accuracy": 0.8855740427970886,
"num_tokens": 339407.0,
"step": 84
},
{
"entropy": 0.352156363427639,
"epoch": 0.18508437670114317,
"grad_norm": 0.29319193959236145,
"learning_rate": 0.00018927933291569142,
"loss": 0.3458,
"mean_token_accuracy": 0.8971658796072006,
"num_tokens": 343524.0,
"step": 85
},
{
"entropy": 0.3487248420715332,
"epoch": 0.18726183995645074,
"grad_norm": 0.2763819694519043,
"learning_rate": 0.00018897729493714936,
"loss": 0.3259,
"mean_token_accuracy": 0.8960808515548706,
"num_tokens": 347925.0,
"step": 86
},
{
"entropy": 0.4102029874920845,
"epoch": 0.1894393032117583,
"grad_norm": 0.2646510601043701,
"learning_rate": 0.00018867133836264333,
"loss": 0.3945,
"mean_token_accuracy": 0.8839164674282074,
"num_tokens": 352250.0,
"step": 87
},
{
"entropy": 0.3762153908610344,
"epoch": 0.19161676646706588,
"grad_norm": 0.3275756239891052,
"learning_rate": 0.00018836147837274128,
"loss": 0.3588,
"mean_token_accuracy": 0.893315777182579,
"num_tokens": 356538.0,
"step": 88
},
{
"entropy": 0.3680166006088257,
"epoch": 0.19379422972237342,
"grad_norm": 0.3026663362979889,
"learning_rate": 0.00018804773034168605,
"loss": 0.346,
"mean_token_accuracy": 0.8997195810079575,
"num_tokens": 360352.0,
"step": 89
},
{
"entropy": 0.3681929111480713,
"epoch": 0.195971692977681,
"grad_norm": 0.27409690618515015,
"learning_rate": 0.00018773010983663235,
"loss": 0.3619,
"mean_token_accuracy": 0.8918221592903137,
"num_tokens": 364359.0,
"step": 90
},
{
"entropy": 0.41026338934898376,
"epoch": 0.19814915623298857,
"grad_norm": 0.27450209856033325,
"learning_rate": 0.00018740863261687438,
"loss": 0.3772,
"mean_token_accuracy": 0.885251596570015,
"num_tokens": 368184.0,
"step": 91
},
{
"entropy": 0.41991668939590454,
"epoch": 0.20032661948829614,
"grad_norm": 0.3204193413257599,
"learning_rate": 0.000187083314633064,
"loss": 0.4387,
"mean_token_accuracy": 0.877353847026825,
"num_tokens": 372188.0,
"step": 92
},
{
"entropy": 0.3829573169350624,
"epoch": 0.2025040827436037,
"grad_norm": 0.2948894500732422,
"learning_rate": 0.00018675417202641928,
"loss": 0.3713,
"mean_token_accuracy": 0.8871684223413467,
"num_tokens": 376175.0,
"step": 93
},
{
"entropy": 0.37284964323043823,
"epoch": 0.20468154599891128,
"grad_norm": 0.3094096779823303,
"learning_rate": 0.00018642122112792352,
"loss": 0.3704,
"mean_token_accuracy": 0.8872140049934387,
"num_tokens": 380212.0,
"step": 94
},
{
"entropy": 0.3658677488565445,
"epoch": 0.20685900925421882,
"grad_norm": 0.2979802191257477,
"learning_rate": 0.00018608447845751521,
"loss": 0.3491,
"mean_token_accuracy": 0.8897504657506943,
"num_tokens": 384295.0,
"step": 95
},
{
"entropy": 0.36876438558101654,
"epoch": 0.2090364725095264,
"grad_norm": 0.2677754759788513,
"learning_rate": 0.00018574396072326807,
"loss": 0.3441,
"mean_token_accuracy": 0.894922137260437,
"num_tokens": 388732.0,
"step": 96
},
{
"entropy": 0.3612924814224243,
"epoch": 0.21121393576483397,
"grad_norm": 0.2736094892024994,
"learning_rate": 0.0001853996848205622,
"loss": 0.3723,
"mean_token_accuracy": 0.8909705579280853,
"num_tokens": 392764.0,
"step": 97
},
{
"entropy": 0.3905804604291916,
"epoch": 0.21339139902014154,
"grad_norm": 0.2624414265155792,
"learning_rate": 0.0001850516678312458,
"loss": 0.3891,
"mean_token_accuracy": 0.8835895210504532,
"num_tokens": 397014.0,
"step": 98
},
{
"entropy": 0.3591335415840149,
"epoch": 0.2155688622754491,
"grad_norm": 0.27455052733421326,
"learning_rate": 0.0001846999270227876,
"loss": 0.3285,
"mean_token_accuracy": 0.9014366716146469,
"num_tokens": 400931.0,
"step": 99
},
{
"entropy": 0.3889941945672035,
"epoch": 0.21774632553075668,
"grad_norm": 0.3075306713581085,
"learning_rate": 0.00018434447984742012,
"loss": 0.3748,
"mean_token_accuracy": 0.8902212232351303,
"num_tokens": 404953.0,
"step": 100
},
{
"entropy": 0.40706127136945724,
"epoch": 0.21992378878606422,
"grad_norm": 0.291089802980423,
"learning_rate": 0.00018398534394127366,
"loss": 0.3842,
"mean_token_accuracy": 0.8786927759647369,
"num_tokens": 408846.0,
"step": 101
},
{
"entropy": 0.3662910833954811,
"epoch": 0.2221012520413718,
"grad_norm": 0.2830312252044678,
"learning_rate": 0.00018362253712350131,
"loss": 0.3651,
"mean_token_accuracy": 0.8856998383998871,
"num_tokens": 413058.0,
"step": 102
},
{
"entropy": 0.3981722518801689,
"epoch": 0.22427871529667937,
"grad_norm": 0.26717105507850647,
"learning_rate": 0.00018325607739539497,
"loss": 0.4013,
"mean_token_accuracy": 0.881842851638794,
"num_tokens": 417404.0,
"step": 103
},
{
"entropy": 0.38402143120765686,
"epoch": 0.22645617855198694,
"grad_norm": 0.26284581422805786,
"learning_rate": 0.00018288598293949185,
"loss": 0.3933,
"mean_token_accuracy": 0.8858134895563126,
"num_tokens": 421886.0,
"step": 104
},
{
"entropy": 0.35189586132764816,
"epoch": 0.2286336418072945,
"grad_norm": 0.2981458604335785,
"learning_rate": 0.00018251227211867264,
"loss": 0.3779,
"mean_token_accuracy": 0.8904144316911697,
"num_tokens": 426069.0,
"step": 105
},
{
"entropy": 0.3991141989827156,
"epoch": 0.23081110506260208,
"grad_norm": 0.30855289101600647,
"learning_rate": 0.0001821349634752502,
"loss": 0.4118,
"mean_token_accuracy": 0.875004380941391,
"num_tokens": 430019.0,
"step": 106
},
{
"entropy": 0.3846806064248085,
"epoch": 0.23298856831790962,
"grad_norm": 0.25153040885925293,
"learning_rate": 0.00018175407573004974,
"loss": 0.3944,
"mean_token_accuracy": 0.8794781714677811,
"num_tokens": 434787.0,
"step": 107
},
{
"entropy": 0.38610684871673584,
"epoch": 0.2351660315732172,
"grad_norm": 0.25855541229248047,
"learning_rate": 0.00018136962778147965,
"loss": 0.3625,
"mean_token_accuracy": 0.895257756114006,
"num_tokens": 438762.0,
"step": 108
},
{
"entropy": 0.38023480772972107,
"epoch": 0.23734349482852476,
"grad_norm": 0.26064959168434143,
"learning_rate": 0.00018098163870459419,
"loss": 0.3508,
"mean_token_accuracy": 0.8982452154159546,
"num_tokens": 442358.0,
"step": 109
},
{
"entropy": 0.38109494745731354,
"epoch": 0.23952095808383234,
"grad_norm": 0.2560478746891022,
"learning_rate": 0.00018059012775014673,
"loss": 0.3316,
"mean_token_accuracy": 0.8920884728431702,
"num_tokens": 446375.0,
"step": 110
},
{
"entropy": 0.40175357460975647,
"epoch": 0.2416984213391399,
"grad_norm": 0.2690741717815399,
"learning_rate": 0.00018019511434363479,
"loss": 0.3694,
"mean_token_accuracy": 0.8843608647584915,
"num_tokens": 450240.0,
"step": 111
},
{
"entropy": 0.4437231123447418,
"epoch": 0.24387588459444748,
"grad_norm": 0.3393898606300354,
"learning_rate": 0.00017979661808433615,
"loss": 0.4375,
"mean_token_accuracy": 0.8717398643493652,
"num_tokens": 454162.0,
"step": 112
},
{
"entropy": 0.39301927387714386,
"epoch": 0.24605334784975502,
"grad_norm": 0.26305022835731506,
"learning_rate": 0.00017939465874433633,
"loss": 0.3915,
"mean_token_accuracy": 0.8859032839536667,
"num_tokens": 458075.0,
"step": 113
},
{
"entropy": 0.37585896253585815,
"epoch": 0.2482308111050626,
"grad_norm": 0.2808936536312103,
"learning_rate": 0.0001789892562675477,
"loss": 0.3808,
"mean_token_accuracy": 0.8814007937908173,
"num_tokens": 462440.0,
"step": 114
},
{
"entropy": 0.35389212518930435,
"epoch": 0.25040827436037016,
"grad_norm": 0.2638992667198181,
"learning_rate": 0.0001785804307687199,
"loss": 0.3669,
"mean_token_accuracy": 0.8885058760643005,
"num_tokens": 466896.0,
"step": 115
},
{
"entropy": 0.32084520161151886,
"epoch": 0.2525857376156777,
"grad_norm": 0.2875458896160126,
"learning_rate": 0.00017816820253244156,
"loss": 0.3393,
"mean_token_accuracy": 0.8992051929235458,
"num_tokens": 470737.0,
"step": 116
},
{
"entropy": 0.37875620275735855,
"epoch": 0.2547632008709853,
"grad_norm": 0.3010421693325043,
"learning_rate": 0.0001777525920121343,
"loss": 0.3771,
"mean_token_accuracy": 0.8866951763629913,
"num_tokens": 474704.0,
"step": 117
},
{
"entropy": 0.3695053979754448,
"epoch": 0.25694066412629285,
"grad_norm": 0.28365740180015564,
"learning_rate": 0.0001773336198290375,
"loss": 0.3606,
"mean_token_accuracy": 0.8899102210998535,
"num_tokens": 478684.0,
"step": 118
},
{
"entropy": 0.37022798508405685,
"epoch": 0.25911812738160045,
"grad_norm": 0.2810768187046051,
"learning_rate": 0.00017691130677118533,
"loss": 0.371,
"mean_token_accuracy": 0.8898769170045853,
"num_tokens": 482795.0,
"step": 119
},
{
"entropy": 0.3846744894981384,
"epoch": 0.261295590636908,
"grad_norm": 0.2767440974712372,
"learning_rate": 0.00017648567379237524,
"loss": 0.3858,
"mean_token_accuracy": 0.8894098848104477,
"num_tokens": 486910.0,
"step": 120
},
{
"entropy": 0.36647915840148926,
"epoch": 0.2634730538922156,
"grad_norm": 0.29192766547203064,
"learning_rate": 0.00017605674201112844,
"loss": 0.3532,
"mean_token_accuracy": 0.8931601047515869,
"num_tokens": 490909.0,
"step": 121
},
{
"entropy": 0.3607020005583763,
"epoch": 0.26565051714752314,
"grad_norm": 0.27455756068229675,
"learning_rate": 0.00017562453270964184,
"loss": 0.3376,
"mean_token_accuracy": 0.8977847099304199,
"num_tokens": 494900.0,
"step": 122
},
{
"entropy": 0.39875783771276474,
"epoch": 0.2678279804028307,
"grad_norm": 0.29144948720932007,
"learning_rate": 0.0001751890673327323,
"loss": 0.3625,
"mean_token_accuracy": 0.8899316191673279,
"num_tokens": 498621.0,
"step": 123
},
{
"entropy": 0.388169527053833,
"epoch": 0.2700054436581383,
"grad_norm": 0.28327831625938416,
"learning_rate": 0.00017475036748677253,
"loss": 0.368,
"mean_token_accuracy": 0.8881956189870834,
"num_tokens": 502604.0,
"step": 124
},
{
"entropy": 0.42279627174139023,
"epoch": 0.2721829069134458,
"grad_norm": 0.2637234330177307,
"learning_rate": 0.00017430845493861903,
"loss": 0.4163,
"mean_token_accuracy": 0.8793482929468155,
"num_tokens": 506851.0,
"step": 125
},
{
"entropy": 0.3659377843141556,
"epoch": 0.2743603701687534,
"grad_norm": 0.2649920582771301,
"learning_rate": 0.00017386335161453204,
"loss": 0.3592,
"mean_token_accuracy": 0.8870955407619476,
"num_tokens": 511029.0,
"step": 126
},
{
"entropy": 0.3424355015158653,
"epoch": 0.27653783342406096,
"grad_norm": 0.24584396183490753,
"learning_rate": 0.00017341507959908788,
"loss": 0.3212,
"mean_token_accuracy": 0.8989846706390381,
"num_tokens": 514975.0,
"step": 127
},
{
"entropy": 0.38080035150051117,
"epoch": 0.27871529667936856,
"grad_norm": 0.2918618321418762,
"learning_rate": 0.00017296366113408283,
"loss": 0.3836,
"mean_token_accuracy": 0.8840546309947968,
"num_tokens": 518603.0,
"step": 128
},
{
"entropy": 0.37054024636745453,
"epoch": 0.2808927599346761,
"grad_norm": 0.2792854309082031,
"learning_rate": 0.00017250911861742984,
"loss": 0.383,
"mean_token_accuracy": 0.8847608417272568,
"num_tokens": 522974.0,
"step": 129
},
{
"entropy": 0.4149508401751518,
"epoch": 0.28307022318998365,
"grad_norm": 0.2900242805480957,
"learning_rate": 0.00017205147460204708,
"loss": 0.4176,
"mean_token_accuracy": 0.8743131309747696,
"num_tokens": 527053.0,
"step": 130
},
{
"entropy": 0.3568470776081085,
"epoch": 0.28524768644529125,
"grad_norm": 0.2806275188922882,
"learning_rate": 0.00017159075179473904,
"loss": 0.3506,
"mean_token_accuracy": 0.8944987952709198,
"num_tokens": 531165.0,
"step": 131
},
{
"entropy": 0.3553621917963028,
"epoch": 0.2874251497005988,
"grad_norm": 0.25992849469184875,
"learning_rate": 0.00017112697305506972,
"loss": 0.3473,
"mean_token_accuracy": 0.8974603414535522,
"num_tokens": 535268.0,
"step": 132
},
{
"entropy": 0.350556381046772,
"epoch": 0.2896026129559064,
"grad_norm": 0.255686491727829,
"learning_rate": 0.00017066016139422868,
"loss": 0.3428,
"mean_token_accuracy": 0.8938136249780655,
"num_tokens": 539608.0,
"step": 133
},
{
"entropy": 0.3975898027420044,
"epoch": 0.29178007621121393,
"grad_norm": 0.2862681746482849,
"learning_rate": 0.00017019033997388893,
"loss": 0.3852,
"mean_token_accuracy": 0.8919837325811386,
"num_tokens": 543509.0,
"step": 134
},
{
"entropy": 0.3602987751364708,
"epoch": 0.2939575394665215,
"grad_norm": 0.2506209909915924,
"learning_rate": 0.00016971753210505815,
"loss": 0.3512,
"mean_token_accuracy": 0.8999500423669815,
"num_tokens": 548201.0,
"step": 135
},
{
"entropy": 0.36172477155923843,
"epoch": 0.2961350027218291,
"grad_norm": 0.24992506206035614,
"learning_rate": 0.00016924176124692171,
"loss": 0.3296,
"mean_token_accuracy": 0.9002155065536499,
"num_tokens": 552588.0,
"step": 136
},
{
"entropy": 0.39114704728126526,
"epoch": 0.2983124659771366,
"grad_norm": 0.26535582542419434,
"learning_rate": 0.00016876305100567898,
"loss": 0.3606,
"mean_token_accuracy": 0.8913624733686447,
"num_tokens": 556684.0,
"step": 137
},
{
"entropy": 0.3595954030752182,
"epoch": 0.3004899292324442,
"grad_norm": 0.2526366114616394,
"learning_rate": 0.0001682814251333718,
"loss": 0.3524,
"mean_token_accuracy": 0.8964285999536514,
"num_tokens": 560872.0,
"step": 138
},
{
"entropy": 0.3456057384610176,
"epoch": 0.30266739248775176,
"grad_norm": 0.2838667631149292,
"learning_rate": 0.0001677969075267062,
"loss": 0.3598,
"mean_token_accuracy": 0.8893538117408752,
"num_tokens": 565414.0,
"step": 139
},
{
"entropy": 0.3304522782564163,
"epoch": 0.30484485574305936,
"grad_norm": 0.2537218928337097,
"learning_rate": 0.00016730952222586672,
"loss": 0.3252,
"mean_token_accuracy": 0.9008310884237289,
"num_tokens": 569961.0,
"step": 140
},
{
"entropy": 0.37971338629722595,
"epoch": 0.3070223189983669,
"grad_norm": 0.2846769392490387,
"learning_rate": 0.00016681929341332333,
"loss": 0.3812,
"mean_token_accuracy": 0.8877308219671249,
"num_tokens": 573882.0,
"step": 141
},
{
"entropy": 0.32383736968040466,
"epoch": 0.30919978225367445,
"grad_norm": 0.30265504121780396,
"learning_rate": 0.00016632624541263193,
"loss": 0.3259,
"mean_token_accuracy": 0.8970090597867966,
"num_tokens": 577860.0,
"step": 142
},
{
"entropy": 0.4320111721754074,
"epoch": 0.31137724550898205,
"grad_norm": 0.2903831899166107,
"learning_rate": 0.0001658304026872274,
"loss": 0.4118,
"mean_token_accuracy": 0.8787370920181274,
"num_tokens": 581333.0,
"step": 143
},
{
"entropy": 0.372535839676857,
"epoch": 0.3135547087642896,
"grad_norm": 0.26929277181625366,
"learning_rate": 0.00016533178983920964,
"loss": 0.3555,
"mean_token_accuracy": 0.8883365392684937,
"num_tokens": 585459.0,
"step": 144
},
{
"entropy": 0.38039466738700867,
"epoch": 0.3157321720195972,
"grad_norm": 0.2679445743560791,
"learning_rate": 0.00016483043160812295,
"loss": 0.3633,
"mean_token_accuracy": 0.8902519345283508,
"num_tokens": 589257.0,
"step": 145
},
{
"entropy": 0.42324574291706085,
"epoch": 0.31790963527490473,
"grad_norm": 0.2745194137096405,
"learning_rate": 0.0001643263528697288,
"loss": 0.4154,
"mean_token_accuracy": 0.878746971487999,
"num_tokens": 593457.0,
"step": 146
},
{
"entropy": 0.46310587227344513,
"epoch": 0.3200870985302123,
"grad_norm": 0.2937363088130951,
"learning_rate": 0.0001638195786347712,
"loss": 0.4564,
"mean_token_accuracy": 0.8730504065752029,
"num_tokens": 596979.0,
"step": 147
},
{
"entropy": 0.3750259429216385,
"epoch": 0.3222645617855199,
"grad_norm": 0.24124816060066223,
"learning_rate": 0.00016331013404773597,
"loss": 0.3568,
"mean_token_accuracy": 0.8933057188987732,
"num_tokens": 601388.0,
"step": 148
},
{
"entropy": 0.37991973757743835,
"epoch": 0.3244420250408274,
"grad_norm": 0.27898603677749634,
"learning_rate": 0.00016279804438560304,
"loss": 0.3518,
"mean_token_accuracy": 0.8888091742992401,
"num_tokens": 605267.0,
"step": 149
},
{
"entropy": 0.38875921070575714,
"epoch": 0.326619488296135,
"grad_norm": 0.2823559641838074,
"learning_rate": 0.00016228333505659246,
"loss": 0.376,
"mean_token_accuracy": 0.8856324106454849,
"num_tokens": 609434.0,
"step": 150
},
{
"entropy": 0.3876258060336113,
"epoch": 0.32879695155144256,
"grad_norm": 0.2898506224155426,
"learning_rate": 0.00016176603159890346,
"loss": 0.376,
"mean_token_accuracy": 0.8831023424863815,
"num_tokens": 613396.0,
"step": 151
},
{
"entropy": 0.3707014173269272,
"epoch": 0.33097441480675016,
"grad_norm": 0.2642916142940521,
"learning_rate": 0.00016124615967944762,
"loss": 0.3752,
"mean_token_accuracy": 0.8911104500293732,
"num_tokens": 617399.0,
"step": 152
},
{
"entropy": 0.3736526593565941,
"epoch": 0.3331518780620577,
"grad_norm": 0.3004290461540222,
"learning_rate": 0.00016072374509257516,
"loss": 0.3808,
"mean_token_accuracy": 0.8887975662946701,
"num_tokens": 621104.0,
"step": 153
},
{
"entropy": 0.35118088871240616,
"epoch": 0.33532934131736525,
"grad_norm": 0.26038020849227905,
"learning_rate": 0.0001601988137587952,
"loss": 0.3382,
"mean_token_accuracy": 0.8998311161994934,
"num_tokens": 625151.0,
"step": 154
},
{
"entropy": 0.38535889238119125,
"epoch": 0.33750680457267285,
"grad_norm": 0.2737407088279724,
"learning_rate": 0.00015967139172348954,
"loss": 0.3913,
"mean_token_accuracy": 0.8854628801345825,
"num_tokens": 628964.0,
"step": 155
},
{
"entropy": 0.38133371621370316,
"epoch": 0.3396842678279804,
"grad_norm": 0.27977254986763,
"learning_rate": 0.00015914150515562055,
"loss": 0.3794,
"mean_token_accuracy": 0.8869093209505081,
"num_tokens": 632846.0,
"step": 156
},
{
"entropy": 0.37492088973522186,
"epoch": 0.341861731083288,
"grad_norm": 0.2831854224205017,
"learning_rate": 0.00015860918034643276,
"loss": 0.355,
"mean_token_accuracy": 0.8947048038244247,
"num_tokens": 636601.0,
"step": 157
},
{
"entropy": 0.4035057872533798,
"epoch": 0.34403919433859553,
"grad_norm": 0.37472277879714966,
"learning_rate": 0.00015807444370814815,
"loss": 0.3954,
"mean_token_accuracy": 0.8825927823781967,
"num_tokens": 640518.0,
"step": 158
},
{
"entropy": 0.34154055267572403,
"epoch": 0.3462166575939031,
"grad_norm": 0.27869144082069397,
"learning_rate": 0.00015753732177265582,
"loss": 0.3376,
"mean_token_accuracy": 0.8913106769323349,
"num_tokens": 644858.0,
"step": 159
},
{
"entropy": 0.41696153581142426,
"epoch": 0.3483941208492107,
"grad_norm": 0.291029155254364,
"learning_rate": 0.00015699784119019554,
"loss": 0.3964,
"mean_token_accuracy": 0.8756668865680695,
"num_tokens": 648735.0,
"step": 160
},
{
"entropy": 0.3924735262989998,
"epoch": 0.3505715841045182,
"grad_norm": 0.28552576899528503,
"learning_rate": 0.00015645602872803554,
"loss": 0.3852,
"mean_token_accuracy": 0.8868783414363861,
"num_tokens": 652408.0,
"step": 161
},
{
"entropy": 0.34768833965063095,
"epoch": 0.3527490473598258,
"grad_norm": 0.2506498098373413,
"learning_rate": 0.00015591191126914424,
"loss": 0.3351,
"mean_token_accuracy": 0.8980260044336319,
"num_tokens": 656844.0,
"step": 162
},
{
"entropy": 0.3891329765319824,
"epoch": 0.35492651061513336,
"grad_norm": 0.30480027198791504,
"learning_rate": 0.0001553655158108565,
"loss": 0.4034,
"mean_token_accuracy": 0.8790914118289948,
"num_tokens": 661184.0,
"step": 163
},
{
"entropy": 0.4067026600241661,
"epoch": 0.35710397387044096,
"grad_norm": 0.27617979049682617,
"learning_rate": 0.00015481686946353413,
"loss": 0.4081,
"mean_token_accuracy": 0.8769482225179672,
"num_tokens": 665163.0,
"step": 164
},
{
"entropy": 0.4310021921992302,
"epoch": 0.3592814371257485,
"grad_norm": 0.2954219877719879,
"learning_rate": 0.00015426599944922062,
"loss": 0.4193,
"mean_token_accuracy": 0.8807303011417389,
"num_tokens": 669177.0,
"step": 165
},
{
"entropy": 0.37181543558835983,
"epoch": 0.36145890038105605,
"grad_norm": 0.2674584984779358,
"learning_rate": 0.0001537129331002907,
"loss": 0.3423,
"mean_token_accuracy": 0.8933178037405014,
"num_tokens": 672660.0,
"step": 166
},
{
"entropy": 0.36294087767601013,
"epoch": 0.36363636363636365,
"grad_norm": 0.2539677321910858,
"learning_rate": 0.00015315769785809394,
"loss": 0.3419,
"mean_token_accuracy": 0.8953043073415756,
"num_tokens": 676937.0,
"step": 167
},
{
"entropy": 0.36527111381292343,
"epoch": 0.3658138268916712,
"grad_norm": 0.279691219329834,
"learning_rate": 0.0001526003212715934,
"loss": 0.3689,
"mean_token_accuracy": 0.8915591537952423,
"num_tokens": 680798.0,
"step": 168
},
{
"entropy": 0.32713668793439865,
"epoch": 0.3679912901469788,
"grad_norm": 0.2610296308994293,
"learning_rate": 0.00015204083099599862,
"loss": 0.3398,
"mean_token_accuracy": 0.8963142186403275,
"num_tokens": 685386.0,
"step": 169
},
{
"entropy": 0.35941240191459656,
"epoch": 0.37016875340228633,
"grad_norm": 0.26744726300239563,
"learning_rate": 0.00015147925479139357,
"loss": 0.3543,
"mean_token_accuracy": 0.8914755284786224,
"num_tokens": 689455.0,
"step": 170
},
{
"entropy": 0.3640653118491173,
"epoch": 0.3723462166575939,
"grad_norm": 0.2773352861404419,
"learning_rate": 0.00015091562052135912,
"loss": 0.3822,
"mean_token_accuracy": 0.8882244229316711,
"num_tokens": 693956.0,
"step": 171
},
{
"entropy": 0.37736089527606964,
"epoch": 0.3745236799129015,
"grad_norm": 0.2925175130367279,
"learning_rate": 0.00015034995615159074,
"loss": 0.3628,
"mean_token_accuracy": 0.889089897274971,
"num_tokens": 697863.0,
"step": 172
},
{
"entropy": 0.37925824522972107,
"epoch": 0.376701143168209,
"grad_norm": 0.2618020474910736,
"learning_rate": 0.00014978228974851077,
"loss": 0.3624,
"mean_token_accuracy": 0.8942320197820663,
"num_tokens": 701537.0,
"step": 173
},
{
"entropy": 0.34706228971481323,
"epoch": 0.3788786064235166,
"grad_norm": 0.2923741340637207,
"learning_rate": 0.000149212649477876,
"loss": 0.3541,
"mean_token_accuracy": 0.8954867422580719,
"num_tokens": 705253.0,
"step": 174
},
{
"entropy": 0.3569258749485016,
"epoch": 0.38105606967882416,
"grad_norm": 0.2816322147846222,
"learning_rate": 0.00014864106360337992,
"loss": 0.357,
"mean_token_accuracy": 0.8935216814279556,
"num_tokens": 709276.0,
"step": 175
},
{
"entropy": 0.35546237230300903,
"epoch": 0.38323353293413176,
"grad_norm": 0.2701316773891449,
"learning_rate": 0.00014806756048525073,
"loss": 0.3423,
"mean_token_accuracy": 0.9047370553016663,
"num_tokens": 713489.0,
"step": 176
},
{
"entropy": 0.38647014647722244,
"epoch": 0.3854109961894393,
"grad_norm": 0.2974873185157776,
"learning_rate": 0.00014749216857884388,
"loss": 0.3698,
"mean_token_accuracy": 0.8884487450122833,
"num_tokens": 717582.0,
"step": 177
},
{
"entropy": 0.41117021441459656,
"epoch": 0.38758845944474685,
"grad_norm": 0.46910688281059265,
"learning_rate": 0.0001469149164332304,
"loss": 0.3913,
"mean_token_accuracy": 0.8818454891443253,
"num_tokens": 721522.0,
"step": 178
},
{
"entropy": 0.3503909111022949,
"epoch": 0.38976592270005445,
"grad_norm": 0.24447594583034515,
"learning_rate": 0.00014633583268978037,
"loss": 0.3159,
"mean_token_accuracy": 0.9022247046232224,
"num_tokens": 725345.0,
"step": 179
},
{
"entropy": 0.34674597531557083,
"epoch": 0.391943385955362,
"grad_norm": 0.25831112265586853,
"learning_rate": 0.00014575494608074166,
"loss": 0.3403,
"mean_token_accuracy": 0.8952628076076508,
"num_tokens": 729377.0,
"step": 180
},
{
"entropy": 0.32907338812947273,
"epoch": 0.3941208492106696,
"grad_norm": 0.25881391763687134,
"learning_rate": 0.0001451722854278146,
"loss": 0.3039,
"mean_token_accuracy": 0.9026439040899277,
"num_tokens": 733265.0,
"step": 181
},
{
"entropy": 0.35795633494853973,
"epoch": 0.39629831246597713,
"grad_norm": 0.28063708543777466,
"learning_rate": 0.00014458787964072165,
"loss": 0.3381,
"mean_token_accuracy": 0.8983410447835922,
"num_tokens": 737131.0,
"step": 182
},
{
"entropy": 0.33193762600421906,
"epoch": 0.39847577572128473,
"grad_norm": 0.29431116580963135,
"learning_rate": 0.00014400175771577326,
"loss": 0.3225,
"mean_token_accuracy": 0.9057250618934631,
"num_tokens": 740821.0,
"step": 183
},
{
"entropy": 0.31135137379169464,
"epoch": 0.4006532389765923,
"grad_norm": 0.29750552773475647,
"learning_rate": 0.00014341394873442897,
"loss": 0.3264,
"mean_token_accuracy": 0.8973560929298401,
"num_tokens": 744896.0,
"step": 184
},
{
"entropy": 0.3354290798306465,
"epoch": 0.4028307022318998,
"grad_norm": 0.27261385321617126,
"learning_rate": 0.0001428244818618546,
"loss": 0.3427,
"mean_token_accuracy": 0.8985736221075058,
"num_tokens": 748839.0,
"step": 185
},
{
"entropy": 0.3166900649666786,
"epoch": 0.4050081654872074,
"grad_norm": 0.27092301845550537,
"learning_rate": 0.0001422333863454751,
"loss": 0.3087,
"mean_token_accuracy": 0.9003172963857651,
"num_tokens": 752819.0,
"step": 186
},
{
"entropy": 0.3550329655408859,
"epoch": 0.40718562874251496,
"grad_norm": 0.27660685777664185,
"learning_rate": 0.0001416406915135235,
"loss": 0.3544,
"mean_token_accuracy": 0.8941550552845001,
"num_tokens": 756769.0,
"step": 187
},
{
"entropy": 0.3845446854829788,
"epoch": 0.40936309199782256,
"grad_norm": 0.3029703199863434,
"learning_rate": 0.00014104642677358547,
"loss": 0.3864,
"mean_token_accuracy": 0.8840687274932861,
"num_tokens": 760466.0,
"step": 188
},
{
"entropy": 0.3692278042435646,
"epoch": 0.4115405552531301,
"grad_norm": 0.2795009911060333,
"learning_rate": 0.00014045062161114065,
"loss": 0.3618,
"mean_token_accuracy": 0.8954125195741653,
"num_tokens": 764627.0,
"step": 189
},
{
"entropy": 0.34045620262622833,
"epoch": 0.41371801850843765,
"grad_norm": 0.2698828876018524,
"learning_rate": 0.00013985330558809918,
"loss": 0.3225,
"mean_token_accuracy": 0.8965429812669754,
"num_tokens": 768901.0,
"step": 190
},
{
"entropy": 0.3410160765051842,
"epoch": 0.41589548176374525,
"grad_norm": 0.25038790702819824,
"learning_rate": 0.00013925450834133542,
"loss": 0.3253,
"mean_token_accuracy": 0.9037521332502365,
"num_tokens": 773052.0,
"step": 191
},
{
"entropy": 0.36402270942926407,
"epoch": 0.4180729450190528,
"grad_norm": 0.2695653736591339,
"learning_rate": 0.00013865425958121697,
"loss": 0.3614,
"mean_token_accuracy": 0.8942222446203232,
"num_tokens": 776826.0,
"step": 192
},
{
"entropy": 0.31327100098133087,
"epoch": 0.4202504082743604,
"grad_norm": 0.2406344711780548,
"learning_rate": 0.00013805258909013095,
"loss": 0.2927,
"mean_token_accuracy": 0.9095935225486755,
"num_tokens": 781250.0,
"step": 193
},
{
"entropy": 0.37202536314725876,
"epoch": 0.42242787152966793,
"grad_norm": 0.30606889724731445,
"learning_rate": 0.00013744952672100613,
"loss": 0.3924,
"mean_token_accuracy": 0.8838685899972916,
"num_tokens": 785238.0,
"step": 194
},
{
"entropy": 0.3558414503931999,
"epoch": 0.42460533478497553,
"grad_norm": 0.24589793384075165,
"learning_rate": 0.00013684510239583166,
"loss": 0.344,
"mean_token_accuracy": 0.896059587597847,
"num_tokens": 789796.0,
"step": 195
},
{
"entropy": 0.37479735910892487,
"epoch": 0.4267827980402831,
"grad_norm": 0.25714266300201416,
"learning_rate": 0.0001362393461041726,
"loss": 0.3708,
"mean_token_accuracy": 0.8902730643749237,
"num_tokens": 794040.0,
"step": 196
},
{
"entropy": 0.356051467359066,
"epoch": 0.4289602612955906,
"grad_norm": 0.27870944142341614,
"learning_rate": 0.00013563228790168178,
"loss": 0.3551,
"mean_token_accuracy": 0.8951977044343948,
"num_tokens": 798230.0,
"step": 197
},
{
"entropy": 0.3533203676342964,
"epoch": 0.4311377245508982,
"grad_norm": 0.2748214602470398,
"learning_rate": 0.00013502395790860862,
"loss": 0.3345,
"mean_token_accuracy": 0.8976791948080063,
"num_tokens": 802137.0,
"step": 198
},
{
"entropy": 0.404046893119812,
"epoch": 0.43331518780620576,
"grad_norm": 0.2737223505973816,
"learning_rate": 0.00013441438630830464,
"loss": 0.4053,
"mean_token_accuracy": 0.8848972916603088,
"num_tokens": 806240.0,
"step": 199
},
{
"entropy": 0.3257349133491516,
"epoch": 0.43549265106151336,
"grad_norm": 0.28284040093421936,
"learning_rate": 0.0001338036033457259,
"loss": 0.3047,
"mean_token_accuracy": 0.9047138094902039,
"num_tokens": 809920.0,
"step": 200
},
{
"entropy": 0.3515155389904976,
"epoch": 0.4376701143168209,
"grad_norm": 0.2601410746574402,
"learning_rate": 0.00013319163932593226,
"loss": 0.3389,
"mean_token_accuracy": 0.8959746956825256,
"num_tokens": 813888.0,
"step": 201
},
{
"entropy": 0.35355835407972336,
"epoch": 0.43984757757212845,
"grad_norm": 0.28591784834861755,
"learning_rate": 0.0001325785246125838,
"loss": 0.3629,
"mean_token_accuracy": 0.8906663358211517,
"num_tokens": 817940.0,
"step": 202
},
{
"entropy": 0.36141665279865265,
"epoch": 0.44202504082743604,
"grad_norm": 0.27857449650764465,
"learning_rate": 0.00013196428962643426,
"loss": 0.3418,
"mean_token_accuracy": 0.8927578181028366,
"num_tokens": 822014.0,
"step": 203
},
{
"entropy": 0.4061436876654625,
"epoch": 0.4442025040827436,
"grad_norm": 0.2518883943557739,
"learning_rate": 0.0001313489648438217,
"loss": 0.4024,
"mean_token_accuracy": 0.8816352039575577,
"num_tokens": 826422.0,
"step": 204
},
{
"entropy": 0.3674250468611717,
"epoch": 0.4463799673380512,
"grad_norm": 0.2753954231739044,
"learning_rate": 0.00013073258079515632,
"loss": 0.3508,
"mean_token_accuracy": 0.8967752158641815,
"num_tokens": 830085.0,
"step": 205
},
{
"entropy": 0.35362084209918976,
"epoch": 0.44855743059335873,
"grad_norm": 0.2868417203426361,
"learning_rate": 0.00013011516806340557,
"loss": 0.3743,
"mean_token_accuracy": 0.8918885141611099,
"num_tokens": 834548.0,
"step": 206
},
{
"entropy": 0.39741218090057373,
"epoch": 0.45073489384866633,
"grad_norm": 0.2914039194583893,
"learning_rate": 0.0001294967572825769,
"loss": 0.3976,
"mean_token_accuracy": 0.8822353929281235,
"num_tokens": 838029.0,
"step": 207
},
{
"entropy": 0.31900452077388763,
"epoch": 0.4529123571039739,
"grad_norm": 0.24336911737918854,
"learning_rate": 0.0001288773791361977,
"loss": 0.3179,
"mean_token_accuracy": 0.9089991301298141,
"num_tokens": 842500.0,
"step": 208
},
{
"entropy": 0.3548683598637581,
"epoch": 0.4550898203592814,
"grad_norm": 0.24573664367198944,
"learning_rate": 0.0001282570643557928,
"loss": 0.3332,
"mean_token_accuracy": 0.8994109332561493,
"num_tokens": 846504.0,
"step": 209
},
{
"entropy": 0.4130469933152199,
"epoch": 0.457267283614589,
"grad_norm": 0.22916413843631744,
"learning_rate": 0.00012763584371935986,
"loss": 0.3935,
"mean_token_accuracy": 0.8888524770736694,
"num_tokens": 850825.0,
"step": 210
},
{
"entropy": 0.39430346339941025,
"epoch": 0.45944474686989656,
"grad_norm": 0.24899472296237946,
"learning_rate": 0.00012701374804984205,
"loss": 0.3623,
"mean_token_accuracy": 0.8868012726306915,
"num_tokens": 854995.0,
"step": 211
},
{
"entropy": 0.3773266300559044,
"epoch": 0.46162221012520416,
"grad_norm": 0.282216340303421,
"learning_rate": 0.00012639080821359898,
"loss": 0.3786,
"mean_token_accuracy": 0.8827318847179413,
"num_tokens": 858988.0,
"step": 212
},
{
"entropy": 0.3632218912243843,
"epoch": 0.4637996733805117,
"grad_norm": 0.2573084235191345,
"learning_rate": 0.00012576705511887492,
"loss": 0.3624,
"mean_token_accuracy": 0.8912414461374283,
"num_tokens": 863081.0,
"step": 213
},
{
"entropy": 0.35169900953769684,
"epoch": 0.46597713663581924,
"grad_norm": 0.2548096477985382,
"learning_rate": 0.00012514251971426545,
"loss": 0.3325,
"mean_token_accuracy": 0.9051143527030945,
"num_tokens": 867052.0,
"step": 214
},
{
"entropy": 0.36711084097623825,
"epoch": 0.46815459989112684,
"grad_norm": 0.2645510733127594,
"learning_rate": 0.00012451723298718175,
"loss": 0.3774,
"mean_token_accuracy": 0.8909319043159485,
"num_tokens": 871119.0,
"step": 215
},
{
"entropy": 0.35685280710458755,
"epoch": 0.4703320631464344,
"grad_norm": 0.3010730445384979,
"learning_rate": 0.0001238912259623133,
"loss": 0.3435,
"mean_token_accuracy": 0.8955214470624924,
"num_tokens": 874529.0,
"step": 216
},
{
"entropy": 0.3657463937997818,
"epoch": 0.472509526401742,
"grad_norm": 0.2753501534461975,
"learning_rate": 0.0001232645297000883,
"loss": 0.356,
"mean_token_accuracy": 0.8999243825674057,
"num_tokens": 878518.0,
"step": 217
},
{
"entropy": 0.3516548126935959,
"epoch": 0.47468698965704953,
"grad_norm": 0.2859194576740265,
"learning_rate": 0.00012263717529513267,
"loss": 0.3561,
"mean_token_accuracy": 0.8952623754739761,
"num_tokens": 882202.0,
"step": 218
},
{
"entropy": 0.3554818853735924,
"epoch": 0.47686445291235713,
"grad_norm": 0.2630636394023895,
"learning_rate": 0.00012200919387472723,
"loss": 0.3454,
"mean_token_accuracy": 0.8877929896116257,
"num_tokens": 886781.0,
"step": 219
},
{
"entropy": 0.35459691286087036,
"epoch": 0.47904191616766467,
"grad_norm": 0.28057464957237244,
"learning_rate": 0.0001213806165972633,
"loss": 0.3597,
"mean_token_accuracy": 0.8925827890634537,
"num_tokens": 890846.0,
"step": 220
},
{
"entropy": 0.3253984898328781,
"epoch": 0.4812193794229722,
"grad_norm": 0.2502402067184448,
"learning_rate": 0.00012075147465069667,
"loss": 0.3183,
"mean_token_accuracy": 0.9015309363603592,
"num_tokens": 895392.0,
"step": 221
},
{
"entropy": 0.3588094562292099,
"epoch": 0.4833968426782798,
"grad_norm": 0.24630582332611084,
"learning_rate": 0.0001201217992510002,
"loss": 0.3361,
"mean_token_accuracy": 0.9005966037511826,
"num_tokens": 899490.0,
"step": 222
},
{
"entropy": 0.3819248303771019,
"epoch": 0.48557430593358736,
"grad_norm": 0.24468845129013062,
"learning_rate": 0.00011949162164061486,
"loss": 0.3661,
"mean_token_accuracy": 0.8975157290697098,
"num_tokens": 903478.0,
"step": 223
},
{
"entropy": 0.4134289547801018,
"epoch": 0.48775176918889496,
"grad_norm": 0.27261775732040405,
"learning_rate": 0.0001188609730868998,
"loss": 0.4087,
"mean_token_accuracy": 0.8844785243272781,
"num_tokens": 907286.0,
"step": 224
},
{
"entropy": 0.3919166326522827,
"epoch": 0.4899292324442025,
"grad_norm": 0.2661035358905792,
"learning_rate": 0.00011822988488058071,
"loss": 0.3575,
"mean_token_accuracy": 0.8900353014469147,
"num_tokens": 911300.0,
"step": 225
},
{
"entropy": 0.34307558089494705,
"epoch": 0.49210669569951004,
"grad_norm": 0.2561405301094055,
"learning_rate": 0.00011759838833419754,
"loss": 0.3052,
"mean_token_accuracy": 0.90419901907444,
"num_tokens": 915659.0,
"step": 226
},
{
"entropy": 0.35558557510375977,
"epoch": 0.49428415895481764,
"grad_norm": 0.24936646223068237,
"learning_rate": 0.00011696651478055067,
"loss": 0.3531,
"mean_token_accuracy": 0.8979819416999817,
"num_tokens": 919483.0,
"step": 227
},
{
"entropy": 0.35391464084386826,
"epoch": 0.4964616222101252,
"grad_norm": 0.2600042521953583,
"learning_rate": 0.00011633429557114635,
"loss": 0.3565,
"mean_token_accuracy": 0.889078825712204,
"num_tokens": 923394.0,
"step": 228
},
{
"entropy": 0.37007713317871094,
"epoch": 0.4986390854654328,
"grad_norm": 0.25796735286712646,
"learning_rate": 0.00011570176207464114,
"loss": 0.3369,
"mean_token_accuracy": 0.8971839994192123,
"num_tokens": 927293.0,
"step": 229
},
{
"entropy": 0.38342171162366867,
"epoch": 0.5008165487207403,
"grad_norm": 0.27563533186912537,
"learning_rate": 0.00011506894567528556,
"loss": 0.3546,
"mean_token_accuracy": 0.8875249475240707,
"num_tokens": 931453.0,
"step": 230
},
{
"entropy": 0.3373766243457794,
"epoch": 0.5029940119760479,
"grad_norm": 0.24225658178329468,
"learning_rate": 0.00011443587777136679,
"loss": 0.3411,
"mean_token_accuracy": 0.9000124335289001,
"num_tokens": 936010.0,
"step": 231
},
{
"entropy": 0.33466411381959915,
"epoch": 0.5051714752313554,
"grad_norm": 0.2858439087867737,
"learning_rate": 0.0001138025897736509,
"loss": 0.3343,
"mean_token_accuracy": 0.8957197666168213,
"num_tokens": 939926.0,
"step": 232
},
{
"entropy": 0.3573242276906967,
"epoch": 0.5073489384866631,
"grad_norm": 0.30942314863204956,
"learning_rate": 0.00011316911310382416,
"loss": 0.3597,
"mean_token_accuracy": 0.8864942044019699,
"num_tokens": 944087.0,
"step": 233
},
{
"entropy": 0.3710939437150955,
"epoch": 0.5095264017419706,
"grad_norm": 0.2737363278865814,
"learning_rate": 0.00011253547919293439,
"loss": 0.3577,
"mean_token_accuracy": 0.8874527662992477,
"num_tokens": 948518.0,
"step": 234
},
{
"entropy": 0.33612143993377686,
"epoch": 0.5117038649972782,
"grad_norm": 0.24085883796215057,
"learning_rate": 0.00011190171947983091,
"loss": 0.3161,
"mean_token_accuracy": 0.902932345867157,
"num_tokens": 952833.0,
"step": 235
},
{
"entropy": 0.353444904088974,
"epoch": 0.5138813282525857,
"grad_norm": 0.28172338008880615,
"learning_rate": 0.00011126786540960512,
"loss": 0.3562,
"mean_token_accuracy": 0.8990496397018433,
"num_tokens": 956824.0,
"step": 236
},
{
"entropy": 0.33875197917222977,
"epoch": 0.5160587915078934,
"grad_norm": 0.2717280387878418,
"learning_rate": 0.00011063394843203004,
"loss": 0.3117,
"mean_token_accuracy": 0.9031887650489807,
"num_tokens": 960613.0,
"step": 237
},
{
"entropy": 0.3543147072196007,
"epoch": 0.5182362547632009,
"grad_norm": 0.2418098896741867,
"learning_rate": 0.00011000000000000002,
"loss": 0.3577,
"mean_token_accuracy": 0.8868001103401184,
"num_tokens": 965072.0,
"step": 238
},
{
"entropy": 0.3672889471054077,
"epoch": 0.5204137180185084,
"grad_norm": 0.27860227227211,
"learning_rate": 0.00010936605156797,
"loss": 0.3616,
"mean_token_accuracy": 0.8912352472543716,
"num_tokens": 969185.0,
"step": 239
},
{
"entropy": 0.3546944558620453,
"epoch": 0.522591181273816,
"grad_norm": 0.27250248193740845,
"learning_rate": 0.0001087321345903949,
"loss": 0.34,
"mean_token_accuracy": 0.8949205875396729,
"num_tokens": 972955.0,
"step": 240
},
{
"entropy": 0.4006873667240143,
"epoch": 0.5247686445291235,
"grad_norm": 0.28049609065055847,
"learning_rate": 0.00010809828052016913,
"loss": 0.3895,
"mean_token_accuracy": 0.878919780254364,
"num_tokens": 976759.0,
"step": 241
},
{
"entropy": 0.34407609701156616,
"epoch": 0.5269461077844312,
"grad_norm": 0.22804318368434906,
"learning_rate": 0.00010746452080706563,
"loss": 0.3046,
"mean_token_accuracy": 0.9041478931903839,
"num_tokens": 981169.0,
"step": 242
},
{
"entropy": 0.34020114690065384,
"epoch": 0.5291235710397387,
"grad_norm": 0.25987792015075684,
"learning_rate": 0.00010683088689617582,
"loss": 0.3175,
"mean_token_accuracy": 0.9022326022386551,
"num_tokens": 984838.0,
"step": 243
},
{
"entropy": 0.35350754112005234,
"epoch": 0.5313010342950463,
"grad_norm": 0.2573815584182739,
"learning_rate": 0.00010619741022634912,
"loss": 0.3525,
"mean_token_accuracy": 0.8904687911272049,
"num_tokens": 988767.0,
"step": 244
},
{
"entropy": 0.319248978048563,
"epoch": 0.5334784975503538,
"grad_norm": 0.21112677454948425,
"learning_rate": 0.00010556412222863321,
"loss": 0.3022,
"mean_token_accuracy": 0.9129808992147446,
"num_tokens": 993209.0,
"step": 245
},
{
"entropy": 0.3874542936682701,
"epoch": 0.5356559608056614,
"grad_norm": 0.2539237439632416,
"learning_rate": 0.00010493105432471443,
"loss": 0.3908,
"mean_token_accuracy": 0.8874447643756866,
"num_tokens": 997348.0,
"step": 246
},
{
"entropy": 0.3753085806965828,
"epoch": 0.537833424060969,
"grad_norm": 0.242266446352005,
"learning_rate": 0.00010429823792535891,
"loss": 0.3721,
"mean_token_accuracy": 0.8896859586238861,
"num_tokens": 1001182.0,
"step": 247
},
{
"entropy": 0.326670840382576,
"epoch": 0.5400108873162766,
"grad_norm": 0.24620375037193298,
"learning_rate": 0.00010366570442885373,
"loss": 0.3195,
"mean_token_accuracy": 0.9036577641963959,
"num_tokens": 1005310.0,
"step": 248
},
{
"entropy": 0.36552029848098755,
"epoch": 0.5421883505715841,
"grad_norm": 0.24721576273441315,
"learning_rate": 0.00010303348521944938,
"loss": 0.3665,
"mean_token_accuracy": 0.892762616276741,
"num_tokens": 1009657.0,
"step": 249
},
{
"entropy": 0.34408629685640335,
"epoch": 0.5443658138268916,
"grad_norm": 0.23724570870399475,
"learning_rate": 0.0001024016116658025,
"loss": 0.3347,
"mean_token_accuracy": 0.9008950591087341,
"num_tokens": 1014240.0,
"step": 250
},
{
"entropy": 0.33717598021030426,
"epoch": 0.5465432770821992,
"grad_norm": 0.25629547238349915,
"learning_rate": 0.0001017701151194193,
"loss": 0.3434,
"mean_token_accuracy": 0.9011830985546112,
"num_tokens": 1018254.0,
"step": 251
},
{
"entropy": 0.36749306321144104,
"epoch": 0.5487207403375068,
"grad_norm": 0.2619577944278717,
"learning_rate": 0.00010113902691310024,
"loss": 0.3551,
"mean_token_accuracy": 0.8974686414003372,
"num_tokens": 1022155.0,
"step": 252
},
{
"entropy": 0.4006720781326294,
"epoch": 0.5508982035928144,
"grad_norm": 0.2916308343410492,
"learning_rate": 0.00010050837835938516,
"loss": 0.3901,
"mean_token_accuracy": 0.884143054485321,
"num_tokens": 1026011.0,
"step": 253
},
{
"entropy": 0.3434867560863495,
"epoch": 0.5530756668481219,
"grad_norm": 0.24261599779129028,
"learning_rate": 9.98782007489998e-05,
"loss": 0.3447,
"mean_token_accuracy": 0.8931203186511993,
"num_tokens": 1029811.0,
"step": 254
},
{
"entropy": 0.33298294991254807,
"epoch": 0.5552531301034295,
"grad_norm": 0.24710261821746826,
"learning_rate": 9.924852534930333e-05,
"loss": 0.3163,
"mean_token_accuracy": 0.8988287448883057,
"num_tokens": 1033838.0,
"step": 255
},
{
"entropy": 0.36351051926612854,
"epoch": 0.5574305933587371,
"grad_norm": 0.22865501046180725,
"learning_rate": 9.861938340273671e-05,
"loss": 0.3537,
"mean_token_accuracy": 0.8958317637443542,
"num_tokens": 1038890.0,
"step": 256
},
{
"entropy": 0.34496162831783295,
"epoch": 0.5596080566140447,
"grad_norm": 0.27052974700927734,
"learning_rate": 9.79908061252728e-05,
"loss": 0.3422,
"mean_token_accuracy": 0.8985669314861298,
"num_tokens": 1042344.0,
"step": 257
},
{
"entropy": 0.3629255071282387,
"epoch": 0.5617855198693522,
"grad_norm": 0.27112752199172974,
"learning_rate": 9.736282470486739e-05,
"loss": 0.36,
"mean_token_accuracy": 0.8962416350841522,
"num_tokens": 1046638.0,
"step": 258
},
{
"entropy": 0.3592648208141327,
"epoch": 0.5639629831246598,
"grad_norm": 0.23911136388778687,
"learning_rate": 9.673547029991173e-05,
"loss": 0.3398,
"mean_token_accuracy": 0.8957805782556534,
"num_tokens": 1050963.0,
"step": 259
},
{
"entropy": 0.41059066355228424,
"epoch": 0.5661404463799673,
"grad_norm": 0.2601061463356018,
"learning_rate": 9.61087740376867e-05,
"loss": 0.413,
"mean_token_accuracy": 0.875213697552681,
"num_tokens": 1055077.0,
"step": 260
},
{
"entropy": 0.33156271278858185,
"epoch": 0.568317909635275,
"grad_norm": 0.2332238405942917,
"learning_rate": 9.548276701281821e-05,
"loss": 0.3202,
"mean_token_accuracy": 0.9033721536397934,
"num_tokens": 1059270.0,
"step": 261
},
{
"entropy": 0.38206638395786285,
"epoch": 0.5704953728905825,
"grad_norm": 0.2890869677066803,
"learning_rate": 9.485748028573455e-05,
"loss": 0.3721,
"mean_token_accuracy": 0.8858179748058319,
"num_tokens": 1063429.0,
"step": 262
},
{
"entropy": 0.3339729979634285,
"epoch": 0.57267283614589,
"grad_norm": 0.23651231825351715,
"learning_rate": 9.423294488112509e-05,
"loss": 0.3376,
"mean_token_accuracy": 0.9060862809419632,
"num_tokens": 1067575.0,
"step": 263
},
{
"entropy": 0.36243191361427307,
"epoch": 0.5748502994011976,
"grad_norm": 0.2469407469034195,
"learning_rate": 9.360919178640104e-05,
"loss": 0.3313,
"mean_token_accuracy": 0.9048342257738113,
"num_tokens": 1071393.0,
"step": 264
},
{
"entropy": 0.3420562148094177,
"epoch": 0.5770277626565051,
"grad_norm": 0.24036115407943726,
"learning_rate": 9.298625195015796e-05,
"loss": 0.3464,
"mean_token_accuracy": 0.900355190038681,
"num_tokens": 1076079.0,
"step": 265
},
{
"entropy": 0.39919717609882355,
"epoch": 0.5792052259118128,
"grad_norm": 0.2509303390979767,
"learning_rate": 9.236415628064017e-05,
"loss": 0.3731,
"mean_token_accuracy": 0.8862645626068115,
"num_tokens": 1079989.0,
"step": 266
},
{
"entropy": 0.3894932344555855,
"epoch": 0.5813826891671203,
"grad_norm": 0.25672271847724915,
"learning_rate": 9.174293564420724e-05,
"loss": 0.3749,
"mean_token_accuracy": 0.8905623853206635,
"num_tokens": 1083957.0,
"step": 267
},
{
"entropy": 0.37751832604408264,
"epoch": 0.5835601524224279,
"grad_norm": 0.2643100321292877,
"learning_rate": 9.112262086380234e-05,
"loss": 0.371,
"mean_token_accuracy": 0.8892365545034409,
"num_tokens": 1087639.0,
"step": 268
},
{
"entropy": 0.35557425767183304,
"epoch": 0.5857376156777354,
"grad_norm": 0.2569376230239868,
"learning_rate": 9.050324271742312e-05,
"loss": 0.3369,
"mean_token_accuracy": 0.8985206633806229,
"num_tokens": 1091448.0,
"step": 269
},
{
"entropy": 0.3663223683834076,
"epoch": 0.587915078933043,
"grad_norm": 0.28307580947875977,
"learning_rate": 8.988483193659447e-05,
"loss": 0.3379,
"mean_token_accuracy": 0.8939681947231293,
"num_tokens": 1095282.0,
"step": 270
},
{
"entropy": 0.35191214829683304,
"epoch": 0.5900925421883506,
"grad_norm": 0.241379514336586,
"learning_rate": 8.926741920484374e-05,
"loss": 0.3447,
"mean_token_accuracy": 0.8967802226543427,
"num_tokens": 1099519.0,
"step": 271
},
{
"entropy": 0.33553165942430496,
"epoch": 0.5922700054436582,
"grad_norm": 0.26522010564804077,
"learning_rate": 8.865103515617834e-05,
"loss": 0.3126,
"mean_token_accuracy": 0.9028987288475037,
"num_tokens": 1103293.0,
"step": 272
},
{
"entropy": 0.321424663066864,
"epoch": 0.5944474686989657,
"grad_norm": 0.23075014352798462,
"learning_rate": 8.803571037356575e-05,
"loss": 0.3204,
"mean_token_accuracy": 0.9045960456132889,
"num_tokens": 1107725.0,
"step": 273
},
{
"entropy": 0.3491132855415344,
"epoch": 0.5966249319542732,
"grad_norm": 0.26291459798812866,
"learning_rate": 8.742147538741623e-05,
"loss": 0.3178,
"mean_token_accuracy": 0.9050692319869995,
"num_tokens": 1111448.0,
"step": 274
},
{
"entropy": 0.3245581164956093,
"epoch": 0.5988023952095808,
"grad_norm": 0.2527916729450226,
"learning_rate": 8.680836067406775e-05,
"loss": 0.3164,
"mean_token_accuracy": 0.9089783430099487,
"num_tokens": 1115353.0,
"step": 275
},
{
"entropy": 0.3509984761476517,
"epoch": 0.6009798584648884,
"grad_norm": 0.2409028708934784,
"learning_rate": 8.619639665427411e-05,
"loss": 0.3205,
"mean_token_accuracy": 0.901856929063797,
"num_tokens": 1119037.0,
"step": 276
},
{
"entropy": 0.41339434683322906,
"epoch": 0.603157321720196,
"grad_norm": 0.2666266858577728,
"learning_rate": 8.558561369169535e-05,
"loss": 0.4118,
"mean_token_accuracy": 0.8851277679204941,
"num_tokens": 1122815.0,
"step": 277
},
{
"entropy": 0.355926550924778,
"epoch": 0.6053347849755035,
"grad_norm": 0.2666811943054199,
"learning_rate": 8.497604209139139e-05,
"loss": 0.3598,
"mean_token_accuracy": 0.8959801942110062,
"num_tokens": 1126942.0,
"step": 278
},
{
"entropy": 0.33385297656059265,
"epoch": 0.6075122482308111,
"grad_norm": 0.26262858510017395,
"learning_rate": 8.436771209831825e-05,
"loss": 0.356,
"mean_token_accuracy": 0.8975410759449005,
"num_tokens": 1130948.0,
"step": 279
},
{
"entropy": 0.3468668982386589,
"epoch": 0.6096897114861187,
"grad_norm": 0.2627294659614563,
"learning_rate": 8.376065389582739e-05,
"loss": 0.3453,
"mean_token_accuracy": 0.8972453325986862,
"num_tokens": 1135319.0,
"step": 280
},
{
"entropy": 0.34889067709445953,
"epoch": 0.6118671747414263,
"grad_norm": 0.2477421760559082,
"learning_rate": 8.315489760416839e-05,
"loss": 0.3221,
"mean_token_accuracy": 0.9074793308973312,
"num_tokens": 1138864.0,
"step": 281
},
{
"entropy": 0.3540688380599022,
"epoch": 0.6140446379967338,
"grad_norm": 0.2644377052783966,
"learning_rate": 8.255047327899392e-05,
"loss": 0.3697,
"mean_token_accuracy": 0.8973688334226608,
"num_tokens": 1142749.0,
"step": 282
},
{
"entropy": 0.3112604096531868,
"epoch": 0.6162221012520414,
"grad_norm": 0.24608677625656128,
"learning_rate": 8.19474109098691e-05,
"loss": 0.3115,
"mean_token_accuracy": 0.9069450497627258,
"num_tokens": 1146891.0,
"step": 283
},
{
"entropy": 0.3105768784880638,
"epoch": 0.6183995645073489,
"grad_norm": 0.2628800868988037,
"learning_rate": 8.134574041878306e-05,
"loss": 0.3144,
"mean_token_accuracy": 0.9045025259256363,
"num_tokens": 1151024.0,
"step": 284
},
{
"entropy": 0.3061619848012924,
"epoch": 0.6205770277626566,
"grad_norm": 0.2500765919685364,
"learning_rate": 8.074549165866463e-05,
"loss": 0.2996,
"mean_token_accuracy": 0.9090612530708313,
"num_tokens": 1155564.0,
"step": 285
},
{
"entropy": 0.34363674372434616,
"epoch": 0.6227544910179641,
"grad_norm": 0.2619493305683136,
"learning_rate": 8.014669441190081e-05,
"loss": 0.3196,
"mean_token_accuracy": 0.8998923152685165,
"num_tokens": 1159454.0,
"step": 286
},
{
"entropy": 0.3449995443224907,
"epoch": 0.6249319542732716,
"grad_norm": 0.2670820355415344,
"learning_rate": 7.954937838885937e-05,
"loss": 0.3517,
"mean_token_accuracy": 0.8967305719852448,
"num_tokens": 1163267.0,
"step": 287
},
{
"entropy": 0.3603576719760895,
"epoch": 0.6271094175285792,
"grad_norm": 0.24100132286548615,
"learning_rate": 7.895357322641452e-05,
"loss": 0.3508,
"mean_token_accuracy": 0.8935562521219254,
"num_tokens": 1167581.0,
"step": 288
},
{
"entropy": 0.3160111829638481,
"epoch": 0.6292868807838867,
"grad_norm": 0.2645825445652008,
"learning_rate": 7.835930848647653e-05,
"loss": 0.3045,
"mean_token_accuracy": 0.9113835692405701,
"num_tokens": 1171514.0,
"step": 289
},
{
"entropy": 0.33360420912504196,
"epoch": 0.6314643440391944,
"grad_norm": 0.22924089431762695,
"learning_rate": 7.776661365452491e-05,
"loss": 0.3087,
"mean_token_accuracy": 0.9061863869428635,
"num_tokens": 1175361.0,
"step": 290
},
{
"entropy": 0.3485657498240471,
"epoch": 0.6336418072945019,
"grad_norm": 0.24018257856369019,
"learning_rate": 7.717551813814543e-05,
"loss": 0.3087,
"mean_token_accuracy": 0.903602659702301,
"num_tokens": 1179132.0,
"step": 291
},
{
"entropy": 0.342680849134922,
"epoch": 0.6358192705498095,
"grad_norm": 0.22566929459571838,
"learning_rate": 7.658605126557105e-05,
"loss": 0.3183,
"mean_token_accuracy": 0.9066330194473267,
"num_tokens": 1183571.0,
"step": 292
},
{
"entropy": 0.3731561452150345,
"epoch": 0.637996733805117,
"grad_norm": 0.2820538580417633,
"learning_rate": 7.599824228422677e-05,
"loss": 0.371,
"mean_token_accuracy": 0.8894180357456207,
"num_tokens": 1187179.0,
"step": 293
},
{
"entropy": 0.32869182527065277,
"epoch": 0.6401741970604246,
"grad_norm": 0.2502634823322296,
"learning_rate": 7.541212035927839e-05,
"loss": 0.2968,
"mean_token_accuracy": 0.9134543687105179,
"num_tokens": 1191246.0,
"step": 294
},
{
"entropy": 0.37562160193920135,
"epoch": 0.6423516603157322,
"grad_norm": 0.2863782048225403,
"learning_rate": 7.482771457218542e-05,
"loss": 0.3717,
"mean_token_accuracy": 0.8882504254579544,
"num_tokens": 1195149.0,
"step": 295
},
{
"entropy": 0.33977876603603363,
"epoch": 0.6445291235710398,
"grad_norm": 0.24794067442417145,
"learning_rate": 7.424505391925833e-05,
"loss": 0.3122,
"mean_token_accuracy": 0.9125866144895554,
"num_tokens": 1198886.0,
"step": 296
},
{
"entropy": 0.3772798329591751,
"epoch": 0.6467065868263473,
"grad_norm": 0.23983165621757507,
"learning_rate": 7.366416731021964e-05,
"loss": 0.362,
"mean_token_accuracy": 0.8952146172523499,
"num_tokens": 1202933.0,
"step": 297
},
{
"entropy": 0.3076706826686859,
"epoch": 0.6488840500816548,
"grad_norm": 0.2429223656654358,
"learning_rate": 7.30850835667696e-05,
"loss": 0.3008,
"mean_token_accuracy": 0.909699097275734,
"num_tokens": 1206978.0,
"step": 298
},
{
"entropy": 0.3360467702150345,
"epoch": 0.6510615133369625,
"grad_norm": 0.25572511553764343,
"learning_rate": 7.250783142115615e-05,
"loss": 0.341,
"mean_token_accuracy": 0.9028728753328323,
"num_tokens": 1210951.0,
"step": 299
},
{
"entropy": 0.305056668817997,
"epoch": 0.65323897659227,
"grad_norm": 0.24135318398475647,
"learning_rate": 7.193243951474933e-05,
"loss": 0.3122,
"mean_token_accuracy": 0.908637598156929,
"num_tokens": 1215517.0,
"step": 300
},
{
"entropy": 0.337252639234066,
"epoch": 0.6554164398475776,
"grad_norm": 0.27407306432724,
"learning_rate": 7.135893639662012e-05,
"loss": 0.3226,
"mean_token_accuracy": 0.9033920913934708,
"num_tokens": 1219456.0,
"step": 301
},
{
"entropy": 0.3444167599081993,
"epoch": 0.6575939031028851,
"grad_norm": 0.2554808557033539,
"learning_rate": 7.078735052212402e-05,
"loss": 0.3405,
"mean_token_accuracy": 0.8994651138782501,
"num_tokens": 1223440.0,
"step": 302
},
{
"entropy": 0.3203364834189415,
"epoch": 0.6597713663581927,
"grad_norm": 0.2498241364955902,
"learning_rate": 7.021771025148922e-05,
"loss": 0.2994,
"mean_token_accuracy": 0.9104214161634445,
"num_tokens": 1227205.0,
"step": 303
},
{
"entropy": 0.3659024015069008,
"epoch": 0.6619488296135003,
"grad_norm": 0.24576182663440704,
"learning_rate": 6.965004384840928e-05,
"loss": 0.3434,
"mean_token_accuracy": 0.8974325805902481,
"num_tokens": 1231062.0,
"step": 304
},
{
"entropy": 0.35433361679315567,
"epoch": 0.6641262928688079,
"grad_norm": 0.2348756641149521,
"learning_rate": 6.90843794786409e-05,
"loss": 0.3326,
"mean_token_accuracy": 0.8999007195234299,
"num_tokens": 1235210.0,
"step": 305
},
{
"entropy": 0.3522880747914314,
"epoch": 0.6663037561241154,
"grad_norm": 0.24180057644844055,
"learning_rate": 6.852074520860648e-05,
"loss": 0.3286,
"mean_token_accuracy": 0.9014742374420166,
"num_tokens": 1238954.0,
"step": 306
},
{
"entropy": 0.35235612094402313,
"epoch": 0.668481219379423,
"grad_norm": 0.24760101735591888,
"learning_rate": 6.795916900400138e-05,
"loss": 0.3262,
"mean_token_accuracy": 0.9001569449901581,
"num_tokens": 1242691.0,
"step": 307
},
{
"entropy": 0.35372819751501083,
"epoch": 0.6706586826347305,
"grad_norm": 0.2558618485927582,
"learning_rate": 6.739967872840662e-05,
"loss": 0.3389,
"mean_token_accuracy": 0.9027666747570038,
"num_tokens": 1246355.0,
"step": 308
},
{
"entropy": 0.32674338668584824,
"epoch": 0.6728361458900382,
"grad_norm": 0.2397354543209076,
"learning_rate": 6.684230214190608e-05,
"loss": 0.3026,
"mean_token_accuracy": 0.9039190113544464,
"num_tokens": 1251017.0,
"step": 309
},
{
"entropy": 0.3184630870819092,
"epoch": 0.6750136091453457,
"grad_norm": 0.2725917100906372,
"learning_rate": 6.628706689970932e-05,
"loss": 0.3305,
"mean_token_accuracy": 0.8989760279655457,
"num_tokens": 1255024.0,
"step": 310
},
{
"entropy": 0.35561081022024155,
"epoch": 0.6771910724006532,
"grad_norm": 0.24204087257385254,
"learning_rate": 6.573400055077938e-05,
"loss": 0.3393,
"mean_token_accuracy": 0.8942540436983109,
"num_tokens": 1259033.0,
"step": 311
},
{
"entropy": 0.3308749422430992,
"epoch": 0.6793685356559608,
"grad_norm": 0.23772156238555908,
"learning_rate": 6.518313053646586e-05,
"loss": 0.3264,
"mean_token_accuracy": 0.9023979008197784,
"num_tokens": 1263455.0,
"step": 312
},
{
"entropy": 0.3347730040550232,
"epoch": 0.6815459989112683,
"grad_norm": 0.2505793571472168,
"learning_rate": 6.463448418914348e-05,
"loss": 0.3392,
"mean_token_accuracy": 0.9027709066867828,
"num_tokens": 1267335.0,
"step": 313
},
{
"entropy": 0.3568695932626724,
"epoch": 0.683723462166576,
"grad_norm": 0.24569235742092133,
"learning_rate": 6.408808873085577e-05,
"loss": 0.3399,
"mean_token_accuracy": 0.8940989226102829,
"num_tokens": 1271810.0,
"step": 314
},
{
"entropy": 0.33151426911354065,
"epoch": 0.6859009254218835,
"grad_norm": 0.28417110443115234,
"learning_rate": 6.354397127196448e-05,
"loss": 0.3196,
"mean_token_accuracy": 0.9016236513853073,
"num_tokens": 1275575.0,
"step": 315
},
{
"entropy": 0.31233637779951096,
"epoch": 0.6880783886771911,
"grad_norm": 0.23522846400737762,
"learning_rate": 6.300215880980446e-05,
"loss": 0.2954,
"mean_token_accuracy": 0.9116706401109695,
"num_tokens": 1280034.0,
"step": 316
},
{
"entropy": 0.35172613710165024,
"epoch": 0.6902558519324986,
"grad_norm": 0.25289177894592285,
"learning_rate": 6.246267822734421e-05,
"loss": 0.3253,
"mean_token_accuracy": 0.8971187770366669,
"num_tokens": 1283664.0,
"step": 317
},
{
"entropy": 0.3484005257487297,
"epoch": 0.6924333151878062,
"grad_norm": 0.2565121054649353,
"learning_rate": 6.192555629185189e-05,
"loss": 0.3408,
"mean_token_accuracy": 0.8945488780736923,
"num_tokens": 1287685.0,
"step": 318
},
{
"entropy": 0.3195461556315422,
"epoch": 0.6946107784431138,
"grad_norm": 0.24285030364990234,
"learning_rate": 6.139081965356725e-05,
"loss": 0.3188,
"mean_token_accuracy": 0.9035038352012634,
"num_tokens": 1291337.0,
"step": 319
},
{
"entropy": 0.33748240023851395,
"epoch": 0.6967882416984214,
"grad_norm": 0.24630972743034363,
"learning_rate": 6.085849484437944e-05,
"loss": 0.3411,
"mean_token_accuracy": 0.9040576815605164,
"num_tokens": 1295196.0,
"step": 320
},
{
"entropy": 0.3222072795033455,
"epoch": 0.6989657049537289,
"grad_norm": 0.23582881689071655,
"learning_rate": 6.0328608276510476e-05,
"loss": 0.3193,
"mean_token_accuracy": 0.900396928191185,
"num_tokens": 1299276.0,
"step": 321
},
{
"entropy": 0.34793104976415634,
"epoch": 0.7011431682090364,
"grad_norm": 0.28013235330581665,
"learning_rate": 5.980118624120483e-05,
"loss": 0.3234,
"mean_token_accuracy": 0.8983870148658752,
"num_tokens": 1302970.0,
"step": 322
},
{
"entropy": 0.3033921644091606,
"epoch": 0.7033206314643441,
"grad_norm": 0.23157738149166107,
"learning_rate": 5.9276254907424846e-05,
"loss": 0.2927,
"mean_token_accuracy": 0.9108779579401016,
"num_tokens": 1307008.0,
"step": 323
},
{
"entropy": 0.36583440005779266,
"epoch": 0.7054980947196516,
"grad_norm": 0.2319372296333313,
"learning_rate": 5.875384032055239e-05,
"loss": 0.371,
"mean_token_accuracy": 0.897381991147995,
"num_tokens": 1311263.0,
"step": 324
},
{
"entropy": 0.3215944245457649,
"epoch": 0.7076755579749592,
"grad_norm": 0.23082365095615387,
"learning_rate": 5.823396840109657e-05,
"loss": 0.3094,
"mean_token_accuracy": 0.903637707233429,
"num_tokens": 1315823.0,
"step": 325
},
{
"entropy": 0.34219200164079666,
"epoch": 0.7098530212302667,
"grad_norm": 0.23884856700897217,
"learning_rate": 5.771666494340756e-05,
"loss": 0.3289,
"mean_token_accuracy": 0.9032928794622421,
"num_tokens": 1319955.0,
"step": 326
},
{
"entropy": 0.2886577844619751,
"epoch": 0.7120304844855743,
"grad_norm": 0.22707660496234894,
"learning_rate": 5.7201955614396964e-05,
"loss": 0.2839,
"mean_token_accuracy": 0.9143697619438171,
"num_tokens": 1324096.0,
"step": 327
},
{
"entropy": 0.3365718871355057,
"epoch": 0.7142079477408819,
"grad_norm": 0.21789753437042236,
"learning_rate": 5.668986595226404e-05,
"loss": 0.3316,
"mean_token_accuracy": 0.9025033861398697,
"num_tokens": 1328868.0,
"step": 328
},
{
"entropy": 0.313778854906559,
"epoch": 0.7163854109961895,
"grad_norm": 0.24393050372600555,
"learning_rate": 5.618042136522881e-05,
"loss": 0.3212,
"mean_token_accuracy": 0.9037179052829742,
"num_tokens": 1333087.0,
"step": 329
},
{
"entropy": 0.3029978275299072,
"epoch": 0.718562874251497,
"grad_norm": 0.24070705473423004,
"learning_rate": 5.567364713027121e-05,
"loss": 0.306,
"mean_token_accuracy": 0.9108355790376663,
"num_tokens": 1337351.0,
"step": 330
},
{
"entropy": 0.36839231103658676,
"epoch": 0.7207403375068046,
"grad_norm": 0.25364482402801514,
"learning_rate": 5.5169568391877035e-05,
"loss": 0.3493,
"mean_token_accuracy": 0.89275161921978,
"num_tokens": 1341499.0,
"step": 331
},
{
"entropy": 0.37619777768850327,
"epoch": 0.7229178007621121,
"grad_norm": 0.24351854622364044,
"learning_rate": 5.46682101607904e-05,
"loss": 0.3816,
"mean_token_accuracy": 0.8932196348905563,
"num_tokens": 1345295.0,
"step": 332
},
{
"entropy": 0.30273835361003876,
"epoch": 0.7250952640174197,
"grad_norm": 0.2297053039073944,
"learning_rate": 5.416959731277264e-05,
"loss": 0.2852,
"mean_token_accuracy": 0.9142936319112778,
"num_tokens": 1349605.0,
"step": 333
},
{
"entropy": 0.3630438446998596,
"epoch": 0.7272727272727273,
"grad_norm": 0.2559914290904999,
"learning_rate": 5.3673754587368094e-05,
"loss": 0.3791,
"mean_token_accuracy": 0.8942387253046036,
"num_tokens": 1353706.0,
"step": 334
},
{
"entropy": 0.32199136167764664,
"epoch": 0.7294501905280348,
"grad_norm": 0.25669071078300476,
"learning_rate": 5.318070658667671e-05,
"loss": 0.3123,
"mean_token_accuracy": 0.9080253690481186,
"num_tokens": 1357558.0,
"step": 335
},
{
"entropy": 0.35776887834072113,
"epoch": 0.7316276537833424,
"grad_norm": 0.2596750855445862,
"learning_rate": 5.269047777413333e-05,
"loss": 0.3436,
"mean_token_accuracy": 0.8997514098882675,
"num_tokens": 1361340.0,
"step": 336
},
{
"entropy": 0.34765905141830444,
"epoch": 0.7338051170386499,
"grad_norm": 0.21836940944194794,
"learning_rate": 5.22030924732938e-05,
"loss": 0.3277,
"mean_token_accuracy": 0.9053044319152832,
"num_tokens": 1365153.0,
"step": 337
},
{
"entropy": 0.34295450896024704,
"epoch": 0.7359825802939576,
"grad_norm": 0.2738622725009918,
"learning_rate": 5.171857486662823e-05,
"loss": 0.3336,
"mean_token_accuracy": 0.8998141139745712,
"num_tokens": 1368896.0,
"step": 338
},
{
"entropy": 0.32134611159563065,
"epoch": 0.7381600435492651,
"grad_norm": 0.22107118368148804,
"learning_rate": 5.1236948994321055e-05,
"loss": 0.2999,
"mean_token_accuracy": 0.908054381608963,
"num_tokens": 1373609.0,
"step": 339
},
{
"entropy": 0.3105727434158325,
"epoch": 0.7403375068045727,
"grad_norm": 0.23407259583473206,
"learning_rate": 5.075823875307828e-05,
"loss": 0.2947,
"mean_token_accuracy": 0.9088436663150787,
"num_tokens": 1377893.0,
"step": 340
},
{
"entropy": 0.3235616162419319,
"epoch": 0.7425149700598802,
"grad_norm": 0.2505863606929779,
"learning_rate": 5.0282467894941864e-05,
"loss": 0.3338,
"mean_token_accuracy": 0.9098049253225327,
"num_tokens": 1381665.0,
"step": 341
},
{
"entropy": 0.30407993495464325,
"epoch": 0.7446924333151878,
"grad_norm": 0.23674152791500092,
"learning_rate": 4.980966002611108e-05,
"loss": 0.2939,
"mean_token_accuracy": 0.9113668948411942,
"num_tokens": 1386000.0,
"step": 342
},
{
"entropy": 0.29837000370025635,
"epoch": 0.7468698965704954,
"grad_norm": 0.24069277942180634,
"learning_rate": 4.933983860577136e-05,
"loss": 0.2801,
"mean_token_accuracy": 0.9147733300924301,
"num_tokens": 1389768.0,
"step": 343
},
{
"entropy": 0.5488722026348114,
"epoch": 0.749047359825803,
"grad_norm": 0.23018239438533783,
"learning_rate": 4.887302694493029e-05,
"loss": 0.6326,
"mean_token_accuracy": 0.8503530323505402,
"num_tokens": 1394588.0,
"step": 344
},
{
"entropy": 0.3708427771925926,
"epoch": 0.7512248230811105,
"grad_norm": 0.28215181827545166,
"learning_rate": 4.840924820526096e-05,
"loss": 0.3952,
"mean_token_accuracy": 0.8861146718263626,
"num_tokens": 1398304.0,
"step": 345
},
{
"entropy": 0.34193163365125656,
"epoch": 0.753402286336418,
"grad_norm": 0.2342662215232849,
"learning_rate": 4.794852539795291e-05,
"loss": 0.3495,
"mean_token_accuracy": 0.903597891330719,
"num_tokens": 1402505.0,
"step": 346
},
{
"entropy": 0.3103507123887539,
"epoch": 0.7555797495917257,
"grad_norm": 0.23902368545532227,
"learning_rate": 4.749088138257017e-05,
"loss": 0.3078,
"mean_token_accuracy": 0.9087391942739487,
"num_tokens": 1406703.0,
"step": 347
},
{
"entropy": 0.3236440420150757,
"epoch": 0.7577572128470332,
"grad_norm": 0.22265306115150452,
"learning_rate": 4.703633886591719e-05,
"loss": 0.3387,
"mean_token_accuracy": 0.9036975800991058,
"num_tokens": 1410765.0,
"step": 348
},
{
"entropy": 0.31991977244615555,
"epoch": 0.7599346761023408,
"grad_norm": 0.2397955358028412,
"learning_rate": 4.6584920400912156e-05,
"loss": 0.3056,
"mean_token_accuracy": 0.9113240092992783,
"num_tokens": 1414804.0,
"step": 349
},
{
"entropy": 0.28877923637628555,
"epoch": 0.7621121393576483,
"grad_norm": 0.2253178060054779,
"learning_rate": 4.6136648385467977e-05,
"loss": 0.2649,
"mean_token_accuracy": 0.9233576655387878,
"num_tokens": 1419025.0,
"step": 350
},
{
"entropy": 0.35838521271944046,
"epoch": 0.7642896026129559,
"grad_norm": 0.2513080835342407,
"learning_rate": 4.5691545061381026e-05,
"loss": 0.3413,
"mean_token_accuracy": 0.8982634395360947,
"num_tokens": 1423031.0,
"step": 351
},
{
"entropy": 0.371716171503067,
"epoch": 0.7664670658682635,
"grad_norm": 0.23435106873512268,
"learning_rate": 4.5249632513227504e-05,
"loss": 0.3457,
"mean_token_accuracy": 0.9014202654361725,
"num_tokens": 1427232.0,
"step": 352
},
{
"entropy": 0.32540784031152725,
"epoch": 0.7686445291235711,
"grad_norm": 0.2637276351451874,
"learning_rate": 4.481093266726772e-05,
"loss": 0.2913,
"mean_token_accuracy": 0.9063924849033356,
"num_tokens": 1431135.0,
"step": 353
},
{
"entropy": 0.35406405478715897,
"epoch": 0.7708219923788786,
"grad_norm": 0.24304324388504028,
"learning_rate": 4.43754672903582e-05,
"loss": 0.3232,
"mean_token_accuracy": 0.9024296700954437,
"num_tokens": 1435499.0,
"step": 354
},
{
"entropy": 0.32546380907297134,
"epoch": 0.7729994556341862,
"grad_norm": 0.22986435890197754,
"learning_rate": 4.394325798887158e-05,
"loss": 0.31,
"mean_token_accuracy": 0.9013588130474091,
"num_tokens": 1439833.0,
"step": 355
},
{
"entropy": 0.38513386994600296,
"epoch": 0.7751769188894937,
"grad_norm": 0.27596256136894226,
"learning_rate": 4.351432620762478e-05,
"loss": 0.346,
"mean_token_accuracy": 0.8986889123916626,
"num_tokens": 1443460.0,
"step": 356
},
{
"entropy": 0.3382200300693512,
"epoch": 0.7773543821448013,
"grad_norm": 0.24578897655010223,
"learning_rate": 4.30886932288147e-05,
"loss": 0.3229,
"mean_token_accuracy": 0.9034900367259979,
"num_tokens": 1447099.0,
"step": 357
},
{
"entropy": 0.3409022316336632,
"epoch": 0.7795318454001089,
"grad_norm": 0.2280901074409485,
"learning_rate": 4.266638017096252e-05,
"loss": 0.3411,
"mean_token_accuracy": 0.9012559950351715,
"num_tokens": 1451312.0,
"step": 358
},
{
"entropy": 0.32152481377124786,
"epoch": 0.7817093086554164,
"grad_norm": 0.24760432541370392,
"learning_rate": 4.224740798786573e-05,
"loss": 0.3204,
"mean_token_accuracy": 0.9076259434223175,
"num_tokens": 1455523.0,
"step": 359
},
{
"entropy": 0.31170132011175156,
"epoch": 0.783886771910724,
"grad_norm": 0.2510303258895874,
"learning_rate": 4.183179746755844e-05,
"loss": 0.3126,
"mean_token_accuracy": 0.9090617448091507,
"num_tokens": 1459544.0,
"step": 360
},
{
"entropy": 0.3523375913500786,
"epoch": 0.7860642351660315,
"grad_norm": 0.26667118072509766,
"learning_rate": 4.141956923128013e-05,
"loss": 0.3492,
"mean_token_accuracy": 0.8998522162437439,
"num_tokens": 1463315.0,
"step": 361
},
{
"entropy": 0.3598644956946373,
"epoch": 0.7882416984213392,
"grad_norm": 0.2440025806427002,
"learning_rate": 4.1010743732452294e-05,
"loss": 0.3544,
"mean_token_accuracy": 0.8947449177503586,
"num_tokens": 1467647.0,
"step": 362
},
{
"entropy": 0.395267553627491,
"epoch": 0.7904191616766467,
"grad_norm": 0.24411144852638245,
"learning_rate": 4.0605341255663696e-05,
"loss": 0.4317,
"mean_token_accuracy": 0.8864284604787827,
"num_tokens": 1471972.0,
"step": 363
},
{
"entropy": 0.33659572899341583,
"epoch": 0.7925966249319543,
"grad_norm": 0.26458773016929626,
"learning_rate": 4.02033819156639e-05,
"loss": 0.3298,
"mean_token_accuracy": 0.9003510624170303,
"num_tokens": 1475826.0,
"step": 364
},
{
"entropy": 0.29316914454102516,
"epoch": 0.7947740881872618,
"grad_norm": 0.25398463010787964,
"learning_rate": 3.980488565636522e-05,
"loss": 0.2772,
"mean_token_accuracy": 0.9137367159128189,
"num_tokens": 1480107.0,
"step": 365
},
{
"entropy": 0.3080258443951607,
"epoch": 0.7969515514425695,
"grad_norm": 0.26426613330841064,
"learning_rate": 3.9409872249853286e-05,
"loss": 0.3046,
"mean_token_accuracy": 0.9098687618970871,
"num_tokens": 1484069.0,
"step": 366
},
{
"entropy": 0.34426791220903397,
"epoch": 0.799129014697877,
"grad_norm": 0.2809188663959503,
"learning_rate": 3.9018361295405856e-05,
"loss": 0.3592,
"mean_token_accuracy": 0.9000663906335831,
"num_tokens": 1487840.0,
"step": 367
},
{
"entropy": 0.33940157294273376,
"epoch": 0.8013064779531845,
"grad_norm": 0.2272171825170517,
"learning_rate": 3.8630372218520384e-05,
"loss": 0.3417,
"mean_token_accuracy": 0.9024456739425659,
"num_tokens": 1491938.0,
"step": 368
},
{
"entropy": 0.33219510316848755,
"epoch": 0.8034839412084921,
"grad_norm": 0.2192796915769577,
"learning_rate": 3.824592426995029e-05,
"loss": 0.3221,
"mean_token_accuracy": 0.9031501561403275,
"num_tokens": 1496386.0,
"step": 369
},
{
"entropy": 0.3439122289419174,
"epoch": 0.8056614044637996,
"grad_norm": 0.229109987616539,
"learning_rate": 3.786503652474982e-05,
"loss": 0.3427,
"mean_token_accuracy": 0.9062491357326508,
"num_tokens": 1500938.0,
"step": 370
},
{
"entropy": 0.3725889101624489,
"epoch": 0.8078388677191073,
"grad_norm": 0.2585630714893341,
"learning_rate": 3.7487727881327405e-05,
"loss": 0.3704,
"mean_token_accuracy": 0.8960603177547455,
"num_tokens": 1504742.0,
"step": 371
},
{
"entropy": 0.3037722408771515,
"epoch": 0.8100163309744148,
"grad_norm": 0.23759490251541138,
"learning_rate": 3.711401706050821e-05,
"loss": 0.2939,
"mean_token_accuracy": 0.9124279767274857,
"num_tokens": 1508512.0,
"step": 372
},
{
"entropy": 0.3051731139421463,
"epoch": 0.8121937942297224,
"grad_norm": 0.22473642230033875,
"learning_rate": 3.674392260460509e-05,
"loss": 0.3036,
"mean_token_accuracy": 0.9092454463243484,
"num_tokens": 1513083.0,
"step": 373
},
{
"entropy": 0.3145020753145218,
"epoch": 0.8143712574850299,
"grad_norm": 0.2272917479276657,
"learning_rate": 3.6377462876498694e-05,
"loss": 0.2858,
"mean_token_accuracy": 0.9174733906984329,
"num_tokens": 1516960.0,
"step": 374
},
{
"entropy": 0.33495523035526276,
"epoch": 0.8165487207403375,
"grad_norm": 0.24096311628818512,
"learning_rate": 3.601465605872636e-05,
"loss": 0.3004,
"mean_token_accuracy": 0.9126247465610504,
"num_tokens": 1520583.0,
"step": 375
},
{
"entropy": 0.3524938374757767,
"epoch": 0.8187261839956451,
"grad_norm": 0.23482073843479156,
"learning_rate": 3.565552015257989e-05,
"loss": 0.3596,
"mean_token_accuracy": 0.894221231341362,
"num_tokens": 1525126.0,
"step": 376
},
{
"entropy": 0.3637235388159752,
"epoch": 0.8209036472509527,
"grad_norm": 0.2486315220594406,
"learning_rate": 3.530007297721239e-05,
"loss": 0.3518,
"mean_token_accuracy": 0.8981701731681824,
"num_tokens": 1528846.0,
"step": 377
},
{
"entropy": 0.327960979193449,
"epoch": 0.8230811105062602,
"grad_norm": 0.21721476316452026,
"learning_rate": 3.494833216875421e-05,
"loss": 0.2854,
"mean_token_accuracy": 0.915936678647995,
"num_tokens": 1532720.0,
"step": 378
},
{
"entropy": 0.3281715139746666,
"epoch": 0.8252585737615677,
"grad_norm": 0.27801278233528137,
"learning_rate": 3.4600315179437807e-05,
"loss": 0.3094,
"mean_token_accuracy": 0.9122365713119507,
"num_tokens": 1536770.0,
"step": 379
},
{
"entropy": 0.319459468126297,
"epoch": 0.8274360370168753,
"grad_norm": 0.24818798899650574,
"learning_rate": 3.425603927673195e-05,
"loss": 0.2909,
"mean_token_accuracy": 0.9143448621034622,
"num_tokens": 1540543.0,
"step": 380
},
{
"entropy": 0.29846663028001785,
"epoch": 0.829613500272183,
"grad_norm": 0.2553517520427704,
"learning_rate": 3.3915521542484794e-05,
"loss": 0.2984,
"mean_token_accuracy": 0.9117088168859482,
"num_tokens": 1544682.0,
"step": 381
},
{
"entropy": 0.3208995833992958,
"epoch": 0.8317909635274905,
"grad_norm": 0.23631241917610168,
"learning_rate": 3.357877887207648e-05,
"loss": 0.3218,
"mean_token_accuracy": 0.9085069596767426,
"num_tokens": 1548933.0,
"step": 382
},
{
"entropy": 0.3497694879770279,
"epoch": 0.833968426782798,
"grad_norm": 0.26314374804496765,
"learning_rate": 3.3245827973580754e-05,
"loss": 0.3651,
"mean_token_accuracy": 0.8973031789064407,
"num_tokens": 1553109.0,
"step": 383
},
{
"entropy": 0.36065296083688736,
"epoch": 0.8361458900381056,
"grad_norm": 0.2554258704185486,
"learning_rate": 3.2916685366936016e-05,
"loss": 0.3572,
"mean_token_accuracy": 0.8984216153621674,
"num_tokens": 1557199.0,
"step": 384
},
{
"entropy": 0.3203965201973915,
"epoch": 0.8383233532934131,
"grad_norm": 0.2560184597969055,
"learning_rate": 3.259136738312565e-05,
"loss": 0.3107,
"mean_token_accuracy": 0.9113545119762421,
"num_tokens": 1560942.0,
"step": 385
},
{
"entropy": 0.3545750603079796,
"epoch": 0.8405008165487208,
"grad_norm": 0.23520711064338684,
"learning_rate": 3.226989016336767e-05,
"loss": 0.3295,
"mean_token_accuracy": 0.8977851718664169,
"num_tokens": 1565528.0,
"step": 386
},
{
"entropy": 0.27805931866168976,
"epoch": 0.8426782798040283,
"grad_norm": 0.22847194969654083,
"learning_rate": 3.1952269658313963e-05,
"loss": 0.2647,
"mean_token_accuracy": 0.9223105758428574,
"num_tokens": 1569618.0,
"step": 387
},
{
"entropy": 0.36420372873544693,
"epoch": 0.8448557430593359,
"grad_norm": 0.2458695024251938,
"learning_rate": 3.163852162725872e-05,
"loss": 0.349,
"mean_token_accuracy": 0.8980138152837753,
"num_tokens": 1573505.0,
"step": 388
},
{
"entropy": 0.3188191279768944,
"epoch": 0.8470332063146434,
"grad_norm": 0.245536670088768,
"learning_rate": 3.1328661637356714e-05,
"loss": 0.3177,
"mean_token_accuracy": 0.907622441649437,
"num_tokens": 1577568.0,
"step": 389
},
{
"entropy": 0.3238792344927788,
"epoch": 0.8492106695699511,
"grad_norm": 0.24584944546222687,
"learning_rate": 3.102270506285067e-05,
"loss": 0.3085,
"mean_token_accuracy": 0.9090628027915955,
"num_tokens": 1581202.0,
"step": 390
},
{
"entropy": 0.34554795920848846,
"epoch": 0.8513881328252586,
"grad_norm": 0.24180692434310913,
"learning_rate": 3.072066708430862e-05,
"loss": 0.3203,
"mean_token_accuracy": 0.9024082869291306,
"num_tokens": 1585340.0,
"step": 391
},
{
"entropy": 0.31679805368185043,
"epoch": 0.8535655960805661,
"grad_norm": 0.23670694231987,
"learning_rate": 3.042256268787063e-05,
"loss": 0.2891,
"mean_token_accuracy": 0.9171215295791626,
"num_tokens": 1589570.0,
"step": 392
},
{
"entropy": 0.316896952688694,
"epoch": 0.8557430593358737,
"grad_norm": 0.26047396659851074,
"learning_rate": 3.0128406664505215e-05,
"loss": 0.3237,
"mean_token_accuracy": 0.9058733284473419,
"num_tokens": 1593421.0,
"step": 393
},
{
"entropy": 0.3199731484055519,
"epoch": 0.8579205225911812,
"grad_norm": 0.2323935478925705,
"learning_rate": 2.9838213609275546e-05,
"loss": 0.3018,
"mean_token_accuracy": 0.9120573252439499,
"num_tokens": 1597598.0,
"step": 394
},
{
"entropy": 0.29843273013830185,
"epoch": 0.8600979858464889,
"grad_norm": 0.2387438267469406,
"learning_rate": 2.9551997920615187e-05,
"loss": 0.2862,
"mean_token_accuracy": 0.9175356030464172,
"num_tokens": 1601591.0,
"step": 395
},
{
"entropy": 0.31333109736442566,
"epoch": 0.8622754491017964,
"grad_norm": 0.23580299317836761,
"learning_rate": 2.926977379961374e-05,
"loss": 0.3098,
"mean_token_accuracy": 0.911782830953598,
"num_tokens": 1606156.0,
"step": 396
},
{
"entropy": 0.32873860746622086,
"epoch": 0.864452912357104,
"grad_norm": 0.23804928362369537,
"learning_rate": 2.899155524931224e-05,
"loss": 0.3171,
"mean_token_accuracy": 0.9060818552970886,
"num_tokens": 1610215.0,
"step": 397
},
{
"entropy": 0.331471748650074,
"epoch": 0.8666303756124115,
"grad_norm": 0.22940973937511444,
"learning_rate": 2.8717356074008345e-05,
"loss": 0.3201,
"mean_token_accuracy": 0.905473530292511,
"num_tokens": 1614427.0,
"step": 398
},
{
"entropy": 0.33943046629428864,
"epoch": 0.8688078388677191,
"grad_norm": 0.24828903377056122,
"learning_rate": 2.844718987857145e-05,
"loss": 0.3408,
"mean_token_accuracy": 0.8990557938814163,
"num_tokens": 1618891.0,
"step": 399
},
{
"entropy": 0.33763300627470016,
"epoch": 0.8709853021230267,
"grad_norm": 0.25826534628868103,
"learning_rate": 2.818107006776761e-05,
"loss": 0.3195,
"mean_token_accuracy": 0.9027258008718491,
"num_tokens": 1622659.0,
"step": 400
},
{
"entropy": 0.29499682784080505,
"epoch": 0.8731627653783343,
"grad_norm": 0.22961440682411194,
"learning_rate": 2.7919009845594502e-05,
"loss": 0.2923,
"mean_token_accuracy": 0.9152926355600357,
"num_tokens": 1626858.0,
"step": 401
},
{
"entropy": 0.3353520557284355,
"epoch": 0.8753402286336418,
"grad_norm": 0.25194504857063293,
"learning_rate": 2.7661022214626153e-05,
"loss": 0.3207,
"mean_token_accuracy": 0.9085413068532944,
"num_tokens": 1630448.0,
"step": 402
},
{
"entropy": 0.29210612177848816,
"epoch": 0.8775176918889493,
"grad_norm": 0.2511427402496338,
"learning_rate": 2.7407119975368006e-05,
"loss": 0.2815,
"mean_token_accuracy": 0.9171009808778763,
"num_tokens": 1634411.0,
"step": 403
},
{
"entropy": 0.35340818017721176,
"epoch": 0.8796951551442569,
"grad_norm": 0.24676676094532013,
"learning_rate": 2.7157315725621612e-05,
"loss": 0.3692,
"mean_token_accuracy": 0.905316099524498,
"num_tokens": 1638404.0,
"step": 404
},
{
"entropy": 0.3412262871861458,
"epoch": 0.8818726183995645,
"grad_norm": 0.27478235960006714,
"learning_rate": 2.6911621859859658e-05,
"loss": 0.3472,
"mean_token_accuracy": 0.90118607878685,
"num_tokens": 1642162.0,
"step": 405
},
{
"entropy": 0.33481264114379883,
"epoch": 0.8840500816548721,
"grad_norm": 0.2933956980705261,
"learning_rate": 2.6670050568610972e-05,
"loss": 0.3248,
"mean_token_accuracy": 0.9072499722242355,
"num_tokens": 1646171.0,
"step": 406
},
{
"entropy": 0.3591442406177521,
"epoch": 0.8862275449101796,
"grad_norm": 0.21709908545017242,
"learning_rate": 2.6432613837855658e-05,
"loss": 0.3407,
"mean_token_accuracy": 0.9071426689624786,
"num_tokens": 1650504.0,
"step": 407
},
{
"entropy": 0.32970624417066574,
"epoch": 0.8884050081654872,
"grad_norm": 0.23687736690044403,
"learning_rate": 2.6199323448430458e-05,
"loss": 0.3135,
"mean_token_accuracy": 0.903979942202568,
"num_tokens": 1654507.0,
"step": 408
},
{
"entropy": 0.3415728807449341,
"epoch": 0.8905824714207947,
"grad_norm": 0.2553468644618988,
"learning_rate": 2.597019097544409e-05,
"loss": 0.3039,
"mean_token_accuracy": 0.9025170505046844,
"num_tokens": 1658421.0,
"step": 409
},
{
"entropy": 0.29549212008714676,
"epoch": 0.8927599346761024,
"grad_norm": 0.21464505791664124,
"learning_rate": 2.574522778770308e-05,
"loss": 0.2634,
"mean_token_accuracy": 0.9200884401798248,
"num_tokens": 1662809.0,
"step": 410
},
{
"entropy": 0.3326757438480854,
"epoch": 0.8949373979314099,
"grad_norm": 0.23331218957901,
"learning_rate": 2.5524445047147567e-05,
"loss": 0.319,
"mean_token_accuracy": 0.900556892156601,
"num_tokens": 1667221.0,
"step": 411
},
{
"entropy": 0.31935854256153107,
"epoch": 0.8971148611867175,
"grad_norm": 0.23457299172878265,
"learning_rate": 2.5307853708297523e-05,
"loss": 0.3045,
"mean_token_accuracy": 0.9045213311910629,
"num_tokens": 1671381.0,
"step": 412
},
{
"entropy": 0.3340509235858917,
"epoch": 0.899292324442025,
"grad_norm": 0.23886168003082275,
"learning_rate": 2.5095464517709277e-05,
"loss": 0.3264,
"mean_token_accuracy": 0.899304986000061,
"num_tokens": 1675656.0,
"step": 413
},
{
"entropy": 0.3181797042489052,
"epoch": 0.9014697876973327,
"grad_norm": 0.24742458760738373,
"learning_rate": 2.4887288013442218e-05,
"loss": 0.2988,
"mean_token_accuracy": 0.9066351801156998,
"num_tokens": 1679259.0,
"step": 414
},
{
"entropy": 0.3163676857948303,
"epoch": 0.9036472509526402,
"grad_norm": 0.25340980291366577,
"learning_rate": 2.468333452453597e-05,
"loss": 0.2979,
"mean_token_accuracy": 0.9118978530168533,
"num_tokens": 1683245.0,
"step": 415
},
{
"entropy": 0.30397678166627884,
"epoch": 0.9058247142079477,
"grad_norm": 0.2358277142047882,
"learning_rate": 2.4483614170497916e-05,
"loss": 0.2955,
"mean_token_accuracy": 0.9145314395427704,
"num_tokens": 1687531.0,
"step": 416
},
{
"entropy": 0.34245041757822037,
"epoch": 0.9080021774632553,
"grad_norm": 0.23215466737747192,
"learning_rate": 2.4288136860801048e-05,
"loss": 0.326,
"mean_token_accuracy": 0.9006476998329163,
"num_tokens": 1692172.0,
"step": 417
},
{
"entropy": 0.3470025435090065,
"epoch": 0.9101796407185628,
"grad_norm": 0.26786699891090393,
"learning_rate": 2.409691229439239e-05,
"loss": 0.3668,
"mean_token_accuracy": 0.8918263465166092,
"num_tokens": 1696141.0,
"step": 418
},
{
"entropy": 0.30502913892269135,
"epoch": 0.9123571039738705,
"grad_norm": 0.23780497908592224,
"learning_rate": 2.3909949959211657e-05,
"loss": 0.2906,
"mean_token_accuracy": 0.9070711433887482,
"num_tokens": 1700408.0,
"step": 419
},
{
"entropy": 0.3096166178584099,
"epoch": 0.914534567229178,
"grad_norm": 0.21969804167747498,
"learning_rate": 2.372725913172055e-05,
"loss": 0.32,
"mean_token_accuracy": 0.9115228056907654,
"num_tokens": 1704797.0,
"step": 420
},
{
"entropy": 0.30217302590608597,
"epoch": 0.9167120304844856,
"grad_norm": 0.23517285287380219,
"learning_rate": 2.3548848876442465e-05,
"loss": 0.2789,
"mean_token_accuracy": 0.9120800346136093,
"num_tokens": 1708762.0,
"step": 421
},
{
"entropy": 0.27675122022628784,
"epoch": 0.9188894937397931,
"grad_norm": 0.2593907415866852,
"learning_rate": 2.337472804551281e-05,
"loss": 0.2552,
"mean_token_accuracy": 0.9166678935289383,
"num_tokens": 1712763.0,
"step": 422
},
{
"entropy": 0.31945841014385223,
"epoch": 0.9210669569951007,
"grad_norm": 0.22665663063526154,
"learning_rate": 2.320490527823968e-05,
"loss": 0.322,
"mean_token_accuracy": 0.9008611887693405,
"num_tokens": 1717586.0,
"step": 423
},
{
"entropy": 0.28783877938985825,
"epoch": 0.9232444202504083,
"grad_norm": 0.2106105536222458,
"learning_rate": 2.303938900067531e-05,
"loss": 0.2571,
"mean_token_accuracy": 0.9197226613759995,
"num_tokens": 1722046.0,
"step": 424
},
{
"entropy": 0.31481262296438217,
"epoch": 0.9254218835057159,
"grad_norm": 0.24338746070861816,
"learning_rate": 2.2878187425197893e-05,
"loss": 0.3072,
"mean_token_accuracy": 0.9047886729240417,
"num_tokens": 1726207.0,
"step": 425
},
{
"entropy": 0.35147786885499954,
"epoch": 0.9275993467610234,
"grad_norm": 0.2515200078487396,
"learning_rate": 2.272130855010421e-05,
"loss": 0.3496,
"mean_token_accuracy": 0.8965179175138474,
"num_tokens": 1730155.0,
"step": 426
},
{
"entropy": 0.36182061582803726,
"epoch": 0.929776810016331,
"grad_norm": 0.2628372609615326,
"learning_rate": 2.2568760159212745e-05,
"loss": 0.3187,
"mean_token_accuracy": 0.9001797884702682,
"num_tokens": 1733927.0,
"step": 427
},
{
"entropy": 0.32963769882917404,
"epoch": 0.9319542732716385,
"grad_norm": 0.26346680521965027,
"learning_rate": 2.2420549821477435e-05,
"loss": 0.311,
"mean_token_accuracy": 0.9040227830410004,
"num_tokens": 1737774.0,
"step": 428
},
{
"entropy": 0.37061919271945953,
"epoch": 0.9341317365269461,
"grad_norm": 0.2579784691333771,
"learning_rate": 2.227668489061219e-05,
"loss": 0.3676,
"mean_token_accuracy": 0.8960554301738739,
"num_tokens": 1741942.0,
"step": 429
},
{
"entropy": 0.3078198730945587,
"epoch": 0.9363091997822537,
"grad_norm": 0.24415822327136993,
"learning_rate": 2.2137172504725956e-05,
"loss": 0.2881,
"mean_token_accuracy": 0.912653386592865,
"num_tokens": 1745914.0,
"step": 430
},
{
"entropy": 0.3296479806303978,
"epoch": 0.9384866630375612,
"grad_norm": 0.25575825572013855,
"learning_rate": 2.2002019585968637e-05,
"loss": 0.3096,
"mean_token_accuracy": 0.9089950323104858,
"num_tokens": 1749929.0,
"step": 431
},
{
"entropy": 0.3241398259997368,
"epoch": 0.9406641262928688,
"grad_norm": 0.2516978085041046,
"learning_rate": 2.187123284018753e-05,
"loss": 0.3186,
"mean_token_accuracy": 0.9034547358751297,
"num_tokens": 1753992.0,
"step": 432
},
{
"entropy": 0.3980755880475044,
"epoch": 0.9428415895481764,
"grad_norm": 0.24856629967689514,
"learning_rate": 2.174481875659472e-05,
"loss": 0.3749,
"mean_token_accuracy": 0.8908516466617584,
"num_tokens": 1758062.0,
"step": 433
},
{
"entropy": 0.3143734037876129,
"epoch": 0.945019052803484,
"grad_norm": 0.25844618678092957,
"learning_rate": 2.1622783607444988e-05,
"loss": 0.2784,
"mean_token_accuracy": 0.922119140625,
"num_tokens": 1761689.0,
"step": 434
},
{
"entropy": 0.3378266841173172,
"epoch": 0.9471965160587915,
"grad_norm": 0.24213889241218567,
"learning_rate": 2.150513344772469e-05,
"loss": 0.3155,
"mean_token_accuracy": 0.9061428606510162,
"num_tokens": 1766010.0,
"step": 435
},
{
"entropy": 0.35330820083618164,
"epoch": 0.9493739793140991,
"grad_norm": 0.2620498836040497,
"learning_rate": 2.1391874114851294e-05,
"loss": 0.3583,
"mean_token_accuracy": 0.9004585295915604,
"num_tokens": 1769801.0,
"step": 436
},
{
"entropy": 0.2881145551800728,
"epoch": 0.9515514425694066,
"grad_norm": 0.24421681463718414,
"learning_rate": 2.128301122838377e-05,
"loss": 0.3026,
"mean_token_accuracy": 0.9104648381471634,
"num_tokens": 1774342.0,
"step": 437
},
{
"entropy": 0.3526333123445511,
"epoch": 0.9537289058247143,
"grad_norm": 0.2302054911851883,
"learning_rate": 2.117855018974369e-05,
"loss": 0.3199,
"mean_token_accuracy": 0.9067949205636978,
"num_tokens": 1778412.0,
"step": 438
},
{
"entropy": 0.32418397441506386,
"epoch": 0.9559063690800218,
"grad_norm": 0.21741004288196564,
"learning_rate": 2.107849618194735e-05,
"loss": 0.3114,
"mean_token_accuracy": 0.9031261652708054,
"num_tokens": 1782995.0,
"step": 439
},
{
"entropy": 0.30877869576215744,
"epoch": 0.9580838323353293,
"grad_norm": 0.23063865303993225,
"learning_rate": 2.0982854169348503e-05,
"loss": 0.2949,
"mean_token_accuracy": 0.9094719737768173,
"num_tokens": 1787537.0,
"step": 440
},
{
"entropy": 0.3279525935649872,
"epoch": 0.9602612955906369,
"grad_norm": 0.2691234350204468,
"learning_rate": 2.0891628897392087e-05,
"loss": 0.345,
"mean_token_accuracy": 0.8982786238193512,
"num_tokens": 1791355.0,
"step": 441
},
{
"entropy": 0.3470368981361389,
"epoch": 0.9624387588459444,
"grad_norm": 0.26819464564323425,
"learning_rate": 2.0804824892378765e-05,
"loss": 0.3414,
"mean_token_accuracy": 0.9030001610517502,
"num_tokens": 1795467.0,
"step": 442
},
{
"entropy": 0.3493390902876854,
"epoch": 0.9646162221012521,
"grad_norm": 0.23444399237632751,
"learning_rate": 2.0722446461240352e-05,
"loss": 0.3442,
"mean_token_accuracy": 0.8999157398939133,
"num_tokens": 1800109.0,
"step": 443
},
{
"entropy": 0.3092813342809677,
"epoch": 0.9667936853565596,
"grad_norm": 0.23800377547740936,
"learning_rate": 2.0644497691326106e-05,
"loss": 0.2999,
"mean_token_accuracy": 0.9111448973417282,
"num_tokens": 1804018.0,
"step": 444
},
{
"entropy": 0.29666490107774734,
"epoch": 0.9689711486118672,
"grad_norm": 0.22874487936496735,
"learning_rate": 2.0570982450199913e-05,
"loss": 0.2858,
"mean_token_accuracy": 0.9175421446561813,
"num_tokens": 1808059.0,
"step": 445
},
{
"entropy": 0.3779358044266701,
"epoch": 0.9711486118671747,
"grad_norm": 0.2360084503889084,
"learning_rate": 2.0501904385448447e-05,
"loss": 0.3668,
"mean_token_accuracy": 0.9037110358476639,
"num_tokens": 1812165.0,
"step": 446
},
{
"entropy": 0.3430086299777031,
"epoch": 0.9733260751224823,
"grad_norm": 0.2596234679222107,
"learning_rate": 2.043726692450014e-05,
"loss": 0.3233,
"mean_token_accuracy": 0.9003089815378189,
"num_tokens": 1815708.0,
"step": 447
},
{
"entropy": 0.3329969719052315,
"epoch": 0.9755035383777899,
"grad_norm": 0.25411558151245117,
"learning_rate": 2.037707327445511e-05,
"loss": 0.3299,
"mean_token_accuracy": 0.9008579254150391,
"num_tokens": 1819635.0,
"step": 448
},
{
"entropy": 0.3378527835011482,
"epoch": 0.9776810016330975,
"grad_norm": 0.2512282431125641,
"learning_rate": 2.0321326421926097e-05,
"loss": 0.3325,
"mean_token_accuracy": 0.9022142142057419,
"num_tokens": 1823694.0,
"step": 449
},
{
"entropy": 0.34048717468976974,
"epoch": 0.979858464888405,
"grad_norm": 0.24113033711910248,
"learning_rate": 2.0270029132890223e-05,
"loss": 0.344,
"mean_token_accuracy": 0.9008767306804657,
"num_tokens": 1827735.0,
"step": 450
},
{
"entropy": 0.3050212487578392,
"epoch": 0.9820359281437125,
"grad_norm": 0.21851961314678192,
"learning_rate": 2.0223183952551785e-05,
"loss": 0.2795,
"mean_token_accuracy": 0.917202040553093,
"num_tokens": 1831884.0,
"step": 451
},
{
"entropy": 0.3319382965564728,
"epoch": 0.9842133913990201,
"grad_norm": 0.24525989592075348,
"learning_rate": 2.018079320521593e-05,
"loss": 0.3079,
"mean_token_accuracy": 0.9144886583089828,
"num_tokens": 1835507.0,
"step": 452
},
{
"entropy": 0.34535887837409973,
"epoch": 0.9863908546543277,
"grad_norm": 0.2506140172481537,
"learning_rate": 2.0142858994173404e-05,
"loss": 0.3436,
"mean_token_accuracy": 0.9002240151166916,
"num_tokens": 1839606.0,
"step": 453
},
{
"entropy": 0.3276618719100952,
"epoch": 0.9885683179096353,
"grad_norm": 0.2481948435306549,
"learning_rate": 2.0109383201596102e-05,
"loss": 0.3105,
"mean_token_accuracy": 0.9108982384204865,
"num_tokens": 1843500.0,
"step": 454
},
{
"entropy": 0.3270680084824562,
"epoch": 0.9907457811649428,
"grad_norm": 0.2625768780708313,
"learning_rate": 2.0080367488443743e-05,
"loss": 0.328,
"mean_token_accuracy": 0.9026461988687515,
"num_tokens": 1847739.0,
"step": 455
},
{
"entropy": 0.34614715725183487,
"epoch": 0.9929232444202504,
"grad_norm": 0.2605029046535492,
"learning_rate": 2.0055813294381443e-05,
"loss": 0.3467,
"mean_token_accuracy": 0.9046141803264618,
"num_tokens": 1851928.0,
"step": 456
},
{
"entropy": 0.284773550927639,
"epoch": 0.995100707675558,
"grad_norm": 0.22357277572155,
"learning_rate": 2.00357218377083e-05,
"loss": 0.2689,
"mean_token_accuracy": 0.9219858795404434,
"num_tokens": 1856283.0,
"step": 457
},
{
"entropy": 0.356322281062603,
"epoch": 0.9972781709308656,
"grad_norm": 0.23450958728790283,
"learning_rate": 2.0020094115296876e-05,
"loss": 0.3562,
"mean_token_accuracy": 0.9017274230718613,
"num_tokens": 1861007.0,
"step": 458
},
{
"entropy": 0.2814597636461258,
"epoch": 0.9994556341861731,
"grad_norm": 0.2359769642353058,
"learning_rate": 2.0008930902543854e-05,
"loss": 0.2653,
"mean_token_accuracy": 0.9179674088954926,
"num_tokens": 1865010.0,
"step": 459
},
{
"entropy": 0.42821022868156433,
"epoch": 1.0,
"grad_norm": 0.8799027800559998,
"learning_rate": 2.0002232753331453e-05,
"loss": 0.4353,
"mean_token_accuracy": 0.8921568393707275,
"num_tokens": 1865318.0,
"step": 460
}
],
"logging_steps": 1,
"max_steps": 460,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0112748518481592e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}