qwen3-0.6b-sft-capybara / trainer_state.json
TheFloatingString's picture
Upload folder using huggingface_hub
965bd6d verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 5928,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.3182146310806275,
"epoch": 0.005060728744939271,
"grad_norm": 0.36533281207084656,
"learning_rate": 1.9969635627530365e-05,
"loss": 2.0792,
"mean_token_accuracy": 0.5860633730888367,
"num_tokens": 59233.0,
"step": 10
},
{
"entropy": 1.2874402403831482,
"epoch": 0.010121457489878543,
"grad_norm": 0.47160375118255615,
"learning_rate": 1.9935897435897437e-05,
"loss": 2.0417,
"mean_token_accuracy": 0.5939164876937866,
"num_tokens": 114581.0,
"step": 20
},
{
"entropy": 1.1730836629867554,
"epoch": 0.015182186234817813,
"grad_norm": 0.3855545222759247,
"learning_rate": 1.990215924426451e-05,
"loss": 1.8448,
"mean_token_accuracy": 0.6225896775722504,
"num_tokens": 170799.0,
"step": 30
},
{
"entropy": 1.328663158416748,
"epoch": 0.020242914979757085,
"grad_norm": 0.3587476313114166,
"learning_rate": 1.986842105263158e-05,
"loss": 2.0267,
"mean_token_accuracy": 0.5886577606201172,
"num_tokens": 224687.0,
"step": 40
},
{
"entropy": 1.3244597673416139,
"epoch": 0.025303643724696356,
"grad_norm": 0.4023756980895996,
"learning_rate": 1.9834682860998653e-05,
"loss": 1.9579,
"mean_token_accuracy": 0.5918201506137848,
"num_tokens": 282567.0,
"step": 50
},
{
"entropy": 1.3242129743099214,
"epoch": 0.030364372469635626,
"grad_norm": 0.4889814257621765,
"learning_rate": 1.9800944669365722e-05,
"loss": 1.9161,
"mean_token_accuracy": 0.6058241128921509,
"num_tokens": 336705.0,
"step": 60
},
{
"entropy": 1.353874671459198,
"epoch": 0.0354251012145749,
"grad_norm": 0.43763861060142517,
"learning_rate": 1.9767206477732795e-05,
"loss": 1.8797,
"mean_token_accuracy": 0.6002139091491699,
"num_tokens": 395328.0,
"step": 70
},
{
"entropy": 1.2600490927696228,
"epoch": 0.04048582995951417,
"grad_norm": 0.6143773198127747,
"learning_rate": 1.9733468286099865e-05,
"loss": 1.7122,
"mean_token_accuracy": 0.6283527314662933,
"num_tokens": 448766.0,
"step": 80
},
{
"entropy": 1.318302822113037,
"epoch": 0.04554655870445344,
"grad_norm": 0.37836670875549316,
"learning_rate": 1.9699730094466938e-05,
"loss": 1.7054,
"mean_token_accuracy": 0.6172383666038513,
"num_tokens": 502153.0,
"step": 90
},
{
"entropy": 1.395500862598419,
"epoch": 0.05060728744939271,
"grad_norm": 0.42456531524658203,
"learning_rate": 1.966599190283401e-05,
"loss": 1.7468,
"mean_token_accuracy": 0.6102555096149445,
"num_tokens": 559789.0,
"step": 100
},
{
"entropy": 1.2470922768115997,
"epoch": 0.05566801619433198,
"grad_norm": 0.3632870316505432,
"learning_rate": 1.963225371120108e-05,
"loss": 1.5353,
"mean_token_accuracy": 0.6436724066734314,
"num_tokens": 620090.0,
"step": 110
},
{
"entropy": 1.4007501482963562,
"epoch": 0.06072874493927125,
"grad_norm": 0.36180490255355835,
"learning_rate": 1.9598515519568153e-05,
"loss": 1.7042,
"mean_token_accuracy": 0.6169813573360443,
"num_tokens": 674400.0,
"step": 120
},
{
"entropy": 1.509167194366455,
"epoch": 0.06578947368421052,
"grad_norm": 0.33648917078971863,
"learning_rate": 1.9564777327935226e-05,
"loss": 1.8149,
"mean_token_accuracy": 0.6050768792629242,
"num_tokens": 732008.0,
"step": 130
},
{
"entropy": 1.406454861164093,
"epoch": 0.0708502024291498,
"grad_norm": 0.2894444167613983,
"learning_rate": 1.9531039136302295e-05,
"loss": 1.6417,
"mean_token_accuracy": 0.6224378228187561,
"num_tokens": 790088.0,
"step": 140
},
{
"entropy": 1.5152673125267029,
"epoch": 0.07591093117408906,
"grad_norm": 0.28545448184013367,
"learning_rate": 1.949730094466937e-05,
"loss": 1.7609,
"mean_token_accuracy": 0.6055684983730316,
"num_tokens": 846716.0,
"step": 150
},
{
"entropy": 1.3967717170715332,
"epoch": 0.08097165991902834,
"grad_norm": 0.26414811611175537,
"learning_rate": 1.9463562753036438e-05,
"loss": 1.5725,
"mean_token_accuracy": 0.6291777551174164,
"num_tokens": 909154.0,
"step": 160
},
{
"entropy": 1.5347481608390807,
"epoch": 0.0860323886639676,
"grad_norm": 0.27282679080963135,
"learning_rate": 1.942982456140351e-05,
"loss": 1.7069,
"mean_token_accuracy": 0.6087481796741485,
"num_tokens": 967058.0,
"step": 170
},
{
"entropy": 1.4826232314109802,
"epoch": 0.09109311740890688,
"grad_norm": 0.23245945572853088,
"learning_rate": 1.939608636977058e-05,
"loss": 1.636,
"mean_token_accuracy": 0.6207191824913025,
"num_tokens": 1022407.0,
"step": 180
},
{
"entropy": 1.5127532601356506,
"epoch": 0.09615384615384616,
"grad_norm": 0.2711787223815918,
"learning_rate": 1.9362348178137653e-05,
"loss": 1.6767,
"mean_token_accuracy": 0.615806394815445,
"num_tokens": 1079738.0,
"step": 190
},
{
"entropy": 1.6158159017562865,
"epoch": 0.10121457489878542,
"grad_norm": 0.29755550622940063,
"learning_rate": 1.9328609986504726e-05,
"loss": 1.7642,
"mean_token_accuracy": 0.6007438480854035,
"num_tokens": 1140680.0,
"step": 200
},
{
"entropy": 1.4902734279632568,
"epoch": 0.1062753036437247,
"grad_norm": 0.24520562589168549,
"learning_rate": 1.9294871794871796e-05,
"loss": 1.5893,
"mean_token_accuracy": 0.6293672084808349,
"num_tokens": 1194492.0,
"step": 210
},
{
"entropy": 1.6347212672233582,
"epoch": 0.11133603238866396,
"grad_norm": 0.3082791566848755,
"learning_rate": 1.926113360323887e-05,
"loss": 1.7482,
"mean_token_accuracy": 0.602141198515892,
"num_tokens": 1252053.0,
"step": 220
},
{
"entropy": 1.5750308752059936,
"epoch": 0.11639676113360324,
"grad_norm": 0.23394237458705902,
"learning_rate": 1.922739541160594e-05,
"loss": 1.6651,
"mean_token_accuracy": 0.6140229105949402,
"num_tokens": 1308749.0,
"step": 230
},
{
"entropy": 1.5293641209602356,
"epoch": 0.1214574898785425,
"grad_norm": 0.22243493795394897,
"learning_rate": 1.919365721997301e-05,
"loss": 1.5962,
"mean_token_accuracy": 0.6277494192123413,
"num_tokens": 1371806.0,
"step": 240
},
{
"entropy": 1.5892576217651366,
"epoch": 0.12651821862348178,
"grad_norm": 0.23461221158504486,
"learning_rate": 1.915991902834008e-05,
"loss": 1.6669,
"mean_token_accuracy": 0.6198325097560883,
"num_tokens": 1427210.0,
"step": 250
},
{
"entropy": 1.6534079551696776,
"epoch": 0.13157894736842105,
"grad_norm": 0.2797304391860962,
"learning_rate": 1.9126180836707153e-05,
"loss": 1.7432,
"mean_token_accuracy": 0.6030194580554962,
"num_tokens": 1485664.0,
"step": 260
},
{
"entropy": 1.6070345997810365,
"epoch": 0.13663967611336034,
"grad_norm": 0.22065305709838867,
"learning_rate": 1.9092442645074226e-05,
"loss": 1.677,
"mean_token_accuracy": 0.6108350694179535,
"num_tokens": 1544169.0,
"step": 270
},
{
"entropy": 1.658397912979126,
"epoch": 0.1417004048582996,
"grad_norm": 0.17878006398677826,
"learning_rate": 1.9058704453441296e-05,
"loss": 1.7484,
"mean_token_accuracy": 0.6061225473880768,
"num_tokens": 1607852.0,
"step": 280
},
{
"entropy": 1.5805446743965148,
"epoch": 0.14676113360323886,
"grad_norm": 0.20498958230018616,
"learning_rate": 1.902496626180837e-05,
"loss": 1.6358,
"mean_token_accuracy": 0.6217520833015442,
"num_tokens": 1667280.0,
"step": 290
},
{
"entropy": 1.553944504261017,
"epoch": 0.15182186234817813,
"grad_norm": 0.2072789967060089,
"learning_rate": 1.899122807017544e-05,
"loss": 1.6016,
"mean_token_accuracy": 0.6277549624443054,
"num_tokens": 1722987.0,
"step": 300
},
{
"entropy": 1.6091766953468323,
"epoch": 0.15688259109311742,
"grad_norm": 0.25766435265541077,
"learning_rate": 1.895748987854251e-05,
"loss": 1.6603,
"mean_token_accuracy": 0.6145843267440796,
"num_tokens": 1777611.0,
"step": 310
},
{
"entropy": 1.4922061681747436,
"epoch": 0.16194331983805668,
"grad_norm": 0.23709791898727417,
"learning_rate": 1.8923751686909584e-05,
"loss": 1.5237,
"mean_token_accuracy": 0.638946932554245,
"num_tokens": 1833335.0,
"step": 320
},
{
"entropy": 1.5376826167106628,
"epoch": 0.16700404858299595,
"grad_norm": 0.24256624281406403,
"learning_rate": 1.8890013495276657e-05,
"loss": 1.5813,
"mean_token_accuracy": 0.625263386964798,
"num_tokens": 1883303.0,
"step": 330
},
{
"entropy": 1.5892139554023743,
"epoch": 0.1720647773279352,
"grad_norm": 0.20020949840545654,
"learning_rate": 1.8856275303643726e-05,
"loss": 1.6522,
"mean_token_accuracy": 0.6241094172000885,
"num_tokens": 1937007.0,
"step": 340
},
{
"entropy": 1.5604022860527038,
"epoch": 0.1771255060728745,
"grad_norm": 0.2134305238723755,
"learning_rate": 1.8822537112010796e-05,
"loss": 1.6036,
"mean_token_accuracy": 0.617218679189682,
"num_tokens": 1996601.0,
"step": 350
},
{
"entropy": 1.6344910740852356,
"epoch": 0.18218623481781376,
"grad_norm": 0.2528083622455597,
"learning_rate": 1.878879892037787e-05,
"loss": 1.6885,
"mean_token_accuracy": 0.6135373294353486,
"num_tokens": 2051920.0,
"step": 360
},
{
"entropy": 1.6087428450584411,
"epoch": 0.18724696356275303,
"grad_norm": 0.3239048421382904,
"learning_rate": 1.8755060728744942e-05,
"loss": 1.687,
"mean_token_accuracy": 0.6124200880527496,
"num_tokens": 2108082.0,
"step": 370
},
{
"entropy": 1.5165512919425965,
"epoch": 0.19230769230769232,
"grad_norm": 0.21001844108104706,
"learning_rate": 1.872132253711201e-05,
"loss": 1.5231,
"mean_token_accuracy": 0.6357404530048371,
"num_tokens": 2164317.0,
"step": 380
},
{
"entropy": 1.4784175634384156,
"epoch": 0.19736842105263158,
"grad_norm": 0.21521133184432983,
"learning_rate": 1.8687584345479084e-05,
"loss": 1.5055,
"mean_token_accuracy": 0.6435703456401825,
"num_tokens": 2222720.0,
"step": 390
},
{
"entropy": 1.6346607565879823,
"epoch": 0.20242914979757085,
"grad_norm": 0.24888823926448822,
"learning_rate": 1.8653846153846157e-05,
"loss": 1.6701,
"mean_token_accuracy": 0.6212630808353424,
"num_tokens": 2284597.0,
"step": 400
},
{
"entropy": 1.6906925320625306,
"epoch": 0.2074898785425101,
"grad_norm": 0.21836967766284943,
"learning_rate": 1.8620107962213227e-05,
"loss": 1.7238,
"mean_token_accuracy": 0.606383764743805,
"num_tokens": 2340566.0,
"step": 410
},
{
"entropy": 1.4906925320625306,
"epoch": 0.2125506072874494,
"grad_norm": 0.21778637170791626,
"learning_rate": 1.85863697705803e-05,
"loss": 1.5135,
"mean_token_accuracy": 0.6399281203746796,
"num_tokens": 2397861.0,
"step": 420
},
{
"entropy": 1.6318996787071227,
"epoch": 0.21761133603238866,
"grad_norm": 0.2725844979286194,
"learning_rate": 1.8552631578947373e-05,
"loss": 1.707,
"mean_token_accuracy": 0.6183918356895447,
"num_tokens": 2453175.0,
"step": 430
},
{
"entropy": 1.530117428302765,
"epoch": 0.22267206477732793,
"grad_norm": 0.20461727678775787,
"learning_rate": 1.8518893387314442e-05,
"loss": 1.5423,
"mean_token_accuracy": 0.6370847761631012,
"num_tokens": 2511372.0,
"step": 440
},
{
"entropy": 1.5733375310897828,
"epoch": 0.22773279352226722,
"grad_norm": 0.2394452542066574,
"learning_rate": 1.848515519568151e-05,
"loss": 1.5909,
"mean_token_accuracy": 0.6229382216930389,
"num_tokens": 2571400.0,
"step": 450
},
{
"entropy": 1.4420257091522217,
"epoch": 0.23279352226720648,
"grad_norm": 0.23069949448108673,
"learning_rate": 1.8451417004048584e-05,
"loss": 1.4763,
"mean_token_accuracy": 0.6464443206787109,
"num_tokens": 2630220.0,
"step": 460
},
{
"entropy": 1.5972508192062378,
"epoch": 0.23785425101214575,
"grad_norm": 0.22586746513843536,
"learning_rate": 1.8417678812415657e-05,
"loss": 1.6251,
"mean_token_accuracy": 0.6198283314704895,
"num_tokens": 2688923.0,
"step": 470
},
{
"entropy": 1.4670196294784545,
"epoch": 0.242914979757085,
"grad_norm": 0.23567302525043488,
"learning_rate": 1.8383940620782727e-05,
"loss": 1.4654,
"mean_token_accuracy": 0.6410723388195038,
"num_tokens": 2745340.0,
"step": 480
},
{
"entropy": 1.5074788808822632,
"epoch": 0.2479757085020243,
"grad_norm": 0.2870822548866272,
"learning_rate": 1.83502024291498e-05,
"loss": 1.532,
"mean_token_accuracy": 0.6343021392822266,
"num_tokens": 2796794.0,
"step": 490
},
{
"entropy": 1.5003413558006287,
"epoch": 0.25303643724696356,
"grad_norm": 0.19105228781700134,
"learning_rate": 1.8316464237516873e-05,
"loss": 1.5132,
"mean_token_accuracy": 0.643144553899765,
"num_tokens": 2853606.0,
"step": 500
},
{
"entropy": 1.4724809288978578,
"epoch": 0.25809716599190285,
"grad_norm": 0.2321540117263794,
"learning_rate": 1.8282726045883942e-05,
"loss": 1.499,
"mean_token_accuracy": 0.6394071221351624,
"num_tokens": 2910686.0,
"step": 510
},
{
"entropy": 1.5466928601264953,
"epoch": 0.2631578947368421,
"grad_norm": 0.2588091790676117,
"learning_rate": 1.8248987854251015e-05,
"loss": 1.5745,
"mean_token_accuracy": 0.6278865933418274,
"num_tokens": 2969316.0,
"step": 520
},
{
"entropy": 1.4397364735603333,
"epoch": 0.2682186234817814,
"grad_norm": 0.22344444692134857,
"learning_rate": 1.8215249662618085e-05,
"loss": 1.4459,
"mean_token_accuracy": 0.6503956913948059,
"num_tokens": 3021973.0,
"step": 530
},
{
"entropy": 1.5749887704849244,
"epoch": 0.2732793522267207,
"grad_norm": 0.20665939152240753,
"learning_rate": 1.8181511470985158e-05,
"loss": 1.6023,
"mean_token_accuracy": 0.622279840707779,
"num_tokens": 3081510.0,
"step": 540
},
{
"entropy": 1.519134557247162,
"epoch": 0.2783400809716599,
"grad_norm": 0.20693500339984894,
"learning_rate": 1.8147773279352227e-05,
"loss": 1.5253,
"mean_token_accuracy": 0.6335927963256835,
"num_tokens": 3139438.0,
"step": 550
},
{
"entropy": 1.4850042462348938,
"epoch": 0.2834008097165992,
"grad_norm": 0.20233699679374695,
"learning_rate": 1.81140350877193e-05,
"loss": 1.5081,
"mean_token_accuracy": 0.6392342805862427,
"num_tokens": 3194184.0,
"step": 560
},
{
"entropy": 1.6362142205238341,
"epoch": 0.28846153846153844,
"grad_norm": 0.19187521934509277,
"learning_rate": 1.808029689608637e-05,
"loss": 1.6497,
"mean_token_accuracy": 0.6131653010845184,
"num_tokens": 3253449.0,
"step": 570
},
{
"entropy": 1.5829873204231262,
"epoch": 0.2935222672064777,
"grad_norm": 0.21769073605537415,
"learning_rate": 1.8046558704453442e-05,
"loss": 1.6063,
"mean_token_accuracy": 0.6185498893260956,
"num_tokens": 3309330.0,
"step": 580
},
{
"entropy": 1.5365434288978577,
"epoch": 0.298582995951417,
"grad_norm": 0.20103144645690918,
"learning_rate": 1.8012820512820515e-05,
"loss": 1.5559,
"mean_token_accuracy": 0.6354671478271484,
"num_tokens": 3368237.0,
"step": 590
},
{
"entropy": 1.5507975578308106,
"epoch": 0.30364372469635625,
"grad_norm": 0.20210447907447815,
"learning_rate": 1.7979082321187585e-05,
"loss": 1.5848,
"mean_token_accuracy": 0.6247013151645661,
"num_tokens": 3429760.0,
"step": 600
},
{
"entropy": 1.668788731098175,
"epoch": 0.30870445344129555,
"grad_norm": 0.23076701164245605,
"learning_rate": 1.7945344129554658e-05,
"loss": 1.7186,
"mean_token_accuracy": 0.6151426732540131,
"num_tokens": 3481424.0,
"step": 610
},
{
"entropy": 1.4967810273170472,
"epoch": 0.31376518218623484,
"grad_norm": 0.18658699095249176,
"learning_rate": 1.791160593792173e-05,
"loss": 1.5039,
"mean_token_accuracy": 0.6395800650119782,
"num_tokens": 3540974.0,
"step": 620
},
{
"entropy": 1.5561461091041564,
"epoch": 0.3188259109311741,
"grad_norm": 0.2277403026819229,
"learning_rate": 1.78778677462888e-05,
"loss": 1.5933,
"mean_token_accuracy": 0.6238772809505463,
"num_tokens": 3599247.0,
"step": 630
},
{
"entropy": 1.6327290773391723,
"epoch": 0.32388663967611336,
"grad_norm": 0.21525472402572632,
"learning_rate": 1.784412955465587e-05,
"loss": 1.6583,
"mean_token_accuracy": 0.6137534499168396,
"num_tokens": 3656797.0,
"step": 640
},
{
"entropy": 1.6783543229103088,
"epoch": 0.32894736842105265,
"grad_norm": 0.2178918868303299,
"learning_rate": 1.7810391363022943e-05,
"loss": 1.7236,
"mean_token_accuracy": 0.6091976821422577,
"num_tokens": 3712597.0,
"step": 650
},
{
"entropy": 1.458414077758789,
"epoch": 0.3340080971659919,
"grad_norm": 0.2724186182022095,
"learning_rate": 1.7776653171390016e-05,
"loss": 1.4712,
"mean_token_accuracy": 0.6499071300029755,
"num_tokens": 3769258.0,
"step": 660
},
{
"entropy": 1.4010087609291078,
"epoch": 0.3390688259109312,
"grad_norm": 0.24354684352874756,
"learning_rate": 1.7742914979757085e-05,
"loss": 1.4125,
"mean_token_accuracy": 0.6563641846179962,
"num_tokens": 3827461.0,
"step": 670
},
{
"entropy": 1.5870350360870362,
"epoch": 0.3441295546558704,
"grad_norm": 0.20323996245861053,
"learning_rate": 1.7709176788124158e-05,
"loss": 1.6215,
"mean_token_accuracy": 0.624784529209137,
"num_tokens": 3884189.0,
"step": 680
},
{
"entropy": 1.6935706734657288,
"epoch": 0.3491902834008097,
"grad_norm": 0.24285322427749634,
"learning_rate": 1.767543859649123e-05,
"loss": 1.7141,
"mean_token_accuracy": 0.6097041130065918,
"num_tokens": 3939148.0,
"step": 690
},
{
"entropy": 1.5216692566871644,
"epoch": 0.354251012145749,
"grad_norm": 0.24251361191272736,
"learning_rate": 1.76417004048583e-05,
"loss": 1.526,
"mean_token_accuracy": 0.6344065189361572,
"num_tokens": 3996059.0,
"step": 700
},
{
"entropy": 1.584353470802307,
"epoch": 0.35931174089068824,
"grad_norm": 0.22013038396835327,
"learning_rate": 1.7607962213225373e-05,
"loss": 1.5894,
"mean_token_accuracy": 0.6244750499725342,
"num_tokens": 4056179.0,
"step": 710
},
{
"entropy": 1.499224054813385,
"epoch": 0.3643724696356275,
"grad_norm": 0.22103145718574524,
"learning_rate": 1.7574224021592443e-05,
"loss": 1.5209,
"mean_token_accuracy": 0.6322570383548737,
"num_tokens": 4114329.0,
"step": 720
},
{
"entropy": 1.3952741980552674,
"epoch": 0.3694331983805668,
"grad_norm": 0.19164645671844482,
"learning_rate": 1.7540485829959516e-05,
"loss": 1.4095,
"mean_token_accuracy": 0.6568491697311402,
"num_tokens": 4167072.0,
"step": 730
},
{
"entropy": 1.484491491317749,
"epoch": 0.37449392712550605,
"grad_norm": 0.22778365015983582,
"learning_rate": 1.7506747638326585e-05,
"loss": 1.5054,
"mean_token_accuracy": 0.6413045108318329,
"num_tokens": 4225902.0,
"step": 740
},
{
"entropy": 1.5875544667243957,
"epoch": 0.37955465587044535,
"grad_norm": 0.22424441576004028,
"learning_rate": 1.7473009446693658e-05,
"loss": 1.6189,
"mean_token_accuracy": 0.6218379735946655,
"num_tokens": 4282716.0,
"step": 750
},
{
"entropy": 1.5914431929588317,
"epoch": 0.38461538461538464,
"grad_norm": 0.22598877549171448,
"learning_rate": 1.743927125506073e-05,
"loss": 1.629,
"mean_token_accuracy": 0.6168800354003906,
"num_tokens": 4341769.0,
"step": 760
},
{
"entropy": 1.588646697998047,
"epoch": 0.3896761133603239,
"grad_norm": 0.24020566046237946,
"learning_rate": 1.74055330634278e-05,
"loss": 1.5962,
"mean_token_accuracy": 0.6249550580978394,
"num_tokens": 4400298.0,
"step": 770
},
{
"entropy": 1.524513852596283,
"epoch": 0.39473684210526316,
"grad_norm": 0.19308218359947205,
"learning_rate": 1.7371794871794873e-05,
"loss": 1.5494,
"mean_token_accuracy": 0.6319825410842895,
"num_tokens": 4456170.0,
"step": 780
},
{
"entropy": 1.5187025308609008,
"epoch": 0.39979757085020245,
"grad_norm": 0.2745817303657532,
"learning_rate": 1.7338056680161946e-05,
"loss": 1.5286,
"mean_token_accuracy": 0.641098040342331,
"num_tokens": 4509439.0,
"step": 790
},
{
"entropy": 1.5416448593139649,
"epoch": 0.4048582995951417,
"grad_norm": 0.2520337998867035,
"learning_rate": 1.7304318488529016e-05,
"loss": 1.5522,
"mean_token_accuracy": 0.6389577805995941,
"num_tokens": 4568509.0,
"step": 800
},
{
"entropy": 1.622413122653961,
"epoch": 0.409919028340081,
"grad_norm": 0.20173849165439606,
"learning_rate": 1.7270580296896085e-05,
"loss": 1.6312,
"mean_token_accuracy": 0.6254597425460815,
"num_tokens": 4621157.0,
"step": 810
},
{
"entropy": 1.6604674816131593,
"epoch": 0.4149797570850202,
"grad_norm": 0.23679770529270172,
"learning_rate": 1.723684210526316e-05,
"loss": 1.6884,
"mean_token_accuracy": 0.6142423152923584,
"num_tokens": 4673418.0,
"step": 820
},
{
"entropy": 1.512584674358368,
"epoch": 0.4200404858299595,
"grad_norm": 0.22097937762737274,
"learning_rate": 1.720310391363023e-05,
"loss": 1.5394,
"mean_token_accuracy": 0.6410868644714356,
"num_tokens": 4731386.0,
"step": 830
},
{
"entropy": 1.4811100244522095,
"epoch": 0.4251012145748988,
"grad_norm": 0.1975807249546051,
"learning_rate": 1.71693657219973e-05,
"loss": 1.474,
"mean_token_accuracy": 0.6388140618801117,
"num_tokens": 4784928.0,
"step": 840
},
{
"entropy": 1.6224814653396606,
"epoch": 0.43016194331983804,
"grad_norm": 0.21695128083229065,
"learning_rate": 1.7135627530364374e-05,
"loss": 1.6465,
"mean_token_accuracy": 0.6171948432922363,
"num_tokens": 4844351.0,
"step": 850
},
{
"entropy": 1.4179201185703278,
"epoch": 0.4352226720647773,
"grad_norm": 0.2105616182088852,
"learning_rate": 1.7101889338731447e-05,
"loss": 1.4287,
"mean_token_accuracy": 0.6521054327487945,
"num_tokens": 4902506.0,
"step": 860
},
{
"entropy": 1.5097766757011413,
"epoch": 0.4402834008097166,
"grad_norm": 0.23443420231342316,
"learning_rate": 1.7068151147098516e-05,
"loss": 1.526,
"mean_token_accuracy": 0.6371770858764648,
"num_tokens": 4957377.0,
"step": 870
},
{
"entropy": 1.499946367740631,
"epoch": 0.44534412955465585,
"grad_norm": 0.1935402899980545,
"learning_rate": 1.703441295546559e-05,
"loss": 1.536,
"mean_token_accuracy": 0.6418095469474793,
"num_tokens": 5019057.0,
"step": 880
},
{
"entropy": 1.5840212941169738,
"epoch": 0.45040485829959515,
"grad_norm": 0.2871309518814087,
"learning_rate": 1.7000674763832662e-05,
"loss": 1.5944,
"mean_token_accuracy": 0.6274131119251252,
"num_tokens": 5072125.0,
"step": 890
},
{
"entropy": 1.6296968936920166,
"epoch": 0.45546558704453444,
"grad_norm": 0.19836841523647308,
"learning_rate": 1.696693657219973e-05,
"loss": 1.6328,
"mean_token_accuracy": 0.621731948852539,
"num_tokens": 5127397.0,
"step": 900
},
{
"entropy": 1.5373274445533753,
"epoch": 0.4605263157894737,
"grad_norm": 0.24680444598197937,
"learning_rate": 1.69331983805668e-05,
"loss": 1.5417,
"mean_token_accuracy": 0.628632801771164,
"num_tokens": 5179785.0,
"step": 910
},
{
"entropy": 1.4075371384620667,
"epoch": 0.46558704453441296,
"grad_norm": 0.23700740933418274,
"learning_rate": 1.6899460188933874e-05,
"loss": 1.4108,
"mean_token_accuracy": 0.6608946800231934,
"num_tokens": 5235573.0,
"step": 920
},
{
"entropy": 1.5140914797782898,
"epoch": 0.4706477732793522,
"grad_norm": 0.23013481497764587,
"learning_rate": 1.6865721997300947e-05,
"loss": 1.5085,
"mean_token_accuracy": 0.6322543203830719,
"num_tokens": 5294146.0,
"step": 930
},
{
"entropy": 1.5315414309501647,
"epoch": 0.4757085020242915,
"grad_norm": 0.27098962664604187,
"learning_rate": 1.6831983805668016e-05,
"loss": 1.5617,
"mean_token_accuracy": 0.6292850613594055,
"num_tokens": 5350989.0,
"step": 940
},
{
"entropy": 1.479003095626831,
"epoch": 0.4807692307692308,
"grad_norm": 0.1984509378671646,
"learning_rate": 1.679824561403509e-05,
"loss": 1.4812,
"mean_token_accuracy": 0.6419821918010712,
"num_tokens": 5407325.0,
"step": 950
},
{
"entropy": 1.5653569459915162,
"epoch": 0.48582995951417,
"grad_norm": 0.2867957353591919,
"learning_rate": 1.6764507422402162e-05,
"loss": 1.6079,
"mean_token_accuracy": 0.6316430389881134,
"num_tokens": 5460946.0,
"step": 960
},
{
"entropy": 1.476916539669037,
"epoch": 0.4908906882591093,
"grad_norm": 0.30787891149520874,
"learning_rate": 1.673076923076923e-05,
"loss": 1.48,
"mean_token_accuracy": 0.6418763399124146,
"num_tokens": 5521311.0,
"step": 970
},
{
"entropy": 1.521777379512787,
"epoch": 0.4959514170040486,
"grad_norm": 0.22446390986442566,
"learning_rate": 1.6697031039136305e-05,
"loss": 1.5293,
"mean_token_accuracy": 0.6305320382118225,
"num_tokens": 5586273.0,
"step": 980
},
{
"entropy": 1.5841980934143067,
"epoch": 0.5010121457489879,
"grad_norm": 0.24676790833473206,
"learning_rate": 1.6663292847503377e-05,
"loss": 1.6064,
"mean_token_accuracy": 0.6247429788112641,
"num_tokens": 5645607.0,
"step": 990
},
{
"entropy": 1.5454851269721985,
"epoch": 0.5060728744939271,
"grad_norm": 0.2086755633354187,
"learning_rate": 1.6629554655870447e-05,
"loss": 1.5715,
"mean_token_accuracy": 0.6307513952255249,
"num_tokens": 5703782.0,
"step": 1000
},
{
"entropy": 1.479654276371002,
"epoch": 0.5111336032388664,
"grad_norm": 0.21369728446006775,
"learning_rate": 1.6595816464237517e-05,
"loss": 1.4857,
"mean_token_accuracy": 0.6503060281276702,
"num_tokens": 5756050.0,
"step": 1010
},
{
"entropy": 1.5413155913352967,
"epoch": 0.5161943319838057,
"grad_norm": 0.29068905115127563,
"learning_rate": 1.656207827260459e-05,
"loss": 1.5625,
"mean_token_accuracy": 0.6323202788829804,
"num_tokens": 5813575.0,
"step": 1020
},
{
"entropy": 1.5284120678901671,
"epoch": 0.521255060728745,
"grad_norm": 0.26866260170936584,
"learning_rate": 1.6528340080971662e-05,
"loss": 1.5296,
"mean_token_accuracy": 0.6371418595314026,
"num_tokens": 5868292.0,
"step": 1030
},
{
"entropy": 1.5233107686042786,
"epoch": 0.5263157894736842,
"grad_norm": 0.2544384300708771,
"learning_rate": 1.6494601889338732e-05,
"loss": 1.5192,
"mean_token_accuracy": 0.629064416885376,
"num_tokens": 5926581.0,
"step": 1040
},
{
"entropy": 1.4819204211235046,
"epoch": 0.5313765182186235,
"grad_norm": 0.2691729962825775,
"learning_rate": 1.6460863697705805e-05,
"loss": 1.489,
"mean_token_accuracy": 0.6453047692775726,
"num_tokens": 5983604.0,
"step": 1050
},
{
"entropy": 1.343429481983185,
"epoch": 0.5364372469635628,
"grad_norm": 0.21679846942424774,
"learning_rate": 1.6427125506072878e-05,
"loss": 1.34,
"mean_token_accuracy": 0.669063252210617,
"num_tokens": 6040931.0,
"step": 1060
},
{
"entropy": 1.620318102836609,
"epoch": 0.541497975708502,
"grad_norm": 0.2846720516681671,
"learning_rate": 1.6393387314439947e-05,
"loss": 1.6464,
"mean_token_accuracy": 0.625487893819809,
"num_tokens": 6094196.0,
"step": 1070
},
{
"entropy": 1.6185827255249023,
"epoch": 0.5465587044534413,
"grad_norm": 0.24272854626178741,
"learning_rate": 1.635964912280702e-05,
"loss": 1.6422,
"mean_token_accuracy": 0.6269473850727081,
"num_tokens": 6150350.0,
"step": 1080
},
{
"entropy": 1.5225468039512635,
"epoch": 0.5516194331983806,
"grad_norm": 0.2274954468011856,
"learning_rate": 1.632591093117409e-05,
"loss": 1.5128,
"mean_token_accuracy": 0.6347517490386962,
"num_tokens": 6203671.0,
"step": 1090
},
{
"entropy": 1.4914517521858215,
"epoch": 0.5566801619433198,
"grad_norm": 0.20096349716186523,
"learning_rate": 1.6292172739541163e-05,
"loss": 1.5056,
"mean_token_accuracy": 0.6353028774261474,
"num_tokens": 6264849.0,
"step": 1100
},
{
"entropy": 1.5485971808433532,
"epoch": 0.5617408906882592,
"grad_norm": 0.24010322988033295,
"learning_rate": 1.6258434547908232e-05,
"loss": 1.5398,
"mean_token_accuracy": 0.6348303139209748,
"num_tokens": 6321385.0,
"step": 1110
},
{
"entropy": 1.5580425620079041,
"epoch": 0.5668016194331984,
"grad_norm": 0.21382348239421844,
"learning_rate": 1.6224696356275305e-05,
"loss": 1.5824,
"mean_token_accuracy": 0.6257192850112915,
"num_tokens": 6377498.0,
"step": 1120
},
{
"entropy": 1.5573256254196166,
"epoch": 0.5718623481781376,
"grad_norm": 0.24488642811775208,
"learning_rate": 1.6190958164642378e-05,
"loss": 1.5628,
"mean_token_accuracy": 0.6246356785297393,
"num_tokens": 6432433.0,
"step": 1130
},
{
"entropy": 1.4993282079696655,
"epoch": 0.5769230769230769,
"grad_norm": 0.2223263829946518,
"learning_rate": 1.6157219973009447e-05,
"loss": 1.5111,
"mean_token_accuracy": 0.6382519125938415,
"num_tokens": 6492547.0,
"step": 1140
},
{
"entropy": 1.4915427923202516,
"epoch": 0.5819838056680162,
"grad_norm": 0.232344850897789,
"learning_rate": 1.612348178137652e-05,
"loss": 1.5079,
"mean_token_accuracy": 0.6373468995094299,
"num_tokens": 6545726.0,
"step": 1150
},
{
"entropy": 1.561433982849121,
"epoch": 0.5870445344129555,
"grad_norm": 0.2586466073989868,
"learning_rate": 1.6089743589743593e-05,
"loss": 1.5638,
"mean_token_accuracy": 0.6296638369560241,
"num_tokens": 6606186.0,
"step": 1160
},
{
"entropy": 1.4575641989707946,
"epoch": 0.5921052631578947,
"grad_norm": 0.23262882232666016,
"learning_rate": 1.6056005398110663e-05,
"loss": 1.4734,
"mean_token_accuracy": 0.646199643611908,
"num_tokens": 6666588.0,
"step": 1170
},
{
"entropy": 1.5205667972564698,
"epoch": 0.597165991902834,
"grad_norm": 0.2673611044883728,
"learning_rate": 1.6022267206477736e-05,
"loss": 1.5302,
"mean_token_accuracy": 0.6332932889461518,
"num_tokens": 6728351.0,
"step": 1180
},
{
"entropy": 1.5646514296531677,
"epoch": 0.6022267206477733,
"grad_norm": 0.24620375037193298,
"learning_rate": 1.5988529014844805e-05,
"loss": 1.5848,
"mean_token_accuracy": 0.6307655155658722,
"num_tokens": 6789626.0,
"step": 1190
},
{
"entropy": 1.597201144695282,
"epoch": 0.6072874493927125,
"grad_norm": 0.28606894612312317,
"learning_rate": 1.5954790823211878e-05,
"loss": 1.5779,
"mean_token_accuracy": 0.6305352866649627,
"num_tokens": 6840612.0,
"step": 1200
},
{
"entropy": 1.6090473532676697,
"epoch": 0.6123481781376519,
"grad_norm": 0.26432231068611145,
"learning_rate": 1.5921052631578948e-05,
"loss": 1.6361,
"mean_token_accuracy": 0.6166266143321991,
"num_tokens": 6898491.0,
"step": 1210
},
{
"entropy": 1.5265617489814758,
"epoch": 0.6174089068825911,
"grad_norm": 0.24568380415439606,
"learning_rate": 1.588731443994602e-05,
"loss": 1.5239,
"mean_token_accuracy": 0.6334243714809418,
"num_tokens": 6953682.0,
"step": 1220
},
{
"entropy": 1.5580573081970215,
"epoch": 0.6224696356275303,
"grad_norm": 0.2606264650821686,
"learning_rate": 1.5853576248313093e-05,
"loss": 1.5652,
"mean_token_accuracy": 0.6310077726840972,
"num_tokens": 7009187.0,
"step": 1230
},
{
"entropy": 1.5315574645996093,
"epoch": 0.6275303643724697,
"grad_norm": 0.23248089849948883,
"learning_rate": 1.5819838056680163e-05,
"loss": 1.5515,
"mean_token_accuracy": 0.635067343711853,
"num_tokens": 7069568.0,
"step": 1240
},
{
"entropy": 1.4524394154548645,
"epoch": 0.6325910931174089,
"grad_norm": 0.20559658110141754,
"learning_rate": 1.5786099865047236e-05,
"loss": 1.4655,
"mean_token_accuracy": 0.644383716583252,
"num_tokens": 7132908.0,
"step": 1250
},
{
"entropy": 1.5834705471992492,
"epoch": 0.6376518218623481,
"grad_norm": 0.2312365472316742,
"learning_rate": 1.5752361673414305e-05,
"loss": 1.6107,
"mean_token_accuracy": 0.6216763257980347,
"num_tokens": 7193948.0,
"step": 1260
},
{
"entropy": 1.5223916292190551,
"epoch": 0.6427125506072875,
"grad_norm": 0.302206426858902,
"learning_rate": 1.5718623481781378e-05,
"loss": 1.5347,
"mean_token_accuracy": 0.6370865941047669,
"num_tokens": 7249715.0,
"step": 1270
},
{
"entropy": 1.68130704164505,
"epoch": 0.6477732793522267,
"grad_norm": 0.24234986305236816,
"learning_rate": 1.5684885290148448e-05,
"loss": 1.703,
"mean_token_accuracy": 0.6063290297985077,
"num_tokens": 7306114.0,
"step": 1280
},
{
"entropy": 1.6868065476417542,
"epoch": 0.652834008097166,
"grad_norm": 0.2558751702308655,
"learning_rate": 1.565114709851552e-05,
"loss": 1.7002,
"mean_token_accuracy": 0.6132429718971253,
"num_tokens": 7363295.0,
"step": 1290
},
{
"entropy": 1.4368155479431153,
"epoch": 0.6578947368421053,
"grad_norm": 0.3175618350505829,
"learning_rate": 1.561740890688259e-05,
"loss": 1.4368,
"mean_token_accuracy": 0.6544794201850891,
"num_tokens": 7415420.0,
"step": 1300
},
{
"entropy": 1.5615759491920471,
"epoch": 0.6629554655870445,
"grad_norm": 0.2953908443450928,
"learning_rate": 1.5583670715249663e-05,
"loss": 1.5617,
"mean_token_accuracy": 0.6310927093029022,
"num_tokens": 7475642.0,
"step": 1310
},
{
"entropy": 1.4271193981170653,
"epoch": 0.6680161943319838,
"grad_norm": 0.24695925414562225,
"learning_rate": 1.5549932523616736e-05,
"loss": 1.4189,
"mean_token_accuracy": 0.6522926926612854,
"num_tokens": 7538029.0,
"step": 1320
},
{
"entropy": 1.5500613093376159,
"epoch": 0.6730769230769231,
"grad_norm": 0.2324494868516922,
"learning_rate": 1.5516194331983806e-05,
"loss": 1.5641,
"mean_token_accuracy": 0.626498419046402,
"num_tokens": 7597460.0,
"step": 1330
},
{
"entropy": 1.476065456867218,
"epoch": 0.6781376518218624,
"grad_norm": 0.2418016493320465,
"learning_rate": 1.548245614035088e-05,
"loss": 1.4751,
"mean_token_accuracy": 0.641443008184433,
"num_tokens": 7652792.0,
"step": 1340
},
{
"entropy": 1.5325765252113341,
"epoch": 0.6831983805668016,
"grad_norm": 0.23513104021549225,
"learning_rate": 1.544871794871795e-05,
"loss": 1.5499,
"mean_token_accuracy": 0.6278112173080445,
"num_tokens": 7706166.0,
"step": 1350
},
{
"entropy": 1.5952306509017944,
"epoch": 0.6882591093117408,
"grad_norm": 0.22960874438285828,
"learning_rate": 1.541497975708502e-05,
"loss": 1.6124,
"mean_token_accuracy": 0.623203706741333,
"num_tokens": 7762524.0,
"step": 1360
},
{
"entropy": 1.4605698585510254,
"epoch": 0.6933198380566802,
"grad_norm": 0.2283059060573578,
"learning_rate": 1.5381241565452094e-05,
"loss": 1.4702,
"mean_token_accuracy": 0.6456966698169708,
"num_tokens": 7816597.0,
"step": 1370
},
{
"entropy": 1.3722566485404968,
"epoch": 0.6983805668016194,
"grad_norm": 0.24912376701831818,
"learning_rate": 1.5347503373819163e-05,
"loss": 1.3777,
"mean_token_accuracy": 0.6624338209629059,
"num_tokens": 7878907.0,
"step": 1380
},
{
"entropy": 1.5658705353736877,
"epoch": 0.7034412955465587,
"grad_norm": 0.26213786005973816,
"learning_rate": 1.5313765182186236e-05,
"loss": 1.5614,
"mean_token_accuracy": 0.6281410813331604,
"num_tokens": 7931234.0,
"step": 1390
},
{
"entropy": 1.4248001098632812,
"epoch": 0.708502024291498,
"grad_norm": 0.3189115822315216,
"learning_rate": 1.5280026990553306e-05,
"loss": 1.4343,
"mean_token_accuracy": 0.6542839646339417,
"num_tokens": 7983537.0,
"step": 1400
},
{
"entropy": 1.499564802646637,
"epoch": 0.7135627530364372,
"grad_norm": 0.24217011034488678,
"learning_rate": 1.5246288798920379e-05,
"loss": 1.5238,
"mean_token_accuracy": 0.6327670216560364,
"num_tokens": 8039434.0,
"step": 1410
},
{
"entropy": 1.5084555625915528,
"epoch": 0.7186234817813765,
"grad_norm": 0.21525943279266357,
"learning_rate": 1.521255060728745e-05,
"loss": 1.5051,
"mean_token_accuracy": 0.6452975988388061,
"num_tokens": 8095407.0,
"step": 1420
},
{
"entropy": 1.5463826656341553,
"epoch": 0.7236842105263158,
"grad_norm": 0.25616827607154846,
"learning_rate": 1.5178812415654523e-05,
"loss": 1.5526,
"mean_token_accuracy": 0.6282021820545196,
"num_tokens": 8150109.0,
"step": 1430
},
{
"entropy": 1.6857656121253968,
"epoch": 0.728744939271255,
"grad_norm": 0.25321727991104126,
"learning_rate": 1.5145074224021594e-05,
"loss": 1.7184,
"mean_token_accuracy": 0.6112756371498108,
"num_tokens": 8214438.0,
"step": 1440
},
{
"entropy": 1.5806215167045594,
"epoch": 0.7338056680161943,
"grad_norm": 0.21112073957920074,
"learning_rate": 1.5111336032388665e-05,
"loss": 1.5852,
"mean_token_accuracy": 0.6202045798301696,
"num_tokens": 8273743.0,
"step": 1450
},
{
"entropy": 1.4979040026664734,
"epoch": 0.7388663967611336,
"grad_norm": 0.22126545011997223,
"learning_rate": 1.5077597840755738e-05,
"loss": 1.5201,
"mean_token_accuracy": 0.637889975309372,
"num_tokens": 8334507.0,
"step": 1460
},
{
"entropy": 1.5463458061218263,
"epoch": 0.7439271255060729,
"grad_norm": 0.22952505946159363,
"learning_rate": 1.5043859649122808e-05,
"loss": 1.5498,
"mean_token_accuracy": 0.6345071375370026,
"num_tokens": 8390405.0,
"step": 1470
},
{
"entropy": 1.423577868938446,
"epoch": 0.7489878542510121,
"grad_norm": 0.2474886029958725,
"learning_rate": 1.5010121457489879e-05,
"loss": 1.4306,
"mean_token_accuracy": 0.655298399925232,
"num_tokens": 8452056.0,
"step": 1480
},
{
"entropy": 1.5899730324745178,
"epoch": 0.7540485829959515,
"grad_norm": 0.2736392021179199,
"learning_rate": 1.497638326585695e-05,
"loss": 1.581,
"mean_token_accuracy": 0.6186748504638672,
"num_tokens": 8511999.0,
"step": 1490
},
{
"entropy": 1.5433414101600647,
"epoch": 0.7591093117408907,
"grad_norm": 0.2836778163909912,
"learning_rate": 1.4942645074224023e-05,
"loss": 1.5544,
"mean_token_accuracy": 0.6286308348178864,
"num_tokens": 8566021.0,
"step": 1500
},
{
"entropy": 1.4887511134147644,
"epoch": 0.7641700404858299,
"grad_norm": 0.33601313829421997,
"learning_rate": 1.4908906882591094e-05,
"loss": 1.4994,
"mean_token_accuracy": 0.6406654596328736,
"num_tokens": 8622757.0,
"step": 1510
},
{
"entropy": 1.5212846279144288,
"epoch": 0.7692307692307693,
"grad_norm": 0.2853647470474243,
"learning_rate": 1.4875168690958165e-05,
"loss": 1.5409,
"mean_token_accuracy": 0.6337429225444794,
"num_tokens": 8677777.0,
"step": 1520
},
{
"entropy": 1.4735643148422242,
"epoch": 0.7742914979757085,
"grad_norm": 0.2369018942117691,
"learning_rate": 1.4841430499325238e-05,
"loss": 1.4812,
"mean_token_accuracy": 0.6412514448165894,
"num_tokens": 8735792.0,
"step": 1530
},
{
"entropy": 1.5245864272117615,
"epoch": 0.7793522267206477,
"grad_norm": 0.2317512333393097,
"learning_rate": 1.480769230769231e-05,
"loss": 1.5354,
"mean_token_accuracy": 0.6362193703651429,
"num_tokens": 8795324.0,
"step": 1540
},
{
"entropy": 1.487471914291382,
"epoch": 0.7844129554655871,
"grad_norm": 0.24812865257263184,
"learning_rate": 1.477395411605938e-05,
"loss": 1.487,
"mean_token_accuracy": 0.6461592555046082,
"num_tokens": 8848190.0,
"step": 1550
},
{
"entropy": 1.446857714653015,
"epoch": 0.7894736842105263,
"grad_norm": 0.23715689778327942,
"learning_rate": 1.474021592442645e-05,
"loss": 1.4494,
"mean_token_accuracy": 0.654287850856781,
"num_tokens": 8900078.0,
"step": 1560
},
{
"entropy": 1.6414281487464906,
"epoch": 0.7945344129554656,
"grad_norm": 0.26817786693573,
"learning_rate": 1.4706477732793523e-05,
"loss": 1.6536,
"mean_token_accuracy": 0.6186295211315155,
"num_tokens": 8955471.0,
"step": 1570
},
{
"entropy": 1.5608402729034423,
"epoch": 0.7995951417004049,
"grad_norm": 0.2652844190597534,
"learning_rate": 1.4672739541160594e-05,
"loss": 1.5787,
"mean_token_accuracy": 0.62896608710289,
"num_tokens": 9013912.0,
"step": 1580
},
{
"entropy": 1.5290770292282105,
"epoch": 0.8046558704453441,
"grad_norm": 0.25053921341896057,
"learning_rate": 1.4639001349527666e-05,
"loss": 1.543,
"mean_token_accuracy": 0.6325620353221894,
"num_tokens": 9073236.0,
"step": 1590
},
{
"entropy": 1.473749542236328,
"epoch": 0.8097165991902834,
"grad_norm": 0.2638007402420044,
"learning_rate": 1.4605263157894739e-05,
"loss": 1.4962,
"mean_token_accuracy": 0.64018235206604,
"num_tokens": 9130345.0,
"step": 1600
},
{
"entropy": 1.4807411432266235,
"epoch": 0.8147773279352226,
"grad_norm": 0.2131456434726715,
"learning_rate": 1.457152496626181e-05,
"loss": 1.4896,
"mean_token_accuracy": 0.6396925866603851,
"num_tokens": 9181695.0,
"step": 1610
},
{
"entropy": 1.4747131943702698,
"epoch": 0.819838056680162,
"grad_norm": 0.25145605206489563,
"learning_rate": 1.4537786774628881e-05,
"loss": 1.4513,
"mean_token_accuracy": 0.6473784625530243,
"num_tokens": 9237367.0,
"step": 1620
},
{
"entropy": 1.5602935075759887,
"epoch": 0.8248987854251012,
"grad_norm": 0.24879582226276398,
"learning_rate": 1.4504048582995954e-05,
"loss": 1.565,
"mean_token_accuracy": 0.6289263606071472,
"num_tokens": 9302101.0,
"step": 1630
},
{
"entropy": 1.4359328031539917,
"epoch": 0.8299595141700404,
"grad_norm": 0.21965323388576508,
"learning_rate": 1.4470310391363025e-05,
"loss": 1.4408,
"mean_token_accuracy": 0.6550322711467743,
"num_tokens": 9361115.0,
"step": 1640
},
{
"entropy": 1.5193968892097474,
"epoch": 0.8350202429149798,
"grad_norm": 0.27555471658706665,
"learning_rate": 1.4436572199730096e-05,
"loss": 1.5173,
"mean_token_accuracy": 0.6335371434688568,
"num_tokens": 9417109.0,
"step": 1650
},
{
"entropy": 1.5528843998908997,
"epoch": 0.840080971659919,
"grad_norm": 0.2689385414123535,
"learning_rate": 1.4402834008097166e-05,
"loss": 1.5668,
"mean_token_accuracy": 0.6325760573148728,
"num_tokens": 9473473.0,
"step": 1660
},
{
"entropy": 1.4268815875053407,
"epoch": 0.8451417004048583,
"grad_norm": 0.3029450476169586,
"learning_rate": 1.4369095816464239e-05,
"loss": 1.4197,
"mean_token_accuracy": 0.6514874160289764,
"num_tokens": 9530575.0,
"step": 1670
},
{
"entropy": 1.4315476655960082,
"epoch": 0.8502024291497976,
"grad_norm": 0.24891141057014465,
"learning_rate": 1.433535762483131e-05,
"loss": 1.4228,
"mean_token_accuracy": 0.656501293182373,
"num_tokens": 9590931.0,
"step": 1680
},
{
"entropy": 1.5360273122787476,
"epoch": 0.8552631578947368,
"grad_norm": 0.30486878752708435,
"learning_rate": 1.4301619433198381e-05,
"loss": 1.5474,
"mean_token_accuracy": 0.6348777890205384,
"num_tokens": 9644652.0,
"step": 1690
},
{
"entropy": 1.6101372838020325,
"epoch": 0.8603238866396761,
"grad_norm": 0.23739294707775116,
"learning_rate": 1.4267881241565454e-05,
"loss": 1.6222,
"mean_token_accuracy": 0.6213286280632019,
"num_tokens": 9697296.0,
"step": 1700
},
{
"entropy": 1.56304851770401,
"epoch": 0.8653846153846154,
"grad_norm": 0.2499363124370575,
"learning_rate": 1.4234143049932525e-05,
"loss": 1.5642,
"mean_token_accuracy": 0.6282478511333466,
"num_tokens": 9755265.0,
"step": 1710
},
{
"entropy": 1.4945539951324462,
"epoch": 0.8704453441295547,
"grad_norm": 0.24991373717784882,
"learning_rate": 1.4200404858299596e-05,
"loss": 1.5336,
"mean_token_accuracy": 0.6350914716720581,
"num_tokens": 9815899.0,
"step": 1720
},
{
"entropy": 1.5779843926429749,
"epoch": 0.8755060728744939,
"grad_norm": 0.24115176498889923,
"learning_rate": 1.416666666666667e-05,
"loss": 1.5933,
"mean_token_accuracy": 0.6283825278282166,
"num_tokens": 9872513.0,
"step": 1730
},
{
"entropy": 1.4204454302787781,
"epoch": 0.8805668016194332,
"grad_norm": 0.22373662889003754,
"learning_rate": 1.413292847503374e-05,
"loss": 1.4136,
"mean_token_accuracy": 0.6557290494441986,
"num_tokens": 9932083.0,
"step": 1740
},
{
"entropy": 1.636140561103821,
"epoch": 0.8856275303643725,
"grad_norm": 0.29674816131591797,
"learning_rate": 1.409919028340081e-05,
"loss": 1.662,
"mean_token_accuracy": 0.6216094970703125,
"num_tokens": 9988494.0,
"step": 1750
},
{
"entropy": 1.549510085582733,
"epoch": 0.8906882591093117,
"grad_norm": 0.24920591711997986,
"learning_rate": 1.4065452091767881e-05,
"loss": 1.5553,
"mean_token_accuracy": 0.6351737916469574,
"num_tokens": 10041605.0,
"step": 1760
},
{
"entropy": 1.5425897359848022,
"epoch": 0.895748987854251,
"grad_norm": 0.2719487249851227,
"learning_rate": 1.4031713900134953e-05,
"loss": 1.5457,
"mean_token_accuracy": 0.6341227173805237,
"num_tokens": 10097471.0,
"step": 1770
},
{
"entropy": 1.5892379999160766,
"epoch": 0.9008097165991903,
"grad_norm": 0.26108458638191223,
"learning_rate": 1.3997975708502025e-05,
"loss": 1.5846,
"mean_token_accuracy": 0.6257834196090698,
"num_tokens": 10157839.0,
"step": 1780
},
{
"entropy": 1.5164817094802856,
"epoch": 0.9058704453441295,
"grad_norm": 0.255862295627594,
"learning_rate": 1.3964237516869097e-05,
"loss": 1.5325,
"mean_token_accuracy": 0.6302552342414856,
"num_tokens": 10215568.0,
"step": 1790
},
{
"entropy": 1.5202425956726073,
"epoch": 0.9109311740890689,
"grad_norm": 0.2746359705924988,
"learning_rate": 1.3930499325236168e-05,
"loss": 1.5264,
"mean_token_accuracy": 0.6395917236804962,
"num_tokens": 10277752.0,
"step": 1800
},
{
"entropy": 1.5994849681854248,
"epoch": 0.9159919028340081,
"grad_norm": 0.259244441986084,
"learning_rate": 1.389676113360324e-05,
"loss": 1.6126,
"mean_token_accuracy": 0.6206628024578095,
"num_tokens": 10332436.0,
"step": 1810
},
{
"entropy": 1.5928335905075073,
"epoch": 0.9210526315789473,
"grad_norm": 0.30553993582725525,
"learning_rate": 1.3863022941970312e-05,
"loss": 1.604,
"mean_token_accuracy": 0.6238301634788513,
"num_tokens": 10385660.0,
"step": 1820
},
{
"entropy": 1.5503159523010255,
"epoch": 0.9261133603238867,
"grad_norm": 0.2695212662220001,
"learning_rate": 1.3829284750337383e-05,
"loss": 1.5727,
"mean_token_accuracy": 0.6283754229545593,
"num_tokens": 10440034.0,
"step": 1830
},
{
"entropy": 1.472425067424774,
"epoch": 0.9311740890688259,
"grad_norm": 0.26096370816230774,
"learning_rate": 1.3795546558704453e-05,
"loss": 1.4744,
"mean_token_accuracy": 0.6468591213226318,
"num_tokens": 10495586.0,
"step": 1840
},
{
"entropy": 1.4272591471672058,
"epoch": 0.9362348178137652,
"grad_norm": 0.2956947088241577,
"learning_rate": 1.3761808367071526e-05,
"loss": 1.4446,
"mean_token_accuracy": 0.6488463521003723,
"num_tokens": 10546414.0,
"step": 1850
},
{
"entropy": 1.4026084661483764,
"epoch": 0.9412955465587044,
"grad_norm": 0.24682804942131042,
"learning_rate": 1.3728070175438597e-05,
"loss": 1.3906,
"mean_token_accuracy": 0.6536332130432129,
"num_tokens": 10603382.0,
"step": 1860
},
{
"entropy": 1.585541033744812,
"epoch": 0.9463562753036437,
"grad_norm": 0.28304097056388855,
"learning_rate": 1.3694331983805668e-05,
"loss": 1.5972,
"mean_token_accuracy": 0.6255220711231232,
"num_tokens": 10666030.0,
"step": 1870
},
{
"entropy": 1.580546224117279,
"epoch": 0.951417004048583,
"grad_norm": 0.2616841793060303,
"learning_rate": 1.3660593792172741e-05,
"loss": 1.6051,
"mean_token_accuracy": 0.6219939827919007,
"num_tokens": 10725741.0,
"step": 1880
},
{
"entropy": 1.6499082326889039,
"epoch": 0.9564777327935222,
"grad_norm": 0.2620835304260254,
"learning_rate": 1.3626855600539812e-05,
"loss": 1.6969,
"mean_token_accuracy": 0.6166241288185119,
"num_tokens": 10787880.0,
"step": 1890
},
{
"entropy": 1.3888215899467469,
"epoch": 0.9615384615384616,
"grad_norm": 0.2680383324623108,
"learning_rate": 1.3593117408906883e-05,
"loss": 1.3917,
"mean_token_accuracy": 0.6527835667133332,
"num_tokens": 10844894.0,
"step": 1900
},
{
"entropy": 1.3836533963680266,
"epoch": 0.9665991902834008,
"grad_norm": 0.35761716961860657,
"learning_rate": 1.3559379217273956e-05,
"loss": 1.3895,
"mean_token_accuracy": 0.6636650562286377,
"num_tokens": 10900721.0,
"step": 1910
},
{
"entropy": 1.451544201374054,
"epoch": 0.97165991902834,
"grad_norm": 0.26495417952537537,
"learning_rate": 1.3525641025641028e-05,
"loss": 1.447,
"mean_token_accuracy": 0.6403470158576965,
"num_tokens": 10951838.0,
"step": 1920
},
{
"entropy": 1.5138379335403442,
"epoch": 0.9767206477732794,
"grad_norm": 0.23315957188606262,
"learning_rate": 1.3491902834008099e-05,
"loss": 1.5385,
"mean_token_accuracy": 0.6303693652153015,
"num_tokens": 11010569.0,
"step": 1930
},
{
"entropy": 1.5039880394935607,
"epoch": 0.9817813765182186,
"grad_norm": 0.26653018593788147,
"learning_rate": 1.3458164642375168e-05,
"loss": 1.515,
"mean_token_accuracy": 0.6446199715137482,
"num_tokens": 11068307.0,
"step": 1940
},
{
"entropy": 1.5039002180099488,
"epoch": 0.9868421052631579,
"grad_norm": 0.24144147336483002,
"learning_rate": 1.3424426450742241e-05,
"loss": 1.5012,
"mean_token_accuracy": 0.6414350152015686,
"num_tokens": 11131378.0,
"step": 1950
},
{
"entropy": 1.5108654856681825,
"epoch": 0.9919028340080972,
"grad_norm": 0.33613070845603943,
"learning_rate": 1.3390688259109312e-05,
"loss": 1.5229,
"mean_token_accuracy": 0.6330624580383301,
"num_tokens": 11189667.0,
"step": 1960
},
{
"entropy": 1.43316547870636,
"epoch": 0.9969635627530364,
"grad_norm": 0.27450039982795715,
"learning_rate": 1.3356950067476384e-05,
"loss": 1.4358,
"mean_token_accuracy": 0.6528611719608307,
"num_tokens": 11248100.0,
"step": 1970
},
{
"entropy": 1.5782551288604736,
"epoch": 1.0020242914979758,
"grad_norm": 0.2942919433116913,
"learning_rate": 1.3323211875843457e-05,
"loss": 1.5945,
"mean_token_accuracy": 0.622716897726059,
"num_tokens": 11301434.0,
"step": 1980
},
{
"entropy": 1.5513013124465942,
"epoch": 1.007085020242915,
"grad_norm": 0.4627493619918823,
"learning_rate": 1.3289473684210528e-05,
"loss": 1.5645,
"mean_token_accuracy": 0.6323555290699006,
"num_tokens": 11357709.0,
"step": 1990
},
{
"entropy": 1.5218539357185363,
"epoch": 1.0121457489878543,
"grad_norm": 0.29789215326309204,
"learning_rate": 1.3255735492577599e-05,
"loss": 1.5296,
"mean_token_accuracy": 0.6385591834783554,
"num_tokens": 11409081.0,
"step": 2000
},
{
"entropy": 1.5782111883163452,
"epoch": 1.0172064777327936,
"grad_norm": 0.3623863458633423,
"learning_rate": 1.3221997300944672e-05,
"loss": 1.5815,
"mean_token_accuracy": 0.6234244406223297,
"num_tokens": 11461701.0,
"step": 2010
},
{
"entropy": 1.478236198425293,
"epoch": 1.0222672064777327,
"grad_norm": 0.24126943945884705,
"learning_rate": 1.3188259109311743e-05,
"loss": 1.4773,
"mean_token_accuracy": 0.6408190190792084,
"num_tokens": 11522781.0,
"step": 2020
},
{
"entropy": 1.474450170993805,
"epoch": 1.027327935222672,
"grad_norm": 0.27630022168159485,
"learning_rate": 1.3154520917678813e-05,
"loss": 1.4777,
"mean_token_accuracy": 0.6390757083892822,
"num_tokens": 11577690.0,
"step": 2030
},
{
"entropy": 1.38731769323349,
"epoch": 1.0323886639676114,
"grad_norm": 0.2594892382621765,
"learning_rate": 1.3120782726045884e-05,
"loss": 1.4113,
"mean_token_accuracy": 0.6555228769779206,
"num_tokens": 11634378.0,
"step": 2040
},
{
"entropy": 1.4996397018432617,
"epoch": 1.0374493927125505,
"grad_norm": 0.29768475890159607,
"learning_rate": 1.3087044534412957e-05,
"loss": 1.5046,
"mean_token_accuracy": 0.6385474681854248,
"num_tokens": 11691197.0,
"step": 2050
},
{
"entropy": 1.7156208992004394,
"epoch": 1.04251012145749,
"grad_norm": 0.30838677287101746,
"learning_rate": 1.3053306342780028e-05,
"loss": 1.7196,
"mean_token_accuracy": 0.6081624507904053,
"num_tokens": 11744812.0,
"step": 2060
},
{
"entropy": 1.4287778735160828,
"epoch": 1.0475708502024292,
"grad_norm": 0.30164098739624023,
"learning_rate": 1.3019568151147099e-05,
"loss": 1.4251,
"mean_token_accuracy": 0.6514438152313232,
"num_tokens": 11798182.0,
"step": 2070
},
{
"entropy": 1.6277110576629639,
"epoch": 1.0526315789473684,
"grad_norm": 0.27688923478126526,
"learning_rate": 1.2985829959514172e-05,
"loss": 1.637,
"mean_token_accuracy": 0.6167466878890991,
"num_tokens": 11853640.0,
"step": 2080
},
{
"entropy": 1.3780420899391175,
"epoch": 1.0576923076923077,
"grad_norm": 0.2407483607530594,
"learning_rate": 1.2952091767881243e-05,
"loss": 1.3775,
"mean_token_accuracy": 0.6617866694927216,
"num_tokens": 11909613.0,
"step": 2090
},
{
"entropy": 1.4581809163093566,
"epoch": 1.062753036437247,
"grad_norm": 0.3337167203426361,
"learning_rate": 1.2918353576248314e-05,
"loss": 1.4533,
"mean_token_accuracy": 0.6537846267223358,
"num_tokens": 11967740.0,
"step": 2100
},
{
"entropy": 1.4676265239715576,
"epoch": 1.0678137651821862,
"grad_norm": 0.2601131796836853,
"learning_rate": 1.2884615384615386e-05,
"loss": 1.4607,
"mean_token_accuracy": 0.6463825047016144,
"num_tokens": 12020775.0,
"step": 2110
},
{
"entropy": 1.5144903063774109,
"epoch": 1.0728744939271255,
"grad_norm": 0.276044636964798,
"learning_rate": 1.2850877192982459e-05,
"loss": 1.5184,
"mean_token_accuracy": 0.6339675188064575,
"num_tokens": 12081273.0,
"step": 2120
},
{
"entropy": 1.5458029508590698,
"epoch": 1.0779352226720649,
"grad_norm": 0.3157075047492981,
"learning_rate": 1.2817139001349528e-05,
"loss": 1.5519,
"mean_token_accuracy": 0.6369691550731659,
"num_tokens": 12134672.0,
"step": 2130
},
{
"entropy": 1.4140147149562836,
"epoch": 1.082995951417004,
"grad_norm": 0.32847243547439575,
"learning_rate": 1.27834008097166e-05,
"loss": 1.4223,
"mean_token_accuracy": 0.6571628749370575,
"num_tokens": 12193683.0,
"step": 2140
},
{
"entropy": 1.5222583651542663,
"epoch": 1.0880566801619433,
"grad_norm": 0.2528051435947418,
"learning_rate": 1.274966261808367e-05,
"loss": 1.5229,
"mean_token_accuracy": 0.6361405253410339,
"num_tokens": 12249416.0,
"step": 2150
},
{
"entropy": 1.5322677731513976,
"epoch": 1.0931174089068827,
"grad_norm": 0.25397226214408875,
"learning_rate": 1.2715924426450743e-05,
"loss": 1.5353,
"mean_token_accuracy": 0.6307880222797394,
"num_tokens": 12312488.0,
"step": 2160
},
{
"entropy": 1.4451451063156129,
"epoch": 1.0981781376518218,
"grad_norm": 0.3207351565361023,
"learning_rate": 1.2682186234817815e-05,
"loss": 1.4532,
"mean_token_accuracy": 0.6479784369468689,
"num_tokens": 12365397.0,
"step": 2170
},
{
"entropy": 1.6216472387313843,
"epoch": 1.1032388663967612,
"grad_norm": 0.22639265656471252,
"learning_rate": 1.2648448043184886e-05,
"loss": 1.6331,
"mean_token_accuracy": 0.6182599663734436,
"num_tokens": 12426280.0,
"step": 2180
},
{
"entropy": 1.417205023765564,
"epoch": 1.1082995951417005,
"grad_norm": 0.31163787841796875,
"learning_rate": 1.2614709851551959e-05,
"loss": 1.4197,
"mean_token_accuracy": 0.6472279012203217,
"num_tokens": 12481836.0,
"step": 2190
},
{
"entropy": 1.5859375596046448,
"epoch": 1.1133603238866396,
"grad_norm": 0.2581881582736969,
"learning_rate": 1.258097165991903e-05,
"loss": 1.5947,
"mean_token_accuracy": 0.6293219923973083,
"num_tokens": 12537386.0,
"step": 2200
},
{
"entropy": 1.6218139290809632,
"epoch": 1.118421052631579,
"grad_norm": 0.27295926213264465,
"learning_rate": 1.2547233468286101e-05,
"loss": 1.6235,
"mean_token_accuracy": 0.6221803069114685,
"num_tokens": 12591068.0,
"step": 2210
},
{
"entropy": 1.5492971539497375,
"epoch": 1.123481781376518,
"grad_norm": 0.28580132126808167,
"learning_rate": 1.251349527665317e-05,
"loss": 1.5594,
"mean_token_accuracy": 0.6253218352794647,
"num_tokens": 12648353.0,
"step": 2220
},
{
"entropy": 1.649086058139801,
"epoch": 1.1285425101214575,
"grad_norm": 0.24511824548244476,
"learning_rate": 1.2479757085020244e-05,
"loss": 1.6621,
"mean_token_accuracy": 0.6179795920848846,
"num_tokens": 12700443.0,
"step": 2230
},
{
"entropy": 1.4533384203910829,
"epoch": 1.1336032388663968,
"grad_norm": 0.3033972382545471,
"learning_rate": 1.2446018893387315e-05,
"loss": 1.4451,
"mean_token_accuracy": 0.6498919248580932,
"num_tokens": 12748990.0,
"step": 2240
},
{
"entropy": 1.5436882257461548,
"epoch": 1.1386639676113361,
"grad_norm": 0.2811788022518158,
"learning_rate": 1.2412280701754386e-05,
"loss": 1.5508,
"mean_token_accuracy": 0.6290224313735961,
"num_tokens": 12807477.0,
"step": 2250
},
{
"entropy": 1.448672115802765,
"epoch": 1.1437246963562753,
"grad_norm": 0.29944077134132385,
"learning_rate": 1.2378542510121459e-05,
"loss": 1.4598,
"mean_token_accuracy": 0.6483164548873901,
"num_tokens": 12869123.0,
"step": 2260
},
{
"entropy": 1.3786328792572022,
"epoch": 1.1487854251012146,
"grad_norm": 0.27392685413360596,
"learning_rate": 1.234480431848853e-05,
"loss": 1.3767,
"mean_token_accuracy": 0.6597134828567505,
"num_tokens": 12924572.0,
"step": 2270
},
{
"entropy": 1.5663957238197326,
"epoch": 1.1538461538461537,
"grad_norm": 0.3136812150478363,
"learning_rate": 1.2311066126855601e-05,
"loss": 1.5661,
"mean_token_accuracy": 0.6283589959144592,
"num_tokens": 12982715.0,
"step": 2280
},
{
"entropy": 1.4102508783340455,
"epoch": 1.158906882591093,
"grad_norm": 0.33586448431015015,
"learning_rate": 1.2277327935222674e-05,
"loss": 1.4242,
"mean_token_accuracy": 0.6464997053146362,
"num_tokens": 13035535.0,
"step": 2290
},
{
"entropy": 1.4415246963500976,
"epoch": 1.1639676113360324,
"grad_norm": 0.24208928644657135,
"learning_rate": 1.2243589743589746e-05,
"loss": 1.4572,
"mean_token_accuracy": 0.6485124588012695,
"num_tokens": 13098688.0,
"step": 2300
},
{
"entropy": 1.490816557407379,
"epoch": 1.1690283400809718,
"grad_norm": 0.27268052101135254,
"learning_rate": 1.2209851551956815e-05,
"loss": 1.4841,
"mean_token_accuracy": 0.6446694970130921,
"num_tokens": 13155516.0,
"step": 2310
},
{
"entropy": 1.41130930185318,
"epoch": 1.174089068825911,
"grad_norm": 0.3298867642879486,
"learning_rate": 1.2176113360323886e-05,
"loss": 1.4114,
"mean_token_accuracy": 0.6582064151763916,
"num_tokens": 13208333.0,
"step": 2320
},
{
"entropy": 1.6078017115592957,
"epoch": 1.1791497975708503,
"grad_norm": 0.2950042188167572,
"learning_rate": 1.214237516869096e-05,
"loss": 1.6164,
"mean_token_accuracy": 0.6207537829875946,
"num_tokens": 13264796.0,
"step": 2330
},
{
"entropy": 1.500421929359436,
"epoch": 1.1842105263157894,
"grad_norm": 0.2659217417240143,
"learning_rate": 1.210863697705803e-05,
"loss": 1.5125,
"mean_token_accuracy": 0.6368428528308868,
"num_tokens": 13325761.0,
"step": 2340
},
{
"entropy": 1.511633825302124,
"epoch": 1.1892712550607287,
"grad_norm": 0.2882932722568512,
"learning_rate": 1.2074898785425102e-05,
"loss": 1.5265,
"mean_token_accuracy": 0.6347347319126129,
"num_tokens": 13381225.0,
"step": 2350
},
{
"entropy": 1.4531208157539368,
"epoch": 1.194331983805668,
"grad_norm": 0.2595268487930298,
"learning_rate": 1.2041160593792175e-05,
"loss": 1.4615,
"mean_token_accuracy": 0.6477943778038024,
"num_tokens": 13443099.0,
"step": 2360
},
{
"entropy": 1.4483809113502502,
"epoch": 1.1993927125506072,
"grad_norm": 0.31083598732948303,
"learning_rate": 1.2007422402159246e-05,
"loss": 1.4345,
"mean_token_accuracy": 0.6418466746807099,
"num_tokens": 13492349.0,
"step": 2370
},
{
"entropy": 1.4612587809562683,
"epoch": 1.2044534412955465,
"grad_norm": 0.3023878037929535,
"learning_rate": 1.1973684210526317e-05,
"loss": 1.4644,
"mean_token_accuracy": 0.6457450866699219,
"num_tokens": 13553635.0,
"step": 2380
},
{
"entropy": 1.503815734386444,
"epoch": 1.209514170040486,
"grad_norm": 0.2668578326702118,
"learning_rate": 1.193994601889339e-05,
"loss": 1.5031,
"mean_token_accuracy": 0.6367665946483612,
"num_tokens": 13610860.0,
"step": 2390
},
{
"entropy": 1.5158751249313354,
"epoch": 1.214574898785425,
"grad_norm": 0.22731706500053406,
"learning_rate": 1.1906207827260461e-05,
"loss": 1.5166,
"mean_token_accuracy": 0.6408901572227478,
"num_tokens": 13671500.0,
"step": 2400
},
{
"entropy": 1.4245959162712096,
"epoch": 1.2196356275303644,
"grad_norm": 0.23208104074001312,
"learning_rate": 1.187246963562753e-05,
"loss": 1.4395,
"mean_token_accuracy": 0.650111585855484,
"num_tokens": 13732700.0,
"step": 2410
},
{
"entropy": 1.5526673555374146,
"epoch": 1.2246963562753037,
"grad_norm": 0.3204510807991028,
"learning_rate": 1.1838731443994602e-05,
"loss": 1.5659,
"mean_token_accuracy": 0.6272344350814819,
"num_tokens": 13792638.0,
"step": 2420
},
{
"entropy": 1.4588525891304016,
"epoch": 1.2297570850202428,
"grad_norm": 0.2778925895690918,
"learning_rate": 1.1804993252361675e-05,
"loss": 1.4745,
"mean_token_accuracy": 0.6453329682350158,
"num_tokens": 13848701.0,
"step": 2430
},
{
"entropy": 1.3035455107688905,
"epoch": 1.2348178137651822,
"grad_norm": 0.26574888825416565,
"learning_rate": 1.1771255060728746e-05,
"loss": 1.3013,
"mean_token_accuracy": 0.680269593000412,
"num_tokens": 13903243.0,
"step": 2440
},
{
"entropy": 1.5677086472511292,
"epoch": 1.2398785425101215,
"grad_norm": 0.2806277573108673,
"learning_rate": 1.1737516869095817e-05,
"loss": 1.5653,
"mean_token_accuracy": 0.6303077161312103,
"num_tokens": 13962439.0,
"step": 2450
},
{
"entropy": 1.4167581439018249,
"epoch": 1.2449392712550607,
"grad_norm": 0.2721521258354187,
"learning_rate": 1.1703778677462888e-05,
"loss": 1.4122,
"mean_token_accuracy": 0.6505212604999542,
"num_tokens": 14017529.0,
"step": 2460
},
{
"entropy": 1.5344619274139404,
"epoch": 1.25,
"grad_norm": 0.2629392445087433,
"learning_rate": 1.1670040485829961e-05,
"loss": 1.5489,
"mean_token_accuracy": 0.6296425819396972,
"num_tokens": 14074333.0,
"step": 2470
},
{
"entropy": 1.4288833916187287,
"epoch": 1.2550607287449393,
"grad_norm": 0.28045085072517395,
"learning_rate": 1.1636302294197033e-05,
"loss": 1.4332,
"mean_token_accuracy": 0.6531016409397126,
"num_tokens": 14131260.0,
"step": 2480
},
{
"entropy": 1.4341704964637756,
"epoch": 1.2601214574898785,
"grad_norm": 0.27869343757629395,
"learning_rate": 1.1602564102564104e-05,
"loss": 1.4245,
"mean_token_accuracy": 0.6531503915786743,
"num_tokens": 14187704.0,
"step": 2490
},
{
"entropy": 1.5492194533348083,
"epoch": 1.2651821862348178,
"grad_norm": 0.3610108494758606,
"learning_rate": 1.1568825910931173e-05,
"loss": 1.5493,
"mean_token_accuracy": 0.6251341938972473,
"num_tokens": 14244227.0,
"step": 2500
},
{
"entropy": 1.4428314566612244,
"epoch": 1.2702429149797572,
"grad_norm": 0.2730664908885956,
"learning_rate": 1.1535087719298246e-05,
"loss": 1.4481,
"mean_token_accuracy": 0.6439902603626251,
"num_tokens": 14301363.0,
"step": 2510
},
{
"entropy": 1.6202573895454406,
"epoch": 1.2753036437246963,
"grad_norm": 0.2632329761981964,
"learning_rate": 1.1501349527665317e-05,
"loss": 1.6394,
"mean_token_accuracy": 0.6166090041399002,
"num_tokens": 14358360.0,
"step": 2520
},
{
"entropy": 1.4789348363876342,
"epoch": 1.2803643724696356,
"grad_norm": 0.31635069847106934,
"learning_rate": 1.1467611336032389e-05,
"loss": 1.4909,
"mean_token_accuracy": 0.6398876368999481,
"num_tokens": 14414169.0,
"step": 2530
},
{
"entropy": 1.5108978629112244,
"epoch": 1.285425101214575,
"grad_norm": 0.32884782552719116,
"learning_rate": 1.1433873144399461e-05,
"loss": 1.5177,
"mean_token_accuracy": 0.6348686575889587,
"num_tokens": 14475715.0,
"step": 2540
},
{
"entropy": 1.419902467727661,
"epoch": 1.290485829959514,
"grad_norm": 0.2587096095085144,
"learning_rate": 1.1400134952766533e-05,
"loss": 1.4162,
"mean_token_accuracy": 0.6549311280250549,
"num_tokens": 14534625.0,
"step": 2550
},
{
"entropy": 1.4202989101409913,
"epoch": 1.2955465587044535,
"grad_norm": 0.3693634271621704,
"learning_rate": 1.1366396761133604e-05,
"loss": 1.4086,
"mean_token_accuracy": 0.6512441515922547,
"num_tokens": 14587225.0,
"step": 2560
},
{
"entropy": 1.646610152721405,
"epoch": 1.3006072874493926,
"grad_norm": 0.2674924433231354,
"learning_rate": 1.1332658569500677e-05,
"loss": 1.6561,
"mean_token_accuracy": 0.6122437655925751,
"num_tokens": 14645474.0,
"step": 2570
},
{
"entropy": 1.5521462559700012,
"epoch": 1.305668016194332,
"grad_norm": 0.2970985770225525,
"learning_rate": 1.1298920377867748e-05,
"loss": 1.5528,
"mean_token_accuracy": 0.6343778431415558,
"num_tokens": 14702700.0,
"step": 2580
},
{
"entropy": 1.556568205356598,
"epoch": 1.3107287449392713,
"grad_norm": 0.2645126283168793,
"learning_rate": 1.1265182186234818e-05,
"loss": 1.5629,
"mean_token_accuracy": 0.6288919091224671,
"num_tokens": 14757931.0,
"step": 2590
},
{
"entropy": 1.4472679018974304,
"epoch": 1.3157894736842106,
"grad_norm": 0.2335396409034729,
"learning_rate": 1.1231443994601889e-05,
"loss": 1.4551,
"mean_token_accuracy": 0.6467409670352936,
"num_tokens": 14814936.0,
"step": 2600
},
{
"entropy": 1.5397544741630553,
"epoch": 1.3208502024291497,
"grad_norm": 0.2709454298019409,
"learning_rate": 1.1197705802968962e-05,
"loss": 1.5446,
"mean_token_accuracy": 0.6299596786499023,
"num_tokens": 14875733.0,
"step": 2610
},
{
"entropy": 1.4886646032333375,
"epoch": 1.325910931174089,
"grad_norm": 0.35333138704299927,
"learning_rate": 1.1163967611336033e-05,
"loss": 1.4863,
"mean_token_accuracy": 0.63787921667099,
"num_tokens": 14929765.0,
"step": 2620
},
{
"entropy": 1.453588593006134,
"epoch": 1.3309716599190282,
"grad_norm": 0.27809369564056396,
"learning_rate": 1.1130229419703104e-05,
"loss": 1.4697,
"mean_token_accuracy": 0.6435807704925537,
"num_tokens": 14990890.0,
"step": 2630
},
{
"entropy": 1.5616032361984253,
"epoch": 1.3360323886639676,
"grad_norm": 0.30011820793151855,
"learning_rate": 1.1096491228070177e-05,
"loss": 1.5712,
"mean_token_accuracy": 0.6284253001213074,
"num_tokens": 15051025.0,
"step": 2640
},
{
"entropy": 1.5883211970329285,
"epoch": 1.341093117408907,
"grad_norm": 0.2934761345386505,
"learning_rate": 1.1062753036437248e-05,
"loss": 1.5979,
"mean_token_accuracy": 0.6245935201644898,
"num_tokens": 15108003.0,
"step": 2650
},
{
"entropy": 1.6321449398994445,
"epoch": 1.3461538461538463,
"grad_norm": 0.2740587890148163,
"learning_rate": 1.102901484480432e-05,
"loss": 1.6281,
"mean_token_accuracy": 0.6189014375209808,
"num_tokens": 15164177.0,
"step": 2660
},
{
"entropy": 1.5974322438240052,
"epoch": 1.3512145748987854,
"grad_norm": 0.26599040627479553,
"learning_rate": 1.0995276653171392e-05,
"loss": 1.5986,
"mean_token_accuracy": 0.6264194548130035,
"num_tokens": 15219699.0,
"step": 2670
},
{
"entropy": 1.6627001881599426,
"epoch": 1.3562753036437247,
"grad_norm": 0.35696741938591003,
"learning_rate": 1.0961538461538464e-05,
"loss": 1.6903,
"mean_token_accuracy": 0.6077935576438904,
"num_tokens": 15275379.0,
"step": 2680
},
{
"entropy": 1.4766412138938905,
"epoch": 1.3613360323886639,
"grad_norm": 0.32456570863723755,
"learning_rate": 1.0927800269905533e-05,
"loss": 1.473,
"mean_token_accuracy": 0.6439019083976746,
"num_tokens": 15334742.0,
"step": 2690
},
{
"entropy": 1.534727895259857,
"epoch": 1.3663967611336032,
"grad_norm": 0.30418887734413147,
"learning_rate": 1.0894062078272604e-05,
"loss": 1.5354,
"mean_token_accuracy": 0.6321536242961884,
"num_tokens": 15384467.0,
"step": 2700
},
{
"entropy": 1.4464212298393249,
"epoch": 1.3714574898785425,
"grad_norm": 0.2574264407157898,
"learning_rate": 1.0860323886639677e-05,
"loss": 1.4397,
"mean_token_accuracy": 0.6525548756122589,
"num_tokens": 15446519.0,
"step": 2710
},
{
"entropy": 1.5958146333694458,
"epoch": 1.376518218623482,
"grad_norm": 0.28892847895622253,
"learning_rate": 1.0826585695006748e-05,
"loss": 1.5915,
"mean_token_accuracy": 0.6221986651420593,
"num_tokens": 15505401.0,
"step": 2720
},
{
"entropy": 1.6138377904891967,
"epoch": 1.381578947368421,
"grad_norm": 0.2827686667442322,
"learning_rate": 1.079284750337382e-05,
"loss": 1.6357,
"mean_token_accuracy": 0.622738265991211,
"num_tokens": 15563279.0,
"step": 2730
},
{
"entropy": 1.54440039396286,
"epoch": 1.3866396761133604,
"grad_norm": 0.2887682318687439,
"learning_rate": 1.0759109311740893e-05,
"loss": 1.5273,
"mean_token_accuracy": 0.6313063859939575,
"num_tokens": 15618960.0,
"step": 2740
},
{
"entropy": 1.4138375759124755,
"epoch": 1.3917004048582995,
"grad_norm": 0.36498573422431946,
"learning_rate": 1.0725371120107964e-05,
"loss": 1.4189,
"mean_token_accuracy": 0.6591821730136871,
"num_tokens": 15670366.0,
"step": 2750
},
{
"entropy": 1.4741955041885375,
"epoch": 1.3967611336032388,
"grad_norm": 0.3496224284172058,
"learning_rate": 1.0691632928475035e-05,
"loss": 1.4764,
"mean_token_accuracy": 0.6401580095291137,
"num_tokens": 15724201.0,
"step": 2760
},
{
"entropy": 1.5501426219940186,
"epoch": 1.4018218623481782,
"grad_norm": 0.26639312505722046,
"learning_rate": 1.0657894736842108e-05,
"loss": 1.546,
"mean_token_accuracy": 0.6274727523326874,
"num_tokens": 15783636.0,
"step": 2770
},
{
"entropy": 1.557990849018097,
"epoch": 1.4068825910931175,
"grad_norm": 0.34502512216567993,
"learning_rate": 1.0624156545209177e-05,
"loss": 1.5781,
"mean_token_accuracy": 0.6323262035846711,
"num_tokens": 15841694.0,
"step": 2780
},
{
"entropy": 1.455728328227997,
"epoch": 1.4119433198380567,
"grad_norm": 0.2952381372451782,
"learning_rate": 1.0590418353576249e-05,
"loss": 1.4547,
"mean_token_accuracy": 0.6422502875328064,
"num_tokens": 15896343.0,
"step": 2790
},
{
"entropy": 1.6696131229400635,
"epoch": 1.417004048582996,
"grad_norm": 0.2534728944301605,
"learning_rate": 1.055668016194332e-05,
"loss": 1.6816,
"mean_token_accuracy": 0.607040387392044,
"num_tokens": 15952655.0,
"step": 2800
},
{
"entropy": 1.3929704070091247,
"epoch": 1.4220647773279351,
"grad_norm": 0.2545351982116699,
"learning_rate": 1.0522941970310391e-05,
"loss": 1.3815,
"mean_token_accuracy": 0.6631879568099975,
"num_tokens": 16009017.0,
"step": 2810
},
{
"entropy": 1.4616627931594848,
"epoch": 1.4271255060728745,
"grad_norm": 0.29235726594924927,
"learning_rate": 1.0489203778677464e-05,
"loss": 1.469,
"mean_token_accuracy": 0.6424239039421081,
"num_tokens": 16064974.0,
"step": 2820
},
{
"entropy": 1.5947192907333374,
"epoch": 1.4321862348178138,
"grad_norm": 0.4684313237667084,
"learning_rate": 1.0455465587044535e-05,
"loss": 1.6334,
"mean_token_accuracy": 0.620320850610733,
"num_tokens": 16121438.0,
"step": 2830
},
{
"entropy": 1.5052786350250245,
"epoch": 1.4372469635627532,
"grad_norm": 0.2901478707790375,
"learning_rate": 1.0421727395411606e-05,
"loss": 1.5228,
"mean_token_accuracy": 0.6432769238948822,
"num_tokens": 16177348.0,
"step": 2840
},
{
"entropy": 1.5745530486106873,
"epoch": 1.4423076923076923,
"grad_norm": 0.4461107552051544,
"learning_rate": 1.038798920377868e-05,
"loss": 1.5797,
"mean_token_accuracy": 0.628256207704544,
"num_tokens": 16233533.0,
"step": 2850
},
{
"entropy": 1.6525885105133056,
"epoch": 1.4473684210526316,
"grad_norm": 0.30729052424430847,
"learning_rate": 1.035425101214575e-05,
"loss": 1.659,
"mean_token_accuracy": 0.6156529784202576,
"num_tokens": 16288984.0,
"step": 2860
},
{
"entropy": 1.4813060998916625,
"epoch": 1.4524291497975708,
"grad_norm": 0.26118186116218567,
"learning_rate": 1.0320512820512822e-05,
"loss": 1.4694,
"mean_token_accuracy": 0.6394685864448547,
"num_tokens": 16347312.0,
"step": 2870
},
{
"entropy": 1.3725073099136353,
"epoch": 1.45748987854251,
"grad_norm": 0.24992327392101288,
"learning_rate": 1.0286774628879891e-05,
"loss": 1.3778,
"mean_token_accuracy": 0.6600593090057373,
"num_tokens": 16401182.0,
"step": 2880
},
{
"entropy": 1.5925581932067872,
"epoch": 1.4625506072874495,
"grad_norm": 0.3013634979724884,
"learning_rate": 1.0253036437246964e-05,
"loss": 1.5989,
"mean_token_accuracy": 0.6274087786674499,
"num_tokens": 16463180.0,
"step": 2890
},
{
"entropy": 1.395955240726471,
"epoch": 1.4676113360323888,
"grad_norm": 0.2821931540966034,
"learning_rate": 1.0219298245614035e-05,
"loss": 1.3955,
"mean_token_accuracy": 0.6572477340698242,
"num_tokens": 16524984.0,
"step": 2900
},
{
"entropy": 1.493795931339264,
"epoch": 1.472672064777328,
"grad_norm": 0.27723386883735657,
"learning_rate": 1.0185560053981107e-05,
"loss": 1.4988,
"mean_token_accuracy": 0.6318200826644897,
"num_tokens": 16585454.0,
"step": 2910
},
{
"entropy": 1.608326256275177,
"epoch": 1.4777327935222673,
"grad_norm": 0.24880221486091614,
"learning_rate": 1.015182186234818e-05,
"loss": 1.6037,
"mean_token_accuracy": 0.6237947404384613,
"num_tokens": 16642878.0,
"step": 2920
},
{
"entropy": 1.4563136458396913,
"epoch": 1.4827935222672064,
"grad_norm": 0.2714000940322876,
"learning_rate": 1.011808367071525e-05,
"loss": 1.4609,
"mean_token_accuracy": 0.6409155547618866,
"num_tokens": 16697425.0,
"step": 2930
},
{
"entropy": 1.4760780036449432,
"epoch": 1.4878542510121457,
"grad_norm": 0.3031882047653198,
"learning_rate": 1.0084345479082322e-05,
"loss": 1.4802,
"mean_token_accuracy": 0.6450917005538941,
"num_tokens": 16760118.0,
"step": 2940
},
{
"entropy": 1.493908405303955,
"epoch": 1.492914979757085,
"grad_norm": 0.2621052861213684,
"learning_rate": 1.0050607287449395e-05,
"loss": 1.4918,
"mean_token_accuracy": 0.6401423692703248,
"num_tokens": 16813749.0,
"step": 2950
},
{
"entropy": 1.6856267690658568,
"epoch": 1.4979757085020242,
"grad_norm": 0.26623499393463135,
"learning_rate": 1.0016869095816466e-05,
"loss": 1.6777,
"mean_token_accuracy": 0.6135709464550019,
"num_tokens": 16874289.0,
"step": 2960
},
{
"entropy": 1.4696342468261718,
"epoch": 1.5030364372469636,
"grad_norm": 0.2687808871269226,
"learning_rate": 9.983130904183537e-06,
"loss": 1.4727,
"mean_token_accuracy": 0.6447311758995056,
"num_tokens": 16930145.0,
"step": 2970
},
{
"entropy": 1.4744965791702271,
"epoch": 1.5080971659919027,
"grad_norm": 0.23845624923706055,
"learning_rate": 9.949392712550608e-06,
"loss": 1.4721,
"mean_token_accuracy": 0.645156466960907,
"num_tokens": 16984053.0,
"step": 2980
},
{
"entropy": 1.4356729149818421,
"epoch": 1.513157894736842,
"grad_norm": 0.3086620271205902,
"learning_rate": 9.91565452091768e-06,
"loss": 1.4271,
"mean_token_accuracy": 0.6454346477985382,
"num_tokens": 17040474.0,
"step": 2990
},
{
"entropy": 1.4318643450736999,
"epoch": 1.5182186234817814,
"grad_norm": 0.31296011805534363,
"learning_rate": 9.881916329284751e-06,
"loss": 1.4284,
"mean_token_accuracy": 0.6570405125617981,
"num_tokens": 17091443.0,
"step": 3000
},
{
"entropy": 1.486535382270813,
"epoch": 1.5232793522267207,
"grad_norm": 0.24280501902103424,
"learning_rate": 9.848178137651822e-06,
"loss": 1.4782,
"mean_token_accuracy": 0.6421392917633056,
"num_tokens": 17145789.0,
"step": 3010
},
{
"entropy": 1.3536667227745056,
"epoch": 1.52834008097166,
"grad_norm": 0.3393391966819763,
"learning_rate": 9.814439946018895e-06,
"loss": 1.3665,
"mean_token_accuracy": 0.659710270166397,
"num_tokens": 17200045.0,
"step": 3020
},
{
"entropy": 1.477955400943756,
"epoch": 1.5334008097165992,
"grad_norm": 0.2695980668067932,
"learning_rate": 9.780701754385966e-06,
"loss": 1.4773,
"mean_token_accuracy": 0.6442347228527069,
"num_tokens": 17253382.0,
"step": 3030
},
{
"entropy": 1.4808340609073638,
"epoch": 1.5384615384615383,
"grad_norm": 0.32629549503326416,
"learning_rate": 9.746963562753037e-06,
"loss": 1.487,
"mean_token_accuracy": 0.6431676924228669,
"num_tokens": 17306138.0,
"step": 3040
},
{
"entropy": 1.4529295325279237,
"epoch": 1.5435222672064777,
"grad_norm": 0.2536776661872864,
"learning_rate": 9.713225371120109e-06,
"loss": 1.4591,
"mean_token_accuracy": 0.6488350391387939,
"num_tokens": 17368255.0,
"step": 3050
},
{
"entropy": 1.438970947265625,
"epoch": 1.548582995951417,
"grad_norm": 0.26340344548225403,
"learning_rate": 9.67948717948718e-06,
"loss": 1.4513,
"mean_token_accuracy": 0.6449286341667175,
"num_tokens": 17426575.0,
"step": 3060
},
{
"entropy": 1.709315264225006,
"epoch": 1.5536437246963564,
"grad_norm": 0.31817519664764404,
"learning_rate": 9.645748987854253e-06,
"loss": 1.7215,
"mean_token_accuracy": 0.6018387496471405,
"num_tokens": 17488131.0,
"step": 3070
},
{
"entropy": 1.5443035364151,
"epoch": 1.5587044534412957,
"grad_norm": 0.3266107141971588,
"learning_rate": 9.612010796221324e-06,
"loss": 1.5511,
"mean_token_accuracy": 0.63644158244133,
"num_tokens": 17546517.0,
"step": 3080
},
{
"entropy": 1.6439727783203124,
"epoch": 1.5637651821862348,
"grad_norm": 0.25957760214805603,
"learning_rate": 9.578272604588395e-06,
"loss": 1.6584,
"mean_token_accuracy": 0.6159623801708222,
"num_tokens": 17605712.0,
"step": 3090
},
{
"entropy": 1.5427301168441772,
"epoch": 1.568825910931174,
"grad_norm": 0.27618587017059326,
"learning_rate": 9.544534412955466e-06,
"loss": 1.5451,
"mean_token_accuracy": 0.6260794997215271,
"num_tokens": 17659721.0,
"step": 3100
},
{
"entropy": 1.383056926727295,
"epoch": 1.5738866396761133,
"grad_norm": 0.3027380406856537,
"learning_rate": 9.510796221322538e-06,
"loss": 1.3743,
"mean_token_accuracy": 0.6610878467559814,
"num_tokens": 17714100.0,
"step": 3110
},
{
"entropy": 1.503647792339325,
"epoch": 1.5789473684210527,
"grad_norm": 0.29517048597335815,
"learning_rate": 9.47705802968961e-06,
"loss": 1.5053,
"mean_token_accuracy": 0.6365588068962097,
"num_tokens": 17771308.0,
"step": 3120
},
{
"entropy": 1.548321044445038,
"epoch": 1.584008097165992,
"grad_norm": 0.255573570728302,
"learning_rate": 9.44331983805668e-06,
"loss": 1.5523,
"mean_token_accuracy": 0.6278903543949127,
"num_tokens": 17828285.0,
"step": 3130
},
{
"entropy": 1.5210648417472838,
"epoch": 1.5890688259109311,
"grad_norm": 0.3004836142063141,
"learning_rate": 9.409581646423753e-06,
"loss": 1.5331,
"mean_token_accuracy": 0.6306875169277191,
"num_tokens": 17888745.0,
"step": 3140
},
{
"entropy": 1.4695003390312196,
"epoch": 1.5941295546558705,
"grad_norm": 0.2813291549682617,
"learning_rate": 9.375843454790824e-06,
"loss": 1.4673,
"mean_token_accuracy": 0.6395570158958435,
"num_tokens": 17949494.0,
"step": 3150
},
{
"entropy": 1.4948333382606507,
"epoch": 1.5991902834008096,
"grad_norm": 0.3244977593421936,
"learning_rate": 9.342105263157895e-06,
"loss": 1.5044,
"mean_token_accuracy": 0.6397220313549041,
"num_tokens": 18006803.0,
"step": 3160
},
{
"entropy": 1.4767319202423095,
"epoch": 1.604251012145749,
"grad_norm": 0.2612328827381134,
"learning_rate": 9.308367071524967e-06,
"loss": 1.4795,
"mean_token_accuracy": 0.6446912109851837,
"num_tokens": 18062396.0,
"step": 3170
},
{
"entropy": 1.5266436815261841,
"epoch": 1.6093117408906883,
"grad_norm": 0.3239694833755493,
"learning_rate": 9.274628879892038e-06,
"loss": 1.5418,
"mean_token_accuracy": 0.6299160838127136,
"num_tokens": 18110883.0,
"step": 3180
},
{
"entropy": 1.4701735734939576,
"epoch": 1.6143724696356276,
"grad_norm": 0.2672860324382782,
"learning_rate": 9.240890688259109e-06,
"loss": 1.4503,
"mean_token_accuracy": 0.6537446200847625,
"num_tokens": 18159767.0,
"step": 3190
},
{
"entropy": 1.4009694814682008,
"epoch": 1.6194331983805668,
"grad_norm": 0.29456961154937744,
"learning_rate": 9.207152496626182e-06,
"loss": 1.4045,
"mean_token_accuracy": 0.6546292185783387,
"num_tokens": 18217624.0,
"step": 3200
},
{
"entropy": 1.6173853397369384,
"epoch": 1.624493927125506,
"grad_norm": 0.30044859647750854,
"learning_rate": 9.173414304993253e-06,
"loss": 1.6125,
"mean_token_accuracy": 0.6170080423355102,
"num_tokens": 18277255.0,
"step": 3210
},
{
"entropy": 1.5926665306091308,
"epoch": 1.6295546558704452,
"grad_norm": 0.29986920952796936,
"learning_rate": 9.139676113360324e-06,
"loss": 1.6003,
"mean_token_accuracy": 0.6278518795967102,
"num_tokens": 18335766.0,
"step": 3220
},
{
"entropy": 1.4131199598312378,
"epoch": 1.6346153846153846,
"grad_norm": 0.33528971672058105,
"learning_rate": 9.105937921727396e-06,
"loss": 1.4244,
"mean_token_accuracy": 0.6553650915622711,
"num_tokens": 18392231.0,
"step": 3230
},
{
"entropy": 1.5079811573028565,
"epoch": 1.639676113360324,
"grad_norm": 0.32541990280151367,
"learning_rate": 9.072199730094467e-06,
"loss": 1.5182,
"mean_token_accuracy": 0.6279927968978882,
"num_tokens": 18447207.0,
"step": 3240
},
{
"entropy": 1.5234640002250672,
"epoch": 1.6447368421052633,
"grad_norm": 0.2562153935432434,
"learning_rate": 9.03846153846154e-06,
"loss": 1.5149,
"mean_token_accuracy": 0.6328702330589294,
"num_tokens": 18505601.0,
"step": 3250
},
{
"entropy": 1.6198287844657897,
"epoch": 1.6497975708502024,
"grad_norm": 0.3361916244029999,
"learning_rate": 9.004723346828611e-06,
"loss": 1.6255,
"mean_token_accuracy": 0.61873180270195,
"num_tokens": 18558902.0,
"step": 3260
},
{
"entropy": 1.4600189566612243,
"epoch": 1.6548582995951417,
"grad_norm": 0.304756760597229,
"learning_rate": 8.970985155195682e-06,
"loss": 1.4471,
"mean_token_accuracy": 0.6477857530117035,
"num_tokens": 18619635.0,
"step": 3270
},
{
"entropy": 1.5648074269294738,
"epoch": 1.6599190283400809,
"grad_norm": 0.30415093898773193,
"learning_rate": 8.937246963562753e-06,
"loss": 1.5767,
"mean_token_accuracy": 0.6242169559001922,
"num_tokens": 18674088.0,
"step": 3280
},
{
"entropy": 1.5605645179748535,
"epoch": 1.6649797570850202,
"grad_norm": 0.26909834146499634,
"learning_rate": 8.903508771929825e-06,
"loss": 1.5605,
"mean_token_accuracy": 0.635642808675766,
"num_tokens": 18734453.0,
"step": 3290
},
{
"entropy": 1.5742518544197082,
"epoch": 1.6700404858299596,
"grad_norm": 0.2826893925666809,
"learning_rate": 8.869770580296898e-06,
"loss": 1.5643,
"mean_token_accuracy": 0.6317550718784333,
"num_tokens": 18793346.0,
"step": 3300
},
{
"entropy": 1.6354934215545653,
"epoch": 1.675101214574899,
"grad_norm": 0.2833310067653656,
"learning_rate": 8.836032388663969e-06,
"loss": 1.6417,
"mean_token_accuracy": 0.6152911841869354,
"num_tokens": 18845698.0,
"step": 3310
},
{
"entropy": 1.5844687461853026,
"epoch": 1.680161943319838,
"grad_norm": 0.3369496762752533,
"learning_rate": 8.80229419703104e-06,
"loss": 1.59,
"mean_token_accuracy": 0.6237683236598969,
"num_tokens": 18903498.0,
"step": 3320
},
{
"entropy": 1.570583975315094,
"epoch": 1.6852226720647774,
"grad_norm": 0.36443012952804565,
"learning_rate": 8.768556005398111e-06,
"loss": 1.5757,
"mean_token_accuracy": 0.6224404633045196,
"num_tokens": 18957652.0,
"step": 3330
},
{
"entropy": 1.6000779747962952,
"epoch": 1.6902834008097165,
"grad_norm": 0.32085222005844116,
"learning_rate": 8.734817813765182e-06,
"loss": 1.6067,
"mean_token_accuracy": 0.6194514989852905,
"num_tokens": 19018526.0,
"step": 3340
},
{
"entropy": 1.4028130412101745,
"epoch": 1.6953441295546559,
"grad_norm": 0.2869940996170044,
"learning_rate": 8.701079622132255e-06,
"loss": 1.41,
"mean_token_accuracy": 0.6613210141658783,
"num_tokens": 19073801.0,
"step": 3350
},
{
"entropy": 1.5723829984664917,
"epoch": 1.7004048582995952,
"grad_norm": 0.3251384496688843,
"learning_rate": 8.667341430499327e-06,
"loss": 1.5647,
"mean_token_accuracy": 0.6301065504550933,
"num_tokens": 19128923.0,
"step": 3360
},
{
"entropy": 1.5013223052024842,
"epoch": 1.7054655870445345,
"grad_norm": 0.30307453870773315,
"learning_rate": 8.633603238866398e-06,
"loss": 1.4962,
"mean_token_accuracy": 0.6359362661838531,
"num_tokens": 19186932.0,
"step": 3370
},
{
"entropy": 1.602057731151581,
"epoch": 1.7105263157894737,
"grad_norm": 0.369747132062912,
"learning_rate": 8.599865047233469e-06,
"loss": 1.5956,
"mean_token_accuracy": 0.6288884073495865,
"num_tokens": 19245687.0,
"step": 3380
},
{
"entropy": 1.5921403288841247,
"epoch": 1.7155870445344128,
"grad_norm": 0.2498423159122467,
"learning_rate": 8.56612685560054e-06,
"loss": 1.5971,
"mean_token_accuracy": 0.621882963180542,
"num_tokens": 19307247.0,
"step": 3390
},
{
"entropy": 1.5030475974082946,
"epoch": 1.7206477732793521,
"grad_norm": 0.3407726585865021,
"learning_rate": 8.532388663967613e-06,
"loss": 1.5109,
"mean_token_accuracy": 0.6346028625965119,
"num_tokens": 19367320.0,
"step": 3400
},
{
"entropy": 1.4825459122657776,
"epoch": 1.7257085020242915,
"grad_norm": 0.27978742122650146,
"learning_rate": 8.498650472334684e-06,
"loss": 1.4831,
"mean_token_accuracy": 0.6394042372703552,
"num_tokens": 19429919.0,
"step": 3410
},
{
"entropy": 1.4931324481964112,
"epoch": 1.7307692307692308,
"grad_norm": 0.288116455078125,
"learning_rate": 8.464912280701755e-06,
"loss": 1.4957,
"mean_token_accuracy": 0.6380782008171082,
"num_tokens": 19485577.0,
"step": 3420
},
{
"entropy": 1.4949661374092102,
"epoch": 1.7358299595141702,
"grad_norm": 0.31869447231292725,
"learning_rate": 8.431174089068827e-06,
"loss": 1.4926,
"mean_token_accuracy": 0.6403613984584808,
"num_tokens": 19541077.0,
"step": 3430
},
{
"entropy": 1.596668827533722,
"epoch": 1.7408906882591093,
"grad_norm": 0.28910359740257263,
"learning_rate": 8.397435897435898e-06,
"loss": 1.6038,
"mean_token_accuracy": 0.6189565002918244,
"num_tokens": 19605457.0,
"step": 3440
},
{
"entropy": 1.4728464007377624,
"epoch": 1.7459514170040484,
"grad_norm": 0.27498626708984375,
"learning_rate": 8.36369770580297e-06,
"loss": 1.485,
"mean_token_accuracy": 0.6360372960567474,
"num_tokens": 19667915.0,
"step": 3450
},
{
"entropy": 1.5345482110977173,
"epoch": 1.7510121457489878,
"grad_norm": 0.2618876099586487,
"learning_rate": 8.32995951417004e-06,
"loss": 1.5381,
"mean_token_accuracy": 0.6326977252960205,
"num_tokens": 19725030.0,
"step": 3460
},
{
"entropy": 1.393745517730713,
"epoch": 1.7560728744939271,
"grad_norm": 0.28456103801727295,
"learning_rate": 8.296221322537113e-06,
"loss": 1.3836,
"mean_token_accuracy": 0.6631475329399109,
"num_tokens": 19784982.0,
"step": 3470
},
{
"entropy": 1.3884447813034058,
"epoch": 1.7611336032388665,
"grad_norm": 0.27543261647224426,
"learning_rate": 8.262483130904184e-06,
"loss": 1.3847,
"mean_token_accuracy": 0.6586002767086029,
"num_tokens": 19848002.0,
"step": 3480
},
{
"entropy": 1.4454799056053163,
"epoch": 1.7661943319838058,
"grad_norm": 0.36814162135124207,
"learning_rate": 8.228744939271256e-06,
"loss": 1.455,
"mean_token_accuracy": 0.6476415753364563,
"num_tokens": 19906592.0,
"step": 3490
},
{
"entropy": 1.3266122221946717,
"epoch": 1.771255060728745,
"grad_norm": 0.2580831050872803,
"learning_rate": 8.195006747638327e-06,
"loss": 1.329,
"mean_token_accuracy": 0.6699170589447021,
"num_tokens": 19963138.0,
"step": 3500
},
{
"entropy": 1.670899212360382,
"epoch": 1.776315789473684,
"grad_norm": 0.29895538091659546,
"learning_rate": 8.161268556005398e-06,
"loss": 1.6755,
"mean_token_accuracy": 0.613396269083023,
"num_tokens": 20021345.0,
"step": 3510
},
{
"entropy": 1.354777181148529,
"epoch": 1.7813765182186234,
"grad_norm": 0.34177860617637634,
"learning_rate": 8.12753036437247e-06,
"loss": 1.3456,
"mean_token_accuracy": 0.6686203420162201,
"num_tokens": 20072875.0,
"step": 3520
},
{
"entropy": 1.5505508065223694,
"epoch": 1.7864372469635628,
"grad_norm": 0.2592535614967346,
"learning_rate": 8.093792172739542e-06,
"loss": 1.5535,
"mean_token_accuracy": 0.6263529658317566,
"num_tokens": 20132207.0,
"step": 3530
},
{
"entropy": 1.485759150981903,
"epoch": 1.791497975708502,
"grad_norm": 0.2742493450641632,
"learning_rate": 8.060053981106613e-06,
"loss": 1.4964,
"mean_token_accuracy": 0.635893827676773,
"num_tokens": 20195010.0,
"step": 3540
},
{
"entropy": 1.5584338903427124,
"epoch": 1.7965587044534415,
"grad_norm": 0.2946804463863373,
"learning_rate": 8.026315789473685e-06,
"loss": 1.5553,
"mean_token_accuracy": 0.6236848413944245,
"num_tokens": 20257540.0,
"step": 3550
},
{
"entropy": 1.5169085144996644,
"epoch": 1.8016194331983806,
"grad_norm": 0.26114436984062195,
"learning_rate": 7.992577597840756e-06,
"loss": 1.5138,
"mean_token_accuracy": 0.6321025729179383,
"num_tokens": 20318045.0,
"step": 3560
},
{
"entropy": 1.337432289123535,
"epoch": 1.8066801619433197,
"grad_norm": 0.29184892773628235,
"learning_rate": 7.958839406207827e-06,
"loss": 1.3471,
"mean_token_accuracy": 0.6645301103591919,
"num_tokens": 20373587.0,
"step": 3570
},
{
"entropy": 1.5995257258415223,
"epoch": 1.811740890688259,
"grad_norm": 0.3016499876976013,
"learning_rate": 7.9251012145749e-06,
"loss": 1.5929,
"mean_token_accuracy": 0.6236974120140075,
"num_tokens": 20431451.0,
"step": 3580
},
{
"entropy": 1.623330581188202,
"epoch": 1.8168016194331984,
"grad_norm": 0.35448580980300903,
"learning_rate": 7.891363022941971e-06,
"loss": 1.6129,
"mean_token_accuracy": 0.6200532436370849,
"num_tokens": 20487984.0,
"step": 3590
},
{
"entropy": 1.5125982403755187,
"epoch": 1.8218623481781377,
"grad_norm": 0.32799309492111206,
"learning_rate": 7.857624831309042e-06,
"loss": 1.5025,
"mean_token_accuracy": 0.6384036839008331,
"num_tokens": 20541725.0,
"step": 3600
},
{
"entropy": 1.53478661775589,
"epoch": 1.8269230769230769,
"grad_norm": 0.32730069756507874,
"learning_rate": 7.823886639676114e-06,
"loss": 1.5294,
"mean_token_accuracy": 0.6311649143695831,
"num_tokens": 20600145.0,
"step": 3610
},
{
"entropy": 1.5410036087036132,
"epoch": 1.8319838056680162,
"grad_norm": 0.3669460117816925,
"learning_rate": 7.790148448043185e-06,
"loss": 1.5537,
"mean_token_accuracy": 0.6282461225986481,
"num_tokens": 20655732.0,
"step": 3620
},
{
"entropy": 1.453836238384247,
"epoch": 1.8370445344129553,
"grad_norm": 0.31468528509140015,
"learning_rate": 7.756410256410258e-06,
"loss": 1.4568,
"mean_token_accuracy": 0.6447117567062378,
"num_tokens": 20712525.0,
"step": 3630
},
{
"entropy": 1.475819957256317,
"epoch": 1.8421052631578947,
"grad_norm": 0.29064053297042847,
"learning_rate": 7.722672064777329e-06,
"loss": 1.4821,
"mean_token_accuracy": 0.6439218640327453,
"num_tokens": 20768387.0,
"step": 3640
},
{
"entropy": 1.4041451275348664,
"epoch": 1.847165991902834,
"grad_norm": 0.2812243700027466,
"learning_rate": 7.6889338731444e-06,
"loss": 1.4044,
"mean_token_accuracy": 0.6594688057899475,
"num_tokens": 20826082.0,
"step": 3650
},
{
"entropy": 1.6357195615768432,
"epoch": 1.8522267206477734,
"grad_norm": 0.2777828276157379,
"learning_rate": 7.655195681511471e-06,
"loss": 1.6412,
"mean_token_accuracy": 0.6139614999294281,
"num_tokens": 20882894.0,
"step": 3660
},
{
"entropy": 1.535701298713684,
"epoch": 1.8572874493927125,
"grad_norm": 0.3234771490097046,
"learning_rate": 7.6214574898785435e-06,
"loss": 1.5333,
"mean_token_accuracy": 0.6341882109642029,
"num_tokens": 20938122.0,
"step": 3670
},
{
"entropy": 1.5881774067878722,
"epoch": 1.8623481781376519,
"grad_norm": 0.3148040175437927,
"learning_rate": 7.587719298245615e-06,
"loss": 1.6014,
"mean_token_accuracy": 0.6275585472583771,
"num_tokens": 20995824.0,
"step": 3680
},
{
"entropy": 1.3315507769584656,
"epoch": 1.867408906882591,
"grad_norm": 0.327178031206131,
"learning_rate": 7.553981106612687e-06,
"loss": 1.3346,
"mean_token_accuracy": 0.668057644367218,
"num_tokens": 21050607.0,
"step": 3690
},
{
"entropy": 1.5097809910774231,
"epoch": 1.8724696356275303,
"grad_norm": 0.29023247957229614,
"learning_rate": 7.520242914979757e-06,
"loss": 1.5045,
"mean_token_accuracy": 0.6351129233837127,
"num_tokens": 21106066.0,
"step": 3700
},
{
"entropy": 1.4823681235313415,
"epoch": 1.8775303643724697,
"grad_norm": 0.3215828537940979,
"learning_rate": 7.486504723346829e-06,
"loss": 1.4818,
"mean_token_accuracy": 0.6453963398933411,
"num_tokens": 21164981.0,
"step": 3710
},
{
"entropy": 1.4289534091949463,
"epoch": 1.882591093117409,
"grad_norm": 0.3170277178287506,
"learning_rate": 7.452766531713901e-06,
"loss": 1.446,
"mean_token_accuracy": 0.6434959769248962,
"num_tokens": 21223777.0,
"step": 3720
},
{
"entropy": 1.5202300190925597,
"epoch": 1.8876518218623481,
"grad_norm": 0.2913142740726471,
"learning_rate": 7.4190283400809725e-06,
"loss": 1.5349,
"mean_token_accuracy": 0.6304753959178925,
"num_tokens": 21279646.0,
"step": 3730
},
{
"entropy": 1.6774636268615724,
"epoch": 1.8927125506072875,
"grad_norm": 0.33726078271865845,
"learning_rate": 7.385290148448044e-06,
"loss": 1.6783,
"mean_token_accuracy": 0.6076300263404846,
"num_tokens": 21335265.0,
"step": 3740
},
{
"entropy": 1.5423774600028992,
"epoch": 1.8977732793522266,
"grad_norm": 0.27264466881752014,
"learning_rate": 7.351551956815115e-06,
"loss": 1.5533,
"mean_token_accuracy": 0.6308148026466369,
"num_tokens": 21396070.0,
"step": 3750
},
{
"entropy": 1.4624953866004944,
"epoch": 1.902834008097166,
"grad_norm": 0.35332223773002625,
"learning_rate": 7.317813765182187e-06,
"loss": 1.4655,
"mean_token_accuracy": 0.641634488105774,
"num_tokens": 21452996.0,
"step": 3760
},
{
"entropy": 1.4817042350769043,
"epoch": 1.9078947368421053,
"grad_norm": 0.3333725035190582,
"learning_rate": 7.284075573549258e-06,
"loss": 1.4903,
"mean_token_accuracy": 0.6411226511001586,
"num_tokens": 21508906.0,
"step": 3770
},
{
"entropy": 1.5518399238586427,
"epoch": 1.9129554655870447,
"grad_norm": 0.2960481643676758,
"learning_rate": 7.25033738191633e-06,
"loss": 1.5445,
"mean_token_accuracy": 0.6266183733940125,
"num_tokens": 21568967.0,
"step": 3780
},
{
"entropy": 1.5748514771461486,
"epoch": 1.9180161943319838,
"grad_norm": 0.31355923414230347,
"learning_rate": 7.216599190283401e-06,
"loss": 1.5716,
"mean_token_accuracy": 0.6256311893463135,
"num_tokens": 21631403.0,
"step": 3790
},
{
"entropy": 1.450837540626526,
"epoch": 1.9230769230769231,
"grad_norm": 0.27845069766044617,
"learning_rate": 7.182860998650473e-06,
"loss": 1.4611,
"mean_token_accuracy": 0.650393956899643,
"num_tokens": 21688727.0,
"step": 3800
},
{
"entropy": 1.611345076560974,
"epoch": 1.9281376518218623,
"grad_norm": 0.2685949206352234,
"learning_rate": 7.149122807017545e-06,
"loss": 1.6126,
"mean_token_accuracy": 0.6262206137180328,
"num_tokens": 21742484.0,
"step": 3810
},
{
"entropy": 1.3427100419998168,
"epoch": 1.9331983805668016,
"grad_norm": 0.41044095158576965,
"learning_rate": 7.115384615384616e-06,
"loss": 1.3418,
"mean_token_accuracy": 0.663384473323822,
"num_tokens": 21799804.0,
"step": 3820
},
{
"entropy": 1.4495494306087493,
"epoch": 1.938259109311741,
"grad_norm": 0.5138364434242249,
"learning_rate": 7.081646423751688e-06,
"loss": 1.4481,
"mean_token_accuracy": 0.6450947999954224,
"num_tokens": 21858581.0,
"step": 3830
},
{
"entropy": 1.3911212921142577,
"epoch": 1.9433198380566803,
"grad_norm": 0.29537278413772583,
"learning_rate": 7.047908232118758e-06,
"loss": 1.3992,
"mean_token_accuracy": 0.6531029522418976,
"num_tokens": 21915585.0,
"step": 3840
},
{
"entropy": 1.4535645723342896,
"epoch": 1.9483805668016194,
"grad_norm": 0.25756731629371643,
"learning_rate": 7.0141700404858304e-06,
"loss": 1.4401,
"mean_token_accuracy": 0.6463619887828826,
"num_tokens": 21976079.0,
"step": 3850
},
{
"entropy": 1.4952040553092956,
"epoch": 1.9534412955465585,
"grad_norm": 0.3046974539756775,
"learning_rate": 6.9804318488529025e-06,
"loss": 1.5097,
"mean_token_accuracy": 0.6341541647911072,
"num_tokens": 22035025.0,
"step": 3860
},
{
"entropy": 1.5177082777023316,
"epoch": 1.958502024291498,
"grad_norm": 0.3251610994338989,
"learning_rate": 6.946693657219974e-06,
"loss": 1.5163,
"mean_token_accuracy": 0.6359520852565765,
"num_tokens": 22092788.0,
"step": 3870
},
{
"entropy": 1.4667699456214904,
"epoch": 1.9635627530364372,
"grad_norm": 0.3152090311050415,
"learning_rate": 6.912955465587045e-06,
"loss": 1.4715,
"mean_token_accuracy": 0.6418612182140351,
"num_tokens": 22153745.0,
"step": 3880
},
{
"entropy": 1.6101324200630187,
"epoch": 1.9686234817813766,
"grad_norm": 0.340584933757782,
"learning_rate": 6.879217273954116e-06,
"loss": 1.6212,
"mean_token_accuracy": 0.6180540084838867,
"num_tokens": 22211817.0,
"step": 3890
},
{
"entropy": 1.459115242958069,
"epoch": 1.973684210526316,
"grad_norm": 0.2879182696342468,
"learning_rate": 6.845479082321188e-06,
"loss": 1.4419,
"mean_token_accuracy": 0.6466407418251038,
"num_tokens": 22265817.0,
"step": 3900
},
{
"entropy": 1.4101441740989684,
"epoch": 1.978744939271255,
"grad_norm": 0.3250649571418762,
"learning_rate": 6.81174089068826e-06,
"loss": 1.4063,
"mean_token_accuracy": 0.6551910638809204,
"num_tokens": 22324629.0,
"step": 3910
},
{
"entropy": 1.6089503526687623,
"epoch": 1.9838056680161942,
"grad_norm": 0.3786233961582184,
"learning_rate": 6.7780026990553315e-06,
"loss": 1.6147,
"mean_token_accuracy": 0.6272029399871826,
"num_tokens": 22381691.0,
"step": 3920
},
{
"entropy": 1.3815577149391174,
"epoch": 1.9888663967611335,
"grad_norm": 0.304582417011261,
"learning_rate": 6.744264507422402e-06,
"loss": 1.3759,
"mean_token_accuracy": 0.657072639465332,
"num_tokens": 22432987.0,
"step": 3930
},
{
"entropy": 1.6114310383796693,
"epoch": 1.9939271255060729,
"grad_norm": 0.3556569218635559,
"learning_rate": 6.710526315789474e-06,
"loss": 1.6089,
"mean_token_accuracy": 0.6203605115413666,
"num_tokens": 22491567.0,
"step": 3940
},
{
"entropy": 1.5013386726379394,
"epoch": 1.9989878542510122,
"grad_norm": 0.3433378040790558,
"learning_rate": 6.676788124156546e-06,
"loss": 1.497,
"mean_token_accuracy": 0.6365504443645478,
"num_tokens": 22548351.0,
"step": 3950
},
{
"entropy": 1.4863505601882934,
"epoch": 2.0040485829959516,
"grad_norm": 0.348243772983551,
"learning_rate": 6.643049932523617e-06,
"loss": 1.4864,
"mean_token_accuracy": 0.6374901950359344,
"num_tokens": 22596557.0,
"step": 3960
},
{
"entropy": 1.5316878080368042,
"epoch": 2.0091093117408905,
"grad_norm": 0.32034119963645935,
"learning_rate": 6.609311740890689e-06,
"loss": 1.538,
"mean_token_accuracy": 0.6406886577606201,
"num_tokens": 22656578.0,
"step": 3970
},
{
"entropy": 1.422401201725006,
"epoch": 2.01417004048583,
"grad_norm": 0.2935118079185486,
"learning_rate": 6.57557354925776e-06,
"loss": 1.4232,
"mean_token_accuracy": 0.6517488479614257,
"num_tokens": 22715169.0,
"step": 3980
},
{
"entropy": 1.4487539887428285,
"epoch": 2.019230769230769,
"grad_norm": 0.311564177274704,
"learning_rate": 6.541835357624832e-06,
"loss": 1.4388,
"mean_token_accuracy": 0.6472173929214478,
"num_tokens": 22772089.0,
"step": 3990
},
{
"entropy": 1.5003145456314086,
"epoch": 2.0242914979757085,
"grad_norm": 0.2912486493587494,
"learning_rate": 6.508097165991904e-06,
"loss": 1.5015,
"mean_token_accuracy": 0.6321758210659028,
"num_tokens": 22834505.0,
"step": 4000
},
{
"entropy": 1.4098521590232849,
"epoch": 2.029352226720648,
"grad_norm": 0.29250964522361755,
"learning_rate": 6.474358974358975e-06,
"loss": 1.4107,
"mean_token_accuracy": 0.6528907954692841,
"num_tokens": 22889105.0,
"step": 4010
},
{
"entropy": 1.4532611846923829,
"epoch": 2.034412955465587,
"grad_norm": 0.34667733311653137,
"learning_rate": 6.440620782726047e-06,
"loss": 1.4581,
"mean_token_accuracy": 0.6446337521076202,
"num_tokens": 22942406.0,
"step": 4020
},
{
"entropy": 1.5700780391693114,
"epoch": 2.039473684210526,
"grad_norm": 0.3028770685195923,
"learning_rate": 6.406882591093117e-06,
"loss": 1.5643,
"mean_token_accuracy": 0.6249816060066223,
"num_tokens": 22996028.0,
"step": 4030
},
{
"entropy": 1.6611987948417664,
"epoch": 2.0445344129554655,
"grad_norm": 0.30681440234184265,
"learning_rate": 6.3731443994601894e-06,
"loss": 1.6827,
"mean_token_accuracy": 0.6147861301898956,
"num_tokens": 23051645.0,
"step": 4040
},
{
"entropy": 1.4732018947601317,
"epoch": 2.049595141700405,
"grad_norm": 0.26491233706474304,
"learning_rate": 6.3394062078272615e-06,
"loss": 1.466,
"mean_token_accuracy": 0.6404920816421509,
"num_tokens": 23105066.0,
"step": 4050
},
{
"entropy": 1.5172441840171813,
"epoch": 2.054655870445344,
"grad_norm": 0.3094307780265808,
"learning_rate": 6.305668016194333e-06,
"loss": 1.5004,
"mean_token_accuracy": 0.6372400879859924,
"num_tokens": 23157352.0,
"step": 4060
},
{
"entropy": 1.422630524635315,
"epoch": 2.0597165991902835,
"grad_norm": 0.29695579409599304,
"learning_rate": 6.271929824561404e-06,
"loss": 1.428,
"mean_token_accuracy": 0.6465956628322601,
"num_tokens": 23212465.0,
"step": 4070
},
{
"entropy": 1.4499358654022216,
"epoch": 2.064777327935223,
"grad_norm": 0.3413025438785553,
"learning_rate": 6.238191632928475e-06,
"loss": 1.4555,
"mean_token_accuracy": 0.6432287812232971,
"num_tokens": 23268400.0,
"step": 4080
},
{
"entropy": 1.433293628692627,
"epoch": 2.0698380566801617,
"grad_norm": 0.27788856625556946,
"learning_rate": 6.204453441295547e-06,
"loss": 1.4404,
"mean_token_accuracy": 0.6448906004428864,
"num_tokens": 23330858.0,
"step": 4090
},
{
"entropy": 1.527322268486023,
"epoch": 2.074898785425101,
"grad_norm": 0.28372228145599365,
"learning_rate": 6.170715249662618e-06,
"loss": 1.5369,
"mean_token_accuracy": 0.6296894669532775,
"num_tokens": 23388049.0,
"step": 4100
},
{
"entropy": 1.654162836074829,
"epoch": 2.0799595141700404,
"grad_norm": 0.3283277451992035,
"learning_rate": 6.1369770580296905e-06,
"loss": 1.6652,
"mean_token_accuracy": 0.6081342697143555,
"num_tokens": 23450327.0,
"step": 4110
},
{
"entropy": 1.5552624464035034,
"epoch": 2.08502024291498,
"grad_norm": 0.3101661205291748,
"learning_rate": 6.103238866396761e-06,
"loss": 1.5571,
"mean_token_accuracy": 0.6288932502269745,
"num_tokens": 23507582.0,
"step": 4120
},
{
"entropy": 1.5187462210655212,
"epoch": 2.090080971659919,
"grad_norm": 0.26190704107284546,
"learning_rate": 6.069500674763833e-06,
"loss": 1.5231,
"mean_token_accuracy": 0.6347708106040955,
"num_tokens": 23570085.0,
"step": 4130
},
{
"entropy": 1.4180486440658568,
"epoch": 2.0951417004048585,
"grad_norm": 0.24935229122638702,
"learning_rate": 6.035762483130905e-06,
"loss": 1.4134,
"mean_token_accuracy": 0.6535919070243835,
"num_tokens": 23629729.0,
"step": 4140
},
{
"entropy": 1.5712830781936646,
"epoch": 2.1002024291497974,
"grad_norm": 0.28485989570617676,
"learning_rate": 6.002024291497976e-06,
"loss": 1.5661,
"mean_token_accuracy": 0.6283676266670227,
"num_tokens": 23686822.0,
"step": 4150
},
{
"entropy": 1.487233829498291,
"epoch": 2.1052631578947367,
"grad_norm": 0.3802538812160492,
"learning_rate": 5.968286099865048e-06,
"loss": 1.5071,
"mean_token_accuracy": 0.636066097021103,
"num_tokens": 23743196.0,
"step": 4160
},
{
"entropy": 1.485396420955658,
"epoch": 2.110323886639676,
"grad_norm": 0.37386566400527954,
"learning_rate": 5.934547908232119e-06,
"loss": 1.4772,
"mean_token_accuracy": 0.6422532796859741,
"num_tokens": 23798229.0,
"step": 4170
},
{
"entropy": 1.535237228870392,
"epoch": 2.1153846153846154,
"grad_norm": 0.26898157596588135,
"learning_rate": 5.900809716599191e-06,
"loss": 1.5333,
"mean_token_accuracy": 0.6358494937419892,
"num_tokens": 23852408.0,
"step": 4180
},
{
"entropy": 1.5727092146873474,
"epoch": 2.1204453441295548,
"grad_norm": 0.3571448028087616,
"learning_rate": 5.867071524966263e-06,
"loss": 1.5678,
"mean_token_accuracy": 0.6239661037921905,
"num_tokens": 23902266.0,
"step": 4190
},
{
"entropy": 1.5237385392189027,
"epoch": 2.125506072874494,
"grad_norm": 0.28321143984794617,
"learning_rate": 5.833333333333334e-06,
"loss": 1.5365,
"mean_token_accuracy": 0.6352564930915833,
"num_tokens": 23959815.0,
"step": 4200
},
{
"entropy": 1.5299026012420653,
"epoch": 2.130566801619433,
"grad_norm": 0.3400108218193054,
"learning_rate": 5.799595141700405e-06,
"loss": 1.519,
"mean_token_accuracy": 0.6339640021324158,
"num_tokens": 24012133.0,
"step": 4210
},
{
"entropy": 1.657011294364929,
"epoch": 2.1356275303643724,
"grad_norm": 0.3595241606235504,
"learning_rate": 5.765856950067476e-06,
"loss": 1.668,
"mean_token_accuracy": 0.6125568807125091,
"num_tokens": 24063677.0,
"step": 4220
},
{
"entropy": 1.5003764629364014,
"epoch": 2.1406882591093117,
"grad_norm": 0.32139450311660767,
"learning_rate": 5.7321187584345484e-06,
"loss": 1.4876,
"mean_token_accuracy": 0.6435904741287232,
"num_tokens": 24120380.0,
"step": 4230
},
{
"entropy": 1.6574489951133728,
"epoch": 2.145748987854251,
"grad_norm": 0.30065852403640747,
"learning_rate": 5.6983805668016205e-06,
"loss": 1.6782,
"mean_token_accuracy": 0.6093615233898163,
"num_tokens": 24181603.0,
"step": 4240
},
{
"entropy": 1.4604612827301025,
"epoch": 2.1508097165991904,
"grad_norm": 0.28791046142578125,
"learning_rate": 5.664642375168692e-06,
"loss": 1.4376,
"mean_token_accuracy": 0.6457455456256866,
"num_tokens": 24239096.0,
"step": 4250
},
{
"entropy": 1.4780054807662963,
"epoch": 2.1558704453441297,
"grad_norm": 0.2827425003051758,
"learning_rate": 5.630904183535763e-06,
"loss": 1.4805,
"mean_token_accuracy": 0.6447736561298371,
"num_tokens": 24295397.0,
"step": 4260
},
{
"entropy": 1.4344088315963746,
"epoch": 2.1609311740890687,
"grad_norm": 0.3887704908847809,
"learning_rate": 5.597165991902834e-06,
"loss": 1.4266,
"mean_token_accuracy": 0.6494575679302216,
"num_tokens": 24345669.0,
"step": 4270
},
{
"entropy": 1.5128828644752503,
"epoch": 2.165991902834008,
"grad_norm": 0.34420716762542725,
"learning_rate": 5.563427800269906e-06,
"loss": 1.5186,
"mean_token_accuracy": 0.6373259782791137,
"num_tokens": 24403704.0,
"step": 4280
},
{
"entropy": 1.3984260201454162,
"epoch": 2.1710526315789473,
"grad_norm": 0.33548930287361145,
"learning_rate": 5.5296896086369774e-06,
"loss": 1.381,
"mean_token_accuracy": 0.6609737515449524,
"num_tokens": 24457935.0,
"step": 4290
},
{
"entropy": 1.4911738991737367,
"epoch": 2.1761133603238867,
"grad_norm": 0.2852116823196411,
"learning_rate": 5.4959514170040495e-06,
"loss": 1.4799,
"mean_token_accuracy": 0.6415831744670868,
"num_tokens": 24511977.0,
"step": 4300
},
{
"entropy": 1.4702451825141907,
"epoch": 2.181174089068826,
"grad_norm": 0.28457802534103394,
"learning_rate": 5.46221322537112e-06,
"loss": 1.4768,
"mean_token_accuracy": 0.6372047007083893,
"num_tokens": 24569954.0,
"step": 4310
},
{
"entropy": 1.4613691449165345,
"epoch": 2.1862348178137654,
"grad_norm": 0.31222304701805115,
"learning_rate": 5.428475033738192e-06,
"loss": 1.4692,
"mean_token_accuracy": 0.6442633271217346,
"num_tokens": 24625268.0,
"step": 4320
},
{
"entropy": 1.466537070274353,
"epoch": 2.1912955465587043,
"grad_norm": 0.2962714433670044,
"learning_rate": 5.394736842105264e-06,
"loss": 1.4664,
"mean_token_accuracy": 0.6492825329303742,
"num_tokens": 24688289.0,
"step": 4330
},
{
"entropy": 1.5810052037239075,
"epoch": 2.1963562753036436,
"grad_norm": 0.30552032589912415,
"learning_rate": 5.360998650472335e-06,
"loss": 1.5811,
"mean_token_accuracy": 0.6259881913661957,
"num_tokens": 24746697.0,
"step": 4340
},
{
"entropy": 1.4260846734046937,
"epoch": 2.201417004048583,
"grad_norm": 0.2985803484916687,
"learning_rate": 5.327260458839406e-06,
"loss": 1.4137,
"mean_token_accuracy": 0.6532795548439025,
"num_tokens": 24810772.0,
"step": 4350
},
{
"entropy": 1.5106618881225586,
"epoch": 2.2064777327935223,
"grad_norm": 0.33830076456069946,
"learning_rate": 5.293522267206478e-06,
"loss": 1.522,
"mean_token_accuracy": 0.6390328884124756,
"num_tokens": 24870122.0,
"step": 4360
},
{
"entropy": 1.527205801010132,
"epoch": 2.2115384615384617,
"grad_norm": 0.444986492395401,
"learning_rate": 5.25978407557355e-06,
"loss": 1.5237,
"mean_token_accuracy": 0.6333723068237305,
"num_tokens": 24929676.0,
"step": 4370
},
{
"entropy": 1.571653914451599,
"epoch": 2.216599190283401,
"grad_norm": 0.27972137928009033,
"learning_rate": 5.226045883940622e-06,
"loss": 1.5782,
"mean_token_accuracy": 0.62519211769104,
"num_tokens": 24984648.0,
"step": 4380
},
{
"entropy": 1.579957866668701,
"epoch": 2.22165991902834,
"grad_norm": 0.35601162910461426,
"learning_rate": 5.192307692307693e-06,
"loss": 1.5916,
"mean_token_accuracy": 0.6265009582042694,
"num_tokens": 25039282.0,
"step": 4390
},
{
"entropy": 1.590737247467041,
"epoch": 2.2267206477732793,
"grad_norm": 0.3328033685684204,
"learning_rate": 5.158569500674764e-06,
"loss": 1.5942,
"mean_token_accuracy": 0.6266931772232056,
"num_tokens": 25084698.0,
"step": 4400
},
{
"entropy": 1.4461635231971741,
"epoch": 2.2317813765182186,
"grad_norm": 0.3073853850364685,
"learning_rate": 5.124831309041835e-06,
"loss": 1.4532,
"mean_token_accuracy": 0.6430659115314483,
"num_tokens": 25145917.0,
"step": 4410
},
{
"entropy": 1.6023080706596375,
"epoch": 2.236842105263158,
"grad_norm": 0.38999930024147034,
"learning_rate": 5.0910931174089075e-06,
"loss": 1.6065,
"mean_token_accuracy": 0.6303758680820465,
"num_tokens": 25200499.0,
"step": 4420
},
{
"entropy": 1.403742289543152,
"epoch": 2.2419028340080973,
"grad_norm": 0.3020265996456146,
"learning_rate": 5.057354925775979e-06,
"loss": 1.3936,
"mean_token_accuracy": 0.6550646901130677,
"num_tokens": 25253626.0,
"step": 4430
},
{
"entropy": 1.5970208644866943,
"epoch": 2.246963562753036,
"grad_norm": 0.34803110361099243,
"learning_rate": 5.023616734143051e-06,
"loss": 1.6128,
"mean_token_accuracy": 0.6253244817256928,
"num_tokens": 25315718.0,
"step": 4440
},
{
"entropy": 1.4895619392395019,
"epoch": 2.2520242914979756,
"grad_norm": 0.295636385679245,
"learning_rate": 4.989878542510122e-06,
"loss": 1.4976,
"mean_token_accuracy": 0.6415492594242096,
"num_tokens": 25378490.0,
"step": 4450
},
{
"entropy": 1.500291097164154,
"epoch": 2.257085020242915,
"grad_norm": 0.29003915190696716,
"learning_rate": 4.956140350877193e-06,
"loss": 1.4741,
"mean_token_accuracy": 0.6455156445503235,
"num_tokens": 25435125.0,
"step": 4460
},
{
"entropy": 1.5137645125389099,
"epoch": 2.2621457489878543,
"grad_norm": 0.345222145318985,
"learning_rate": 4.922402159244265e-06,
"loss": 1.5106,
"mean_token_accuracy": 0.6373549580574036,
"num_tokens": 25492838.0,
"step": 4470
},
{
"entropy": 1.4126244068145752,
"epoch": 2.2672064777327936,
"grad_norm": 0.43444496393203735,
"learning_rate": 4.8886639676113364e-06,
"loss": 1.402,
"mean_token_accuracy": 0.6513433575630188,
"num_tokens": 25552113.0,
"step": 4480
},
{
"entropy": 1.5574785828590394,
"epoch": 2.272267206477733,
"grad_norm": 0.28663352131843567,
"learning_rate": 4.854925775978408e-06,
"loss": 1.5719,
"mean_token_accuracy": 0.6330413460731507,
"num_tokens": 25604938.0,
"step": 4490
},
{
"entropy": 1.5517175793647766,
"epoch": 2.2773279352226723,
"grad_norm": 0.3585723042488098,
"learning_rate": 4.82118758434548e-06,
"loss": 1.5492,
"mean_token_accuracy": 0.6311025798320771,
"num_tokens": 25663827.0,
"step": 4500
},
{
"entropy": 1.7192303657531738,
"epoch": 2.282388663967611,
"grad_norm": 0.3171631395816803,
"learning_rate": 4.787449392712551e-06,
"loss": 1.7084,
"mean_token_accuracy": 0.5979065060615539,
"num_tokens": 25718627.0,
"step": 4510
},
{
"entropy": 1.4433665156364441,
"epoch": 2.2874493927125505,
"grad_norm": 0.31859585642814636,
"learning_rate": 4.753711201079623e-06,
"loss": 1.431,
"mean_token_accuracy": 0.6453494548797607,
"num_tokens": 25779859.0,
"step": 4520
},
{
"entropy": 1.493071937561035,
"epoch": 2.29251012145749,
"grad_norm": 0.3323538303375244,
"learning_rate": 4.719973009446694e-06,
"loss": 1.5016,
"mean_token_accuracy": 0.6344216048717499,
"num_tokens": 25835705.0,
"step": 4530
},
{
"entropy": 1.5348315596580506,
"epoch": 2.2975708502024292,
"grad_norm": 0.29418283700942993,
"learning_rate": 4.686234817813765e-06,
"loss": 1.5299,
"mean_token_accuracy": 0.6337445557117463,
"num_tokens": 25896484.0,
"step": 4540
},
{
"entropy": 1.4027626633644104,
"epoch": 2.3026315789473686,
"grad_norm": 0.3454079031944275,
"learning_rate": 4.652496626180837e-06,
"loss": 1.3954,
"mean_token_accuracy": 0.6570545434951782,
"num_tokens": 25946989.0,
"step": 4550
},
{
"entropy": 1.4810479283332825,
"epoch": 2.3076923076923075,
"grad_norm": 0.30555200576782227,
"learning_rate": 4.618758434547909e-06,
"loss": 1.4935,
"mean_token_accuracy": 0.6418456912040711,
"num_tokens": 26005212.0,
"step": 4560
},
{
"entropy": 1.5501378655433655,
"epoch": 2.312753036437247,
"grad_norm": 0.2936731278896332,
"learning_rate": 4.585020242914981e-06,
"loss": 1.5493,
"mean_token_accuracy": 0.6311659216880798,
"num_tokens": 26061206.0,
"step": 4570
},
{
"entropy": 1.5965832471847534,
"epoch": 2.317813765182186,
"grad_norm": 0.3174577057361603,
"learning_rate": 4.551282051282052e-06,
"loss": 1.5986,
"mean_token_accuracy": 0.6272948026657105,
"num_tokens": 26117314.0,
"step": 4580
},
{
"entropy": 1.497817873954773,
"epoch": 2.3228744939271255,
"grad_norm": 0.3074813485145569,
"learning_rate": 4.517543859649123e-06,
"loss": 1.5177,
"mean_token_accuracy": 0.639699399471283,
"num_tokens": 26177625.0,
"step": 4590
},
{
"entropy": 1.398792815208435,
"epoch": 2.327935222672065,
"grad_norm": 0.3233450949192047,
"learning_rate": 4.483805668016194e-06,
"loss": 1.3972,
"mean_token_accuracy": 0.6578422546386719,
"num_tokens": 26229108.0,
"step": 4600
},
{
"entropy": 1.3582614064216614,
"epoch": 2.332995951417004,
"grad_norm": 0.3194423019886017,
"learning_rate": 4.4500674763832665e-06,
"loss": 1.3473,
"mean_token_accuracy": 0.6627348363399506,
"num_tokens": 26281682.0,
"step": 4610
},
{
"entropy": 1.4663148880004884,
"epoch": 2.3380566801619436,
"grad_norm": 0.317622572183609,
"learning_rate": 4.416329284750338e-06,
"loss": 1.4749,
"mean_token_accuracy": 0.6402939558029175,
"num_tokens": 26343090.0,
"step": 4620
},
{
"entropy": 1.4386041164398193,
"epoch": 2.3431174089068825,
"grad_norm": 0.37403181195259094,
"learning_rate": 4.382591093117409e-06,
"loss": 1.4399,
"mean_token_accuracy": 0.6470987558364868,
"num_tokens": 26398372.0,
"step": 4630
},
{
"entropy": 1.591576099395752,
"epoch": 2.348178137651822,
"grad_norm": 0.27833235263824463,
"learning_rate": 4.348852901484481e-06,
"loss": 1.6015,
"mean_token_accuracy": 0.6296046376228333,
"num_tokens": 26458865.0,
"step": 4640
},
{
"entropy": 1.4324705123901367,
"epoch": 2.353238866396761,
"grad_norm": 0.3234311044216156,
"learning_rate": 4.315114709851552e-06,
"loss": 1.4182,
"mean_token_accuracy": 0.6525469720363617,
"num_tokens": 26514094.0,
"step": 4650
},
{
"entropy": 1.5859549045562744,
"epoch": 2.3582995951417005,
"grad_norm": 0.31048783659935,
"learning_rate": 4.281376518218624e-06,
"loss": 1.6055,
"mean_token_accuracy": 0.6206431567668915,
"num_tokens": 26573568.0,
"step": 4660
},
{
"entropy": 1.4157851219177247,
"epoch": 2.36336032388664,
"grad_norm": 0.27004745602607727,
"learning_rate": 4.2476383265856954e-06,
"loss": 1.4191,
"mean_token_accuracy": 0.6526973366737365,
"num_tokens": 26628281.0,
"step": 4670
},
{
"entropy": 1.4219112515449523,
"epoch": 2.3684210526315788,
"grad_norm": 0.3162846863269806,
"learning_rate": 4.213900134952767e-06,
"loss": 1.4237,
"mean_token_accuracy": 0.6481447339057922,
"num_tokens": 26683329.0,
"step": 4680
},
{
"entropy": 1.474673593044281,
"epoch": 2.373481781376518,
"grad_norm": 0.2558523714542389,
"learning_rate": 4.180161943319838e-06,
"loss": 1.4789,
"mean_token_accuracy": 0.644309651851654,
"num_tokens": 26741726.0,
"step": 4690
},
{
"entropy": 1.545168387889862,
"epoch": 2.3785425101214575,
"grad_norm": 0.3100733160972595,
"learning_rate": 4.14642375168691e-06,
"loss": 1.5585,
"mean_token_accuracy": 0.6251280426979064,
"num_tokens": 26801987.0,
"step": 4700
},
{
"entropy": 1.4952475309371949,
"epoch": 2.383603238866397,
"grad_norm": 0.2840896546840668,
"learning_rate": 4.112685560053982e-06,
"loss": 1.4928,
"mean_token_accuracy": 0.6407946467399597,
"num_tokens": 26862449.0,
"step": 4710
},
{
"entropy": 1.3853577494621276,
"epoch": 2.388663967611336,
"grad_norm": 0.315100759267807,
"learning_rate": 4.078947368421053e-06,
"loss": 1.3891,
"mean_token_accuracy": 0.6517343044281005,
"num_tokens": 26923528.0,
"step": 4720
},
{
"entropy": 1.5327417492866515,
"epoch": 2.3937246963562755,
"grad_norm": 0.3072359561920166,
"learning_rate": 4.0452091767881244e-06,
"loss": 1.5438,
"mean_token_accuracy": 0.638210940361023,
"num_tokens": 26976129.0,
"step": 4730
},
{
"entropy": 1.6007991313934327,
"epoch": 2.3987854251012144,
"grad_norm": 0.28095099329948425,
"learning_rate": 4.011470985155196e-06,
"loss": 1.6025,
"mean_token_accuracy": 0.6204523742198944,
"num_tokens": 27030769.0,
"step": 4740
},
{
"entropy": 1.5538129091262818,
"epoch": 2.4038461538461537,
"grad_norm": 0.3622888922691345,
"learning_rate": 3.977732793522268e-06,
"loss": 1.5497,
"mean_token_accuracy": 0.6246297895908356,
"num_tokens": 27085119.0,
"step": 4750
},
{
"entropy": 1.4716430306434631,
"epoch": 2.408906882591093,
"grad_norm": 0.2776808738708496,
"learning_rate": 3.943994601889339e-06,
"loss": 1.4715,
"mean_token_accuracy": 0.6430730044841766,
"num_tokens": 27146308.0,
"step": 4760
},
{
"entropy": 1.4779613852500915,
"epoch": 2.4139676113360324,
"grad_norm": 0.30735519528388977,
"learning_rate": 3.910256410256411e-06,
"loss": 1.481,
"mean_token_accuracy": 0.6421349704265594,
"num_tokens": 27204236.0,
"step": 4770
},
{
"entropy": 1.6263086080551148,
"epoch": 2.419028340080972,
"grad_norm": 0.3509717881679535,
"learning_rate": 3.876518218623482e-06,
"loss": 1.6306,
"mean_token_accuracy": 0.6189518332481384,
"num_tokens": 27253795.0,
"step": 4780
},
{
"entropy": 1.5051485419273376,
"epoch": 2.4240890688259107,
"grad_norm": 0.36502060294151306,
"learning_rate": 3.842780026990553e-06,
"loss": 1.5045,
"mean_token_accuracy": 0.6390359103679657,
"num_tokens": 27311173.0,
"step": 4790
},
{
"entropy": 1.5122657060623168,
"epoch": 2.42914979757085,
"grad_norm": 0.35788798332214355,
"learning_rate": 3.8090418353576255e-06,
"loss": 1.4811,
"mean_token_accuracy": 0.6367557644844055,
"num_tokens": 27366839.0,
"step": 4800
},
{
"entropy": 1.5352485537528993,
"epoch": 2.4342105263157894,
"grad_norm": 0.2877010107040405,
"learning_rate": 3.7753036437246967e-06,
"loss": 1.5402,
"mean_token_accuracy": 0.6323030471801758,
"num_tokens": 27423988.0,
"step": 4810
},
{
"entropy": 1.329223895072937,
"epoch": 2.4392712550607287,
"grad_norm": 0.27826353907585144,
"learning_rate": 3.7415654520917683e-06,
"loss": 1.3322,
"mean_token_accuracy": 0.6661195576190948,
"num_tokens": 27482284.0,
"step": 4820
},
{
"entropy": 1.460306990146637,
"epoch": 2.444331983805668,
"grad_norm": 0.2664757966995239,
"learning_rate": 3.7078272604588395e-06,
"loss": 1.4645,
"mean_token_accuracy": 0.6439946055412292,
"num_tokens": 27542235.0,
"step": 4830
},
{
"entropy": 1.420573878288269,
"epoch": 2.4493927125506074,
"grad_norm": 0.3187576234340668,
"learning_rate": 3.674089068825911e-06,
"loss": 1.4271,
"mean_token_accuracy": 0.6494402289390564,
"num_tokens": 27606521.0,
"step": 4840
},
{
"entropy": 1.5605995893478393,
"epoch": 2.4544534412955468,
"grad_norm": 0.3589235842227936,
"learning_rate": 3.640350877192983e-06,
"loss": 1.5464,
"mean_token_accuracy": 0.636710187792778,
"num_tokens": 27655589.0,
"step": 4850
},
{
"entropy": 1.5500049710273742,
"epoch": 2.4595141700404857,
"grad_norm": 0.42818954586982727,
"learning_rate": 3.606612685560054e-06,
"loss": 1.5422,
"mean_token_accuracy": 0.6335929155349731,
"num_tokens": 27707321.0,
"step": 4860
},
{
"entropy": 1.5264463543891906,
"epoch": 2.464574898785425,
"grad_norm": 0.30446869134902954,
"learning_rate": 3.572874493927126e-06,
"loss": 1.5354,
"mean_token_accuracy": 0.6377040445804596,
"num_tokens": 27766680.0,
"step": 4870
},
{
"entropy": 1.5602357268333436,
"epoch": 2.4696356275303644,
"grad_norm": 0.31952470541000366,
"learning_rate": 3.5391363022941973e-06,
"loss": 1.563,
"mean_token_accuracy": 0.6299617826938629,
"num_tokens": 27825128.0,
"step": 4880
},
{
"entropy": 1.4979919075965882,
"epoch": 2.4746963562753037,
"grad_norm": 0.3032040596008301,
"learning_rate": 3.505398110661269e-06,
"loss": 1.5194,
"mean_token_accuracy": 0.6328540325164795,
"num_tokens": 27886765.0,
"step": 4890
},
{
"entropy": 1.571874487400055,
"epoch": 2.479757085020243,
"grad_norm": 0.3398491144180298,
"learning_rate": 3.47165991902834e-06,
"loss": 1.568,
"mean_token_accuracy": 0.6220065712928772,
"num_tokens": 27942690.0,
"step": 4900
},
{
"entropy": 1.4888028264045716,
"epoch": 2.484817813765182,
"grad_norm": 0.2785778343677521,
"learning_rate": 3.437921727395412e-06,
"loss": 1.482,
"mean_token_accuracy": 0.6426316261291504,
"num_tokens": 28001045.0,
"step": 4910
},
{
"entropy": 1.4304234504699707,
"epoch": 2.4898785425101213,
"grad_norm": 0.36416903138160706,
"learning_rate": 3.4041835357624834e-06,
"loss": 1.4412,
"mean_token_accuracy": 0.6480507373809814,
"num_tokens": 28060050.0,
"step": 4920
},
{
"entropy": 1.4549246668815612,
"epoch": 2.4949392712550607,
"grad_norm": 0.3209365904331207,
"learning_rate": 3.3704453441295546e-06,
"loss": 1.4444,
"mean_token_accuracy": 0.6485071182250977,
"num_tokens": 28119937.0,
"step": 4930
},
{
"entropy": 1.6035995841026307,
"epoch": 2.5,
"grad_norm": 0.3263776898384094,
"learning_rate": 3.3367071524966267e-06,
"loss": 1.596,
"mean_token_accuracy": 0.6212283372879028,
"num_tokens": 28176098.0,
"step": 4940
},
{
"entropy": 1.3420706629753112,
"epoch": 2.5050607287449393,
"grad_norm": 0.29616400599479675,
"learning_rate": 3.302968960863698e-06,
"loss": 1.3361,
"mean_token_accuracy": 0.6657415688037872,
"num_tokens": 28232898.0,
"step": 4950
},
{
"entropy": 1.5731253027915955,
"epoch": 2.5101214574898787,
"grad_norm": 0.2652728259563446,
"learning_rate": 3.2692307692307696e-06,
"loss": 1.569,
"mean_token_accuracy": 0.6270411610603333,
"num_tokens": 28289187.0,
"step": 4960
},
{
"entropy": 1.4383020401000977,
"epoch": 2.515182186234818,
"grad_norm": 0.3313502371311188,
"learning_rate": 3.2354925775978408e-06,
"loss": 1.4301,
"mean_token_accuracy": 0.6567471146583557,
"num_tokens": 28345870.0,
"step": 4970
},
{
"entropy": 1.4449619054794312,
"epoch": 2.520242914979757,
"grad_norm": 0.299467533826828,
"learning_rate": 3.2017543859649124e-06,
"loss": 1.4596,
"mean_token_accuracy": 0.6480660021305085,
"num_tokens": 28401335.0,
"step": 4980
},
{
"entropy": 1.407576084136963,
"epoch": 2.5253036437246963,
"grad_norm": 0.33703747391700745,
"learning_rate": 3.168016194331984e-06,
"loss": 1.4026,
"mean_token_accuracy": 0.6588316440582276,
"num_tokens": 28451027.0,
"step": 4990
},
{
"entropy": 1.6358988881111145,
"epoch": 2.5303643724696356,
"grad_norm": 0.3531615138053894,
"learning_rate": 3.1342780026990553e-06,
"loss": 1.6387,
"mean_token_accuracy": 0.6192252457141876,
"num_tokens": 28508717.0,
"step": 5000
},
{
"entropy": 1.530623769760132,
"epoch": 2.535425101214575,
"grad_norm": 0.2998420000076294,
"learning_rate": 3.1005398110661273e-06,
"loss": 1.5209,
"mean_token_accuracy": 0.6354014992713928,
"num_tokens": 28566256.0,
"step": 5010
},
{
"entropy": 1.5933383703231812,
"epoch": 2.5404858299595143,
"grad_norm": 0.3689696192741394,
"learning_rate": 3.0668016194331985e-06,
"loss": 1.5881,
"mean_token_accuracy": 0.6318571925163269,
"num_tokens": 28618249.0,
"step": 5020
},
{
"entropy": 1.4564833164215087,
"epoch": 2.5455465587044532,
"grad_norm": 0.30524808168411255,
"learning_rate": 3.03306342780027e-06,
"loss": 1.4375,
"mean_token_accuracy": 0.6440569698810578,
"num_tokens": 28674342.0,
"step": 5030
},
{
"entropy": 1.510752511024475,
"epoch": 2.5506072874493926,
"grad_norm": 0.3323598802089691,
"learning_rate": 2.999325236167342e-06,
"loss": 1.5278,
"mean_token_accuracy": 0.6354637145996094,
"num_tokens": 28731622.0,
"step": 5040
},
{
"entropy": 1.4739052295684814,
"epoch": 2.555668016194332,
"grad_norm": 0.31869643926620483,
"learning_rate": 2.965587044534413e-06,
"loss": 1.4649,
"mean_token_accuracy": 0.6425871312618255,
"num_tokens": 28791133.0,
"step": 5050
},
{
"entropy": 1.5100542187690735,
"epoch": 2.5607287449392713,
"grad_norm": 0.3328213095664978,
"learning_rate": 2.931848852901485e-06,
"loss": 1.5045,
"mean_token_accuracy": 0.6392671585083007,
"num_tokens": 28847713.0,
"step": 5060
},
{
"entropy": 1.4085248589515686,
"epoch": 2.5657894736842106,
"grad_norm": 0.281522661447525,
"learning_rate": 2.8981106612685563e-06,
"loss": 1.3982,
"mean_token_accuracy": 0.6513190269470215,
"num_tokens": 28910189.0,
"step": 5070
},
{
"entropy": 1.397442674636841,
"epoch": 2.57085020242915,
"grad_norm": 0.3210408091545105,
"learning_rate": 2.864372469635628e-06,
"loss": 1.3977,
"mean_token_accuracy": 0.6574838936328888,
"num_tokens": 28966241.0,
"step": 5080
},
{
"entropy": 1.5165488362312316,
"epoch": 2.5759109311740893,
"grad_norm": 0.31288620829582214,
"learning_rate": 2.830634278002699e-06,
"loss": 1.5124,
"mean_token_accuracy": 0.6387628674507141,
"num_tokens": 29026210.0,
"step": 5090
},
{
"entropy": 1.5974119186401368,
"epoch": 2.580971659919028,
"grad_norm": 0.3497001826763153,
"learning_rate": 2.796896086369771e-06,
"loss": 1.61,
"mean_token_accuracy": 0.6236252367496491,
"num_tokens": 29083556.0,
"step": 5100
},
{
"entropy": 1.5403811931610107,
"epoch": 2.5860323886639676,
"grad_norm": 0.31958791613578796,
"learning_rate": 2.7631578947368424e-06,
"loss": 1.5418,
"mean_token_accuracy": 0.634338253736496,
"num_tokens": 29142090.0,
"step": 5110
},
{
"entropy": 1.4701064825057983,
"epoch": 2.591093117408907,
"grad_norm": 0.28594285249710083,
"learning_rate": 2.7294197031039137e-06,
"loss": 1.4693,
"mean_token_accuracy": 0.6509437322616577,
"num_tokens": 29198039.0,
"step": 5120
},
{
"entropy": 1.508654534816742,
"epoch": 2.5961538461538463,
"grad_norm": 0.28295132517814636,
"learning_rate": 2.6956815114709857e-06,
"loss": 1.5107,
"mean_token_accuracy": 0.6393173456192016,
"num_tokens": 29258240.0,
"step": 5130
},
{
"entropy": 1.573255705833435,
"epoch": 2.601214574898785,
"grad_norm": 0.2459454983472824,
"learning_rate": 2.661943319838057e-06,
"loss": 1.5903,
"mean_token_accuracy": 0.6283860564231872,
"num_tokens": 29318879.0,
"step": 5140
},
{
"entropy": 1.5287572503089906,
"epoch": 2.6062753036437245,
"grad_norm": 0.31771403551101685,
"learning_rate": 2.6282051282051286e-06,
"loss": 1.5452,
"mean_token_accuracy": 0.6344579041004181,
"num_tokens": 29379919.0,
"step": 5150
},
{
"entropy": 1.3615296483039856,
"epoch": 2.611336032388664,
"grad_norm": 0.28625616431236267,
"learning_rate": 2.5944669365721998e-06,
"loss": 1.349,
"mean_token_accuracy": 0.6637236177921295,
"num_tokens": 29438959.0,
"step": 5160
},
{
"entropy": 1.4767539501190186,
"epoch": 2.616396761133603,
"grad_norm": 0.2911388874053955,
"learning_rate": 2.5607287449392714e-06,
"loss": 1.4775,
"mean_token_accuracy": 0.6405583918094635,
"num_tokens": 29495248.0,
"step": 5170
},
{
"entropy": 1.4118461966514588,
"epoch": 2.6214574898785425,
"grad_norm": 0.3035772442817688,
"learning_rate": 2.526990553306343e-06,
"loss": 1.4266,
"mean_token_accuracy": 0.6568454921245575,
"num_tokens": 29549374.0,
"step": 5180
},
{
"entropy": 1.3858314156532288,
"epoch": 2.626518218623482,
"grad_norm": 0.28831735253334045,
"learning_rate": 2.4932523616734143e-06,
"loss": 1.3659,
"mean_token_accuracy": 0.6626292169094086,
"num_tokens": 29608335.0,
"step": 5190
},
{
"entropy": 1.5293712258338927,
"epoch": 2.6315789473684212,
"grad_norm": 0.33819642663002014,
"learning_rate": 2.459514170040486e-06,
"loss": 1.5299,
"mean_token_accuracy": 0.629097181558609,
"num_tokens": 29666401.0,
"step": 5200
},
{
"entropy": 1.5522411942481995,
"epoch": 2.6366396761133606,
"grad_norm": 0.37447431683540344,
"learning_rate": 2.4257759784075576e-06,
"loss": 1.5546,
"mean_token_accuracy": 0.6252642631530761,
"num_tokens": 29722977.0,
"step": 5210
},
{
"entropy": 1.5046650171279907,
"epoch": 2.6417004048582995,
"grad_norm": 0.32877567410469055,
"learning_rate": 2.392037786774629e-06,
"loss": 1.4941,
"mean_token_accuracy": 0.6403312921524048,
"num_tokens": 29777693.0,
"step": 5220
},
{
"entropy": 1.4904412388801576,
"epoch": 2.646761133603239,
"grad_norm": 0.30846232175827026,
"learning_rate": 2.358299595141701e-06,
"loss": 1.4874,
"mean_token_accuracy": 0.6401443660259247,
"num_tokens": 29841451.0,
"step": 5230
},
{
"entropy": 1.4474842250347137,
"epoch": 2.651821862348178,
"grad_norm": 0.3371650278568268,
"learning_rate": 2.324561403508772e-06,
"loss": 1.4514,
"mean_token_accuracy": 0.6537328362464905,
"num_tokens": 29900142.0,
"step": 5240
},
{
"entropy": 1.5490441560745238,
"epoch": 2.6568825910931175,
"grad_norm": 0.28833135962486267,
"learning_rate": 2.2908232118758437e-06,
"loss": 1.5525,
"mean_token_accuracy": 0.6344904005527496,
"num_tokens": 29965665.0,
"step": 5250
},
{
"entropy": 1.4371688961982727,
"epoch": 2.6619433198380564,
"grad_norm": 0.27346664667129517,
"learning_rate": 2.257085020242915e-06,
"loss": 1.4386,
"mean_token_accuracy": 0.6554741203784943,
"num_tokens": 30020063.0,
"step": 5260
},
{
"entropy": 1.57616069316864,
"epoch": 2.667004048582996,
"grad_norm": 0.31261205673217773,
"learning_rate": 2.2233468286099865e-06,
"loss": 1.5878,
"mean_token_accuracy": 0.6287827432155609,
"num_tokens": 30079648.0,
"step": 5270
},
{
"entropy": 1.6309450030326844,
"epoch": 2.672064777327935,
"grad_norm": 0.36513420939445496,
"learning_rate": 2.189608636977058e-06,
"loss": 1.6362,
"mean_token_accuracy": 0.6139590203762054,
"num_tokens": 30139557.0,
"step": 5280
},
{
"entropy": 1.6020007967948913,
"epoch": 2.6771255060728745,
"grad_norm": 0.3361331522464752,
"learning_rate": 2.15587044534413e-06,
"loss": 1.5899,
"mean_token_accuracy": 0.623996788263321,
"num_tokens": 30194644.0,
"step": 5290
},
{
"entropy": 1.4187337517738343,
"epoch": 2.682186234817814,
"grad_norm": 0.3711530864238739,
"learning_rate": 2.1221322537112015e-06,
"loss": 1.4225,
"mean_token_accuracy": 0.6517966687679291,
"num_tokens": 30249182.0,
"step": 5300
},
{
"entropy": 1.4419126749038695,
"epoch": 2.687246963562753,
"grad_norm": 0.34213292598724365,
"learning_rate": 2.0883940620782727e-06,
"loss": 1.4502,
"mean_token_accuracy": 0.6504493892192841,
"num_tokens": 30307151.0,
"step": 5310
},
{
"entropy": 1.593650794029236,
"epoch": 2.6923076923076925,
"grad_norm": 0.2626771032810211,
"learning_rate": 2.0546558704453443e-06,
"loss": 1.5977,
"mean_token_accuracy": 0.6253896594047547,
"num_tokens": 30363799.0,
"step": 5320
},
{
"entropy": 1.505863094329834,
"epoch": 2.6973684210526314,
"grad_norm": 0.31610244512557983,
"learning_rate": 2.020917678812416e-06,
"loss": 1.507,
"mean_token_accuracy": 0.6344715654850006,
"num_tokens": 30420544.0,
"step": 5330
},
{
"entropy": 1.5584728479385377,
"epoch": 2.7024291497975708,
"grad_norm": 0.3088075518608093,
"learning_rate": 1.987179487179487e-06,
"loss": 1.5504,
"mean_token_accuracy": 0.6318127393722535,
"num_tokens": 30479027.0,
"step": 5340
},
{
"entropy": 1.464753222465515,
"epoch": 2.70748987854251,
"grad_norm": 0.4019823372364044,
"learning_rate": 1.953441295546559e-06,
"loss": 1.4567,
"mean_token_accuracy": 0.6482177615165711,
"num_tokens": 30534454.0,
"step": 5350
},
{
"entropy": 1.5660398364067079,
"epoch": 2.7125506072874495,
"grad_norm": 0.2922350764274597,
"learning_rate": 1.9197031039136304e-06,
"loss": 1.5742,
"mean_token_accuracy": 0.6296724855899811,
"num_tokens": 30591311.0,
"step": 5360
},
{
"entropy": 1.513328456878662,
"epoch": 2.717611336032389,
"grad_norm": 0.34194323420524597,
"learning_rate": 1.8859649122807019e-06,
"loss": 1.5109,
"mean_token_accuracy": 0.6368795096874237,
"num_tokens": 30648991.0,
"step": 5370
},
{
"entropy": 1.517847204208374,
"epoch": 2.7226720647773277,
"grad_norm": 0.35915765166282654,
"learning_rate": 1.8522267206477735e-06,
"loss": 1.5111,
"mean_token_accuracy": 0.6370461285114288,
"num_tokens": 30702765.0,
"step": 5380
},
{
"entropy": 1.4219279885292053,
"epoch": 2.727732793522267,
"grad_norm": 0.31105926632881165,
"learning_rate": 1.818488529014845e-06,
"loss": 1.416,
"mean_token_accuracy": 0.6535437107086182,
"num_tokens": 30759049.0,
"step": 5390
},
{
"entropy": 1.4244774222373962,
"epoch": 2.7327935222672064,
"grad_norm": 0.3058363199234009,
"learning_rate": 1.7847503373819164e-06,
"loss": 1.4116,
"mean_token_accuracy": 0.6499510526657104,
"num_tokens": 30815038.0,
"step": 5400
},
{
"entropy": 1.6432706594467164,
"epoch": 2.7378542510121457,
"grad_norm": 0.33452996611595154,
"learning_rate": 1.7510121457489878e-06,
"loss": 1.6396,
"mean_token_accuracy": 0.6203866958618164,
"num_tokens": 30871076.0,
"step": 5410
},
{
"entropy": 1.6394325613975524,
"epoch": 2.742914979757085,
"grad_norm": 0.283194363117218,
"learning_rate": 1.7172739541160596e-06,
"loss": 1.6447,
"mean_token_accuracy": 0.6192583978176117,
"num_tokens": 30931499.0,
"step": 5420
},
{
"entropy": 1.613875186443329,
"epoch": 2.7479757085020244,
"grad_norm": 0.3175935745239258,
"learning_rate": 1.683535762483131e-06,
"loss": 1.616,
"mean_token_accuracy": 0.6225574970245361,
"num_tokens": 30993640.0,
"step": 5430
},
{
"entropy": 1.6437927842140199,
"epoch": 2.753036437246964,
"grad_norm": 0.2761462926864624,
"learning_rate": 1.6497975708502027e-06,
"loss": 1.6461,
"mean_token_accuracy": 0.6168906092643738,
"num_tokens": 31046563.0,
"step": 5440
},
{
"entropy": 1.3887561798095702,
"epoch": 2.7580971659919027,
"grad_norm": 0.3212042450904846,
"learning_rate": 1.6160593792172741e-06,
"loss": 1.3872,
"mean_token_accuracy": 0.66387038230896,
"num_tokens": 31100699.0,
"step": 5450
},
{
"entropy": 1.5592396020889283,
"epoch": 2.763157894736842,
"grad_norm": 0.28648391366004944,
"learning_rate": 1.5823211875843455e-06,
"loss": 1.5583,
"mean_token_accuracy": 0.6273209810256958,
"num_tokens": 31164910.0,
"step": 5460
},
{
"entropy": 1.546663898229599,
"epoch": 2.7682186234817814,
"grad_norm": 0.3598899841308594,
"learning_rate": 1.548582995951417e-06,
"loss": 1.5324,
"mean_token_accuracy": 0.6319786071777344,
"num_tokens": 31220029.0,
"step": 5470
},
{
"entropy": 1.5003357887268067,
"epoch": 2.7732793522267207,
"grad_norm": 0.2860889732837677,
"learning_rate": 1.5148448043184886e-06,
"loss": 1.4952,
"mean_token_accuracy": 0.6411886811256409,
"num_tokens": 31279401.0,
"step": 5480
},
{
"entropy": 1.418429398536682,
"epoch": 2.77834008097166,
"grad_norm": 0.2821556627750397,
"learning_rate": 1.4811066126855602e-06,
"loss": 1.421,
"mean_token_accuracy": 0.6593441128730774,
"num_tokens": 31334950.0,
"step": 5490
},
{
"entropy": 1.5357149362564086,
"epoch": 2.783400809716599,
"grad_norm": 0.3190230131149292,
"learning_rate": 1.4473684210526317e-06,
"loss": 1.5381,
"mean_token_accuracy": 0.6347779989242553,
"num_tokens": 31392814.0,
"step": 5500
},
{
"entropy": 1.4718513011932373,
"epoch": 2.7884615384615383,
"grad_norm": 0.2940792441368103,
"learning_rate": 1.4136302294197033e-06,
"loss": 1.4801,
"mean_token_accuracy": 0.6389244079589844,
"num_tokens": 31449900.0,
"step": 5510
},
{
"entropy": 1.3709456086158753,
"epoch": 2.7935222672064777,
"grad_norm": 0.30266401171684265,
"learning_rate": 1.3798920377867747e-06,
"loss": 1.3599,
"mean_token_accuracy": 0.6677371621131897,
"num_tokens": 31503636.0,
"step": 5520
},
{
"entropy": 1.4986552715301513,
"epoch": 2.798582995951417,
"grad_norm": 0.35532623529434204,
"learning_rate": 1.3461538461538462e-06,
"loss": 1.5069,
"mean_token_accuracy": 0.6403627216815948,
"num_tokens": 31563188.0,
"step": 5530
},
{
"entropy": 1.53000727891922,
"epoch": 2.8036437246963564,
"grad_norm": 0.3287500739097595,
"learning_rate": 1.3124156545209176e-06,
"loss": 1.5289,
"mean_token_accuracy": 0.6303663849830627,
"num_tokens": 31622655.0,
"step": 5540
},
{
"entropy": 1.399080502986908,
"epoch": 2.8087044534412957,
"grad_norm": 0.2796313762664795,
"learning_rate": 1.2786774628879894e-06,
"loss": 1.3962,
"mean_token_accuracy": 0.6611886739730835,
"num_tokens": 31678734.0,
"step": 5550
},
{
"entropy": 1.4993074774742126,
"epoch": 2.813765182186235,
"grad_norm": 0.2762647867202759,
"learning_rate": 1.2449392712550609e-06,
"loss": 1.5019,
"mean_token_accuracy": 0.6430062472820282,
"num_tokens": 31738238.0,
"step": 5560
},
{
"entropy": 1.5500144839286805,
"epoch": 2.818825910931174,
"grad_norm": 0.4136376678943634,
"learning_rate": 1.2112010796221325e-06,
"loss": 1.5483,
"mean_token_accuracy": 0.6305223643779755,
"num_tokens": 31794084.0,
"step": 5570
},
{
"entropy": 1.378357458114624,
"epoch": 2.8238866396761133,
"grad_norm": 0.2796184718608856,
"learning_rate": 1.177462887989204e-06,
"loss": 1.3879,
"mean_token_accuracy": 0.6623157143592835,
"num_tokens": 31850160.0,
"step": 5580
},
{
"entropy": 1.5122020602226258,
"epoch": 2.8289473684210527,
"grad_norm": 0.3030454218387604,
"learning_rate": 1.1437246963562754e-06,
"loss": 1.5336,
"mean_token_accuracy": 0.6398816347122193,
"num_tokens": 31908958.0,
"step": 5590
},
{
"entropy": 1.4396753072738648,
"epoch": 2.834008097165992,
"grad_norm": 0.329406201839447,
"learning_rate": 1.109986504723347e-06,
"loss": 1.4524,
"mean_token_accuracy": 0.6439715623855591,
"num_tokens": 31963221.0,
"step": 5600
},
{
"entropy": 1.4529581308364867,
"epoch": 2.839068825910931,
"grad_norm": 0.30805808305740356,
"learning_rate": 1.0762483130904184e-06,
"loss": 1.4535,
"mean_token_accuracy": 0.6503434360027314,
"num_tokens": 32024028.0,
"step": 5610
},
{
"entropy": 1.4559079647064208,
"epoch": 2.8441295546558703,
"grad_norm": 0.2905729115009308,
"learning_rate": 1.0425101214574899e-06,
"loss": 1.4595,
"mean_token_accuracy": 0.6404688119888305,
"num_tokens": 32079570.0,
"step": 5620
},
{
"entropy": 1.566865622997284,
"epoch": 2.8491902834008096,
"grad_norm": 0.3712847828865051,
"learning_rate": 1.0087719298245615e-06,
"loss": 1.5897,
"mean_token_accuracy": 0.6281434834003449,
"num_tokens": 32140749.0,
"step": 5630
},
{
"entropy": 1.3378282070159913,
"epoch": 2.854251012145749,
"grad_norm": 0.34094497561454773,
"learning_rate": 9.750337381916331e-07,
"loss": 1.3271,
"mean_token_accuracy": 0.6688917458057404,
"num_tokens": 32197723.0,
"step": 5640
},
{
"entropy": 1.5701539039611816,
"epoch": 2.8593117408906883,
"grad_norm": 0.3105640113353729,
"learning_rate": 9.412955465587046e-07,
"loss": 1.5691,
"mean_token_accuracy": 0.631563925743103,
"num_tokens": 32253364.0,
"step": 5650
},
{
"entropy": 1.3988579750061034,
"epoch": 2.8643724696356276,
"grad_norm": 0.33697089552879333,
"learning_rate": 9.07557354925776e-07,
"loss": 1.3774,
"mean_token_accuracy": 0.6580399334430694,
"num_tokens": 32307759.0,
"step": 5660
},
{
"entropy": 1.5067651271820068,
"epoch": 2.869433198380567,
"grad_norm": 0.4209248721599579,
"learning_rate": 8.738191632928476e-07,
"loss": 1.5117,
"mean_token_accuracy": 0.6395208477973938,
"num_tokens": 32361901.0,
"step": 5670
},
{
"entropy": 1.4897794604301453,
"epoch": 2.8744939271255063,
"grad_norm": 0.26533105969429016,
"learning_rate": 8.400809716599192e-07,
"loss": 1.492,
"mean_token_accuracy": 0.6359480619430542,
"num_tokens": 32421074.0,
"step": 5680
},
{
"entropy": 1.5117918133735657,
"epoch": 2.8795546558704452,
"grad_norm": 0.2814977169036865,
"learning_rate": 8.063427800269906e-07,
"loss": 1.5099,
"mean_token_accuracy": 0.6378594696521759,
"num_tokens": 32475249.0,
"step": 5690
},
{
"entropy": 1.5758733749389648,
"epoch": 2.8846153846153846,
"grad_norm": 0.3215586543083191,
"learning_rate": 7.726045883940621e-07,
"loss": 1.5742,
"mean_token_accuracy": 0.6247093558311463,
"num_tokens": 32533856.0,
"step": 5700
},
{
"entropy": 1.5476130247116089,
"epoch": 2.889676113360324,
"grad_norm": 0.3249874413013458,
"learning_rate": 7.388663967611337e-07,
"loss": 1.5596,
"mean_token_accuracy": 0.630809611082077,
"num_tokens": 32590748.0,
"step": 5710
},
{
"entropy": 1.6522730588912964,
"epoch": 2.8947368421052633,
"grad_norm": 0.30724644660949707,
"learning_rate": 7.051282051282052e-07,
"loss": 1.6494,
"mean_token_accuracy": 0.6230604112148285,
"num_tokens": 32646249.0,
"step": 5720
},
{
"entropy": 1.544532060623169,
"epoch": 2.899797570850202,
"grad_norm": 0.2921552062034607,
"learning_rate": 6.713900134952767e-07,
"loss": 1.5418,
"mean_token_accuracy": 0.6278964817523957,
"num_tokens": 32705968.0,
"step": 5730
},
{
"entropy": 1.6792848348617553,
"epoch": 2.9048582995951415,
"grad_norm": 0.35362908244132996,
"learning_rate": 6.376518218623482e-07,
"loss": 1.6863,
"mean_token_accuracy": 0.606015944480896,
"num_tokens": 32764283.0,
"step": 5740
},
{
"entropy": 1.3941245913505553,
"epoch": 2.909919028340081,
"grad_norm": 0.3051432967185974,
"learning_rate": 6.039136302294198e-07,
"loss": 1.3916,
"mean_token_accuracy": 0.655018413066864,
"num_tokens": 32821604.0,
"step": 5750
},
{
"entropy": 1.3279268383979796,
"epoch": 2.91497975708502,
"grad_norm": 0.28279706835746765,
"learning_rate": 5.701754385964912e-07,
"loss": 1.3252,
"mean_token_accuracy": 0.6721781909465789,
"num_tokens": 32882742.0,
"step": 5760
},
{
"entropy": 1.40997371673584,
"epoch": 2.9200404858299596,
"grad_norm": 0.27882078289985657,
"learning_rate": 5.364372469635628e-07,
"loss": 1.4014,
"mean_token_accuracy": 0.653525573015213,
"num_tokens": 32940852.0,
"step": 5770
},
{
"entropy": 1.4527880668640136,
"epoch": 2.925101214574899,
"grad_norm": 0.34039291739463806,
"learning_rate": 5.026990553306344e-07,
"loss": 1.4668,
"mean_token_accuracy": 0.6496657609939576,
"num_tokens": 32993007.0,
"step": 5780
},
{
"entropy": 1.4252225756645203,
"epoch": 2.9301619433198383,
"grad_norm": 0.33022540807724,
"learning_rate": 4.6896086369770585e-07,
"loss": 1.4163,
"mean_token_accuracy": 0.6603101253509521,
"num_tokens": 33052311.0,
"step": 5790
},
{
"entropy": 1.3463350296020509,
"epoch": 2.9352226720647776,
"grad_norm": 0.3052782416343689,
"learning_rate": 4.352226720647774e-07,
"loss": 1.3402,
"mean_token_accuracy": 0.6646123170852661,
"num_tokens": 33107681.0,
"step": 5800
},
{
"entropy": 1.4629117488861083,
"epoch": 2.9402834008097165,
"grad_norm": 0.3405231535434723,
"learning_rate": 4.0148448043184886e-07,
"loss": 1.4697,
"mean_token_accuracy": 0.6421536803245544,
"num_tokens": 33160562.0,
"step": 5810
},
{
"entropy": 1.5115989089012145,
"epoch": 2.945344129554656,
"grad_norm": 0.25086501240730286,
"learning_rate": 3.677462887989204e-07,
"loss": 1.5254,
"mean_token_accuracy": 0.636814546585083,
"num_tokens": 33219166.0,
"step": 5820
},
{
"entropy": 1.5050144791603088,
"epoch": 2.950404858299595,
"grad_norm": 0.30874550342559814,
"learning_rate": 3.34008097165992e-07,
"loss": 1.5059,
"mean_token_accuracy": 0.6406992137432098,
"num_tokens": 33275918.0,
"step": 5830
},
{
"entropy": 1.4055772423744202,
"epoch": 2.9554655870445345,
"grad_norm": 0.37710893154144287,
"learning_rate": 3.0026990553306346e-07,
"loss": 1.4007,
"mean_token_accuracy": 0.6559918403625489,
"num_tokens": 33330271.0,
"step": 5840
},
{
"entropy": 1.514758288860321,
"epoch": 2.9605263157894735,
"grad_norm": 0.2986261248588562,
"learning_rate": 2.66531713900135e-07,
"loss": 1.5106,
"mean_token_accuracy": 0.6324501454830169,
"num_tokens": 33390790.0,
"step": 5850
},
{
"entropy": 1.4632804989814758,
"epoch": 2.965587044534413,
"grad_norm": 0.297039657831192,
"learning_rate": 2.327935222672065e-07,
"loss": 1.4562,
"mean_token_accuracy": 0.652844125032425,
"num_tokens": 33451092.0,
"step": 5860
},
{
"entropy": 1.619661772251129,
"epoch": 2.970647773279352,
"grad_norm": 0.32788631319999695,
"learning_rate": 1.9905533063427803e-07,
"loss": 1.6222,
"mean_token_accuracy": 0.6278849899768829,
"num_tokens": 33509293.0,
"step": 5870
},
{
"entropy": 1.6123327970504762,
"epoch": 2.9757085020242915,
"grad_norm": 0.30364230275154114,
"learning_rate": 1.6531713900134953e-07,
"loss": 1.62,
"mean_token_accuracy": 0.6268645524978638,
"num_tokens": 33567748.0,
"step": 5880
},
{
"entropy": 1.465350294113159,
"epoch": 2.980769230769231,
"grad_norm": 0.271182119846344,
"learning_rate": 1.3157894736842107e-07,
"loss": 1.4767,
"mean_token_accuracy": 0.6428022742271423,
"num_tokens": 33626678.0,
"step": 5890
},
{
"entropy": 1.5106886863708495,
"epoch": 2.98582995951417,
"grad_norm": 0.30039140582084656,
"learning_rate": 9.784075573549259e-08,
"loss": 1.501,
"mean_token_accuracy": 0.6413563072681427,
"num_tokens": 33681713.0,
"step": 5900
},
{
"entropy": 1.6268660426139832,
"epoch": 2.9908906882591095,
"grad_norm": 0.30086028575897217,
"learning_rate": 6.41025641025641e-08,
"loss": 1.6387,
"mean_token_accuracy": 0.6218379974365235,
"num_tokens": 33738372.0,
"step": 5910
},
{
"entropy": 1.5718028783798217,
"epoch": 2.9959514170040484,
"grad_norm": 0.3744632601737976,
"learning_rate": 3.036437246963563e-08,
"loss": 1.5594,
"mean_token_accuracy": 0.6297330737113953,
"num_tokens": 33794850.0,
"step": 5920
}
],
"logging_steps": 10,
"max_steps": 5928,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2845340765506765e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}