{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 5928, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.3182146310806275, "epoch": 0.005060728744939271, "grad_norm": 0.36533281207084656, "learning_rate": 1.9969635627530365e-05, "loss": 2.0792, "mean_token_accuracy": 0.5860633730888367, "num_tokens": 59233.0, "step": 10 }, { "entropy": 1.2874402403831482, "epoch": 0.010121457489878543, "grad_norm": 0.47160375118255615, "learning_rate": 1.9935897435897437e-05, "loss": 2.0417, "mean_token_accuracy": 0.5939164876937866, "num_tokens": 114581.0, "step": 20 }, { "entropy": 1.1730836629867554, "epoch": 0.015182186234817813, "grad_norm": 0.3855545222759247, "learning_rate": 1.990215924426451e-05, "loss": 1.8448, "mean_token_accuracy": 0.6225896775722504, "num_tokens": 170799.0, "step": 30 }, { "entropy": 1.328663158416748, "epoch": 0.020242914979757085, "grad_norm": 0.3587476313114166, "learning_rate": 1.986842105263158e-05, "loss": 2.0267, "mean_token_accuracy": 0.5886577606201172, "num_tokens": 224687.0, "step": 40 }, { "entropy": 1.3244597673416139, "epoch": 0.025303643724696356, "grad_norm": 0.4023756980895996, "learning_rate": 1.9834682860998653e-05, "loss": 1.9579, "mean_token_accuracy": 0.5918201506137848, "num_tokens": 282567.0, "step": 50 }, { "entropy": 1.3242129743099214, "epoch": 0.030364372469635626, "grad_norm": 0.4889814257621765, "learning_rate": 1.9800944669365722e-05, "loss": 1.9161, "mean_token_accuracy": 0.6058241128921509, "num_tokens": 336705.0, "step": 60 }, { "entropy": 1.353874671459198, "epoch": 0.0354251012145749, "grad_norm": 0.43763861060142517, "learning_rate": 1.9767206477732795e-05, "loss": 1.8797, "mean_token_accuracy": 0.6002139091491699, "num_tokens": 395328.0, "step": 70 }, { "entropy": 1.2600490927696228, "epoch": 0.04048582995951417, "grad_norm": 0.6143773198127747, "learning_rate": 1.9733468286099865e-05, "loss": 1.7122, "mean_token_accuracy": 0.6283527314662933, "num_tokens": 448766.0, "step": 80 }, { "entropy": 1.318302822113037, "epoch": 0.04554655870445344, "grad_norm": 0.37836670875549316, "learning_rate": 1.9699730094466938e-05, "loss": 1.7054, "mean_token_accuracy": 0.6172383666038513, "num_tokens": 502153.0, "step": 90 }, { "entropy": 1.395500862598419, "epoch": 0.05060728744939271, "grad_norm": 0.42456531524658203, "learning_rate": 1.966599190283401e-05, "loss": 1.7468, "mean_token_accuracy": 0.6102555096149445, "num_tokens": 559789.0, "step": 100 }, { "entropy": 1.2470922768115997, "epoch": 0.05566801619433198, "grad_norm": 0.3632870316505432, "learning_rate": 1.963225371120108e-05, "loss": 1.5353, "mean_token_accuracy": 0.6436724066734314, "num_tokens": 620090.0, "step": 110 }, { "entropy": 1.4007501482963562, "epoch": 0.06072874493927125, "grad_norm": 0.36180490255355835, "learning_rate": 1.9598515519568153e-05, "loss": 1.7042, "mean_token_accuracy": 0.6169813573360443, "num_tokens": 674400.0, "step": 120 }, { "entropy": 1.509167194366455, "epoch": 0.06578947368421052, "grad_norm": 0.33648917078971863, "learning_rate": 1.9564777327935226e-05, "loss": 1.8149, "mean_token_accuracy": 0.6050768792629242, "num_tokens": 732008.0, "step": 130 }, { "entropy": 1.406454861164093, "epoch": 0.0708502024291498, "grad_norm": 0.2894444167613983, "learning_rate": 1.9531039136302295e-05, "loss": 1.6417, "mean_token_accuracy": 0.6224378228187561, "num_tokens": 790088.0, "step": 140 }, { "entropy": 1.5152673125267029, "epoch": 0.07591093117408906, "grad_norm": 0.28545448184013367, "learning_rate": 1.949730094466937e-05, "loss": 1.7609, "mean_token_accuracy": 0.6055684983730316, "num_tokens": 846716.0, "step": 150 }, { "entropy": 1.3967717170715332, "epoch": 0.08097165991902834, "grad_norm": 0.26414811611175537, "learning_rate": 1.9463562753036438e-05, "loss": 1.5725, "mean_token_accuracy": 0.6291777551174164, "num_tokens": 909154.0, "step": 160 }, { "entropy": 1.5347481608390807, "epoch": 0.0860323886639676, "grad_norm": 0.27282679080963135, "learning_rate": 1.942982456140351e-05, "loss": 1.7069, "mean_token_accuracy": 0.6087481796741485, "num_tokens": 967058.0, "step": 170 }, { "entropy": 1.4826232314109802, "epoch": 0.09109311740890688, "grad_norm": 0.23245945572853088, "learning_rate": 1.939608636977058e-05, "loss": 1.636, "mean_token_accuracy": 0.6207191824913025, "num_tokens": 1022407.0, "step": 180 }, { "entropy": 1.5127532601356506, "epoch": 0.09615384615384616, "grad_norm": 0.2711787223815918, "learning_rate": 1.9362348178137653e-05, "loss": 1.6767, "mean_token_accuracy": 0.615806394815445, "num_tokens": 1079738.0, "step": 190 }, { "entropy": 1.6158159017562865, "epoch": 0.10121457489878542, "grad_norm": 0.29755550622940063, "learning_rate": 1.9328609986504726e-05, "loss": 1.7642, "mean_token_accuracy": 0.6007438480854035, "num_tokens": 1140680.0, "step": 200 }, { "entropy": 1.4902734279632568, "epoch": 0.1062753036437247, "grad_norm": 0.24520562589168549, "learning_rate": 1.9294871794871796e-05, "loss": 1.5893, "mean_token_accuracy": 0.6293672084808349, "num_tokens": 1194492.0, "step": 210 }, { "entropy": 1.6347212672233582, "epoch": 0.11133603238866396, "grad_norm": 0.3082791566848755, "learning_rate": 1.926113360323887e-05, "loss": 1.7482, "mean_token_accuracy": 0.602141198515892, "num_tokens": 1252053.0, "step": 220 }, { "entropy": 1.5750308752059936, "epoch": 0.11639676113360324, "grad_norm": 0.23394237458705902, "learning_rate": 1.922739541160594e-05, "loss": 1.6651, "mean_token_accuracy": 0.6140229105949402, "num_tokens": 1308749.0, "step": 230 }, { "entropy": 1.5293641209602356, "epoch": 0.1214574898785425, "grad_norm": 0.22243493795394897, "learning_rate": 1.919365721997301e-05, "loss": 1.5962, "mean_token_accuracy": 0.6277494192123413, "num_tokens": 1371806.0, "step": 240 }, { "entropy": 1.5892576217651366, "epoch": 0.12651821862348178, "grad_norm": 0.23461221158504486, "learning_rate": 1.915991902834008e-05, "loss": 1.6669, "mean_token_accuracy": 0.6198325097560883, "num_tokens": 1427210.0, "step": 250 }, { "entropy": 1.6534079551696776, "epoch": 0.13157894736842105, "grad_norm": 0.2797304391860962, "learning_rate": 1.9126180836707153e-05, "loss": 1.7432, "mean_token_accuracy": 0.6030194580554962, "num_tokens": 1485664.0, "step": 260 }, { "entropy": 1.6070345997810365, "epoch": 0.13663967611336034, "grad_norm": 0.22065305709838867, "learning_rate": 1.9092442645074226e-05, "loss": 1.677, "mean_token_accuracy": 0.6108350694179535, "num_tokens": 1544169.0, "step": 270 }, { "entropy": 1.658397912979126, "epoch": 0.1417004048582996, "grad_norm": 0.17878006398677826, "learning_rate": 1.9058704453441296e-05, "loss": 1.7484, "mean_token_accuracy": 0.6061225473880768, "num_tokens": 1607852.0, "step": 280 }, { "entropy": 1.5805446743965148, "epoch": 0.14676113360323886, "grad_norm": 0.20498958230018616, "learning_rate": 1.902496626180837e-05, "loss": 1.6358, "mean_token_accuracy": 0.6217520833015442, "num_tokens": 1667280.0, "step": 290 }, { "entropy": 1.553944504261017, "epoch": 0.15182186234817813, "grad_norm": 0.2072789967060089, "learning_rate": 1.899122807017544e-05, "loss": 1.6016, "mean_token_accuracy": 0.6277549624443054, "num_tokens": 1722987.0, "step": 300 }, { "entropy": 1.6091766953468323, "epoch": 0.15688259109311742, "grad_norm": 0.25766435265541077, "learning_rate": 1.895748987854251e-05, "loss": 1.6603, "mean_token_accuracy": 0.6145843267440796, "num_tokens": 1777611.0, "step": 310 }, { "entropy": 1.4922061681747436, "epoch": 0.16194331983805668, "grad_norm": 0.23709791898727417, "learning_rate": 1.8923751686909584e-05, "loss": 1.5237, "mean_token_accuracy": 0.638946932554245, "num_tokens": 1833335.0, "step": 320 }, { "entropy": 1.5376826167106628, "epoch": 0.16700404858299595, "grad_norm": 0.24256624281406403, "learning_rate": 1.8890013495276657e-05, "loss": 1.5813, "mean_token_accuracy": 0.625263386964798, "num_tokens": 1883303.0, "step": 330 }, { "entropy": 1.5892139554023743, "epoch": 0.1720647773279352, "grad_norm": 0.20020949840545654, "learning_rate": 1.8856275303643726e-05, "loss": 1.6522, "mean_token_accuracy": 0.6241094172000885, "num_tokens": 1937007.0, "step": 340 }, { "entropy": 1.5604022860527038, "epoch": 0.1771255060728745, "grad_norm": 0.2134305238723755, "learning_rate": 1.8822537112010796e-05, "loss": 1.6036, "mean_token_accuracy": 0.617218679189682, "num_tokens": 1996601.0, "step": 350 }, { "entropy": 1.6344910740852356, "epoch": 0.18218623481781376, "grad_norm": 0.2528083622455597, "learning_rate": 1.878879892037787e-05, "loss": 1.6885, "mean_token_accuracy": 0.6135373294353486, "num_tokens": 2051920.0, "step": 360 }, { "entropy": 1.6087428450584411, "epoch": 0.18724696356275303, "grad_norm": 0.3239048421382904, "learning_rate": 1.8755060728744942e-05, "loss": 1.687, "mean_token_accuracy": 0.6124200880527496, "num_tokens": 2108082.0, "step": 370 }, { "entropy": 1.5165512919425965, "epoch": 0.19230769230769232, "grad_norm": 0.21001844108104706, "learning_rate": 1.872132253711201e-05, "loss": 1.5231, "mean_token_accuracy": 0.6357404530048371, "num_tokens": 2164317.0, "step": 380 }, { "entropy": 1.4784175634384156, "epoch": 0.19736842105263158, "grad_norm": 0.21521133184432983, "learning_rate": 1.8687584345479084e-05, "loss": 1.5055, "mean_token_accuracy": 0.6435703456401825, "num_tokens": 2222720.0, "step": 390 }, { "entropy": 1.6346607565879823, "epoch": 0.20242914979757085, "grad_norm": 0.24888823926448822, "learning_rate": 1.8653846153846157e-05, "loss": 1.6701, "mean_token_accuracy": 0.6212630808353424, "num_tokens": 2284597.0, "step": 400 }, { "entropy": 1.6906925320625306, "epoch": 0.2074898785425101, "grad_norm": 0.21836967766284943, "learning_rate": 1.8620107962213227e-05, "loss": 1.7238, "mean_token_accuracy": 0.606383764743805, "num_tokens": 2340566.0, "step": 410 }, { "entropy": 1.4906925320625306, "epoch": 0.2125506072874494, "grad_norm": 0.21778637170791626, "learning_rate": 1.85863697705803e-05, "loss": 1.5135, "mean_token_accuracy": 0.6399281203746796, "num_tokens": 2397861.0, "step": 420 }, { "entropy": 1.6318996787071227, "epoch": 0.21761133603238866, "grad_norm": 0.2725844979286194, "learning_rate": 1.8552631578947373e-05, "loss": 1.707, "mean_token_accuracy": 0.6183918356895447, "num_tokens": 2453175.0, "step": 430 }, { "entropy": 1.530117428302765, "epoch": 0.22267206477732793, "grad_norm": 0.20461727678775787, "learning_rate": 1.8518893387314442e-05, "loss": 1.5423, "mean_token_accuracy": 0.6370847761631012, "num_tokens": 2511372.0, "step": 440 }, { "entropy": 1.5733375310897828, "epoch": 0.22773279352226722, "grad_norm": 0.2394452542066574, "learning_rate": 1.848515519568151e-05, "loss": 1.5909, "mean_token_accuracy": 0.6229382216930389, "num_tokens": 2571400.0, "step": 450 }, { "entropy": 1.4420257091522217, "epoch": 0.23279352226720648, "grad_norm": 0.23069949448108673, "learning_rate": 1.8451417004048584e-05, "loss": 1.4763, "mean_token_accuracy": 0.6464443206787109, "num_tokens": 2630220.0, "step": 460 }, { "entropy": 1.5972508192062378, "epoch": 0.23785425101214575, "grad_norm": 0.22586746513843536, "learning_rate": 1.8417678812415657e-05, "loss": 1.6251, "mean_token_accuracy": 0.6198283314704895, "num_tokens": 2688923.0, "step": 470 }, { "entropy": 1.4670196294784545, "epoch": 0.242914979757085, "grad_norm": 0.23567302525043488, "learning_rate": 1.8383940620782727e-05, "loss": 1.4654, "mean_token_accuracy": 0.6410723388195038, "num_tokens": 2745340.0, "step": 480 }, { "entropy": 1.5074788808822632, "epoch": 0.2479757085020243, "grad_norm": 0.2870822548866272, "learning_rate": 1.83502024291498e-05, "loss": 1.532, "mean_token_accuracy": 0.6343021392822266, "num_tokens": 2796794.0, "step": 490 }, { "entropy": 1.5003413558006287, "epoch": 0.25303643724696356, "grad_norm": 0.19105228781700134, "learning_rate": 1.8316464237516873e-05, "loss": 1.5132, "mean_token_accuracy": 0.643144553899765, "num_tokens": 2853606.0, "step": 500 }, { "entropy": 1.4724809288978578, "epoch": 0.25809716599190285, "grad_norm": 0.2321540117263794, "learning_rate": 1.8282726045883942e-05, "loss": 1.499, "mean_token_accuracy": 0.6394071221351624, "num_tokens": 2910686.0, "step": 510 }, { "entropy": 1.5466928601264953, "epoch": 0.2631578947368421, "grad_norm": 0.2588091790676117, "learning_rate": 1.8248987854251015e-05, "loss": 1.5745, "mean_token_accuracy": 0.6278865933418274, "num_tokens": 2969316.0, "step": 520 }, { "entropy": 1.4397364735603333, "epoch": 0.2682186234817814, "grad_norm": 0.22344444692134857, "learning_rate": 1.8215249662618085e-05, "loss": 1.4459, "mean_token_accuracy": 0.6503956913948059, "num_tokens": 3021973.0, "step": 530 }, { "entropy": 1.5749887704849244, "epoch": 0.2732793522267207, "grad_norm": 0.20665939152240753, "learning_rate": 1.8181511470985158e-05, "loss": 1.6023, "mean_token_accuracy": 0.622279840707779, "num_tokens": 3081510.0, "step": 540 }, { "entropy": 1.519134557247162, "epoch": 0.2783400809716599, "grad_norm": 0.20693500339984894, "learning_rate": 1.8147773279352227e-05, "loss": 1.5253, "mean_token_accuracy": 0.6335927963256835, "num_tokens": 3139438.0, "step": 550 }, { "entropy": 1.4850042462348938, "epoch": 0.2834008097165992, "grad_norm": 0.20233699679374695, "learning_rate": 1.81140350877193e-05, "loss": 1.5081, "mean_token_accuracy": 0.6392342805862427, "num_tokens": 3194184.0, "step": 560 }, { "entropy": 1.6362142205238341, "epoch": 0.28846153846153844, "grad_norm": 0.19187521934509277, "learning_rate": 1.808029689608637e-05, "loss": 1.6497, "mean_token_accuracy": 0.6131653010845184, "num_tokens": 3253449.0, "step": 570 }, { "entropy": 1.5829873204231262, "epoch": 0.2935222672064777, "grad_norm": 0.21769073605537415, "learning_rate": 1.8046558704453442e-05, "loss": 1.6063, "mean_token_accuracy": 0.6185498893260956, "num_tokens": 3309330.0, "step": 580 }, { "entropy": 1.5365434288978577, "epoch": 0.298582995951417, "grad_norm": 0.20103144645690918, "learning_rate": 1.8012820512820515e-05, "loss": 1.5559, "mean_token_accuracy": 0.6354671478271484, "num_tokens": 3368237.0, "step": 590 }, { "entropy": 1.5507975578308106, "epoch": 0.30364372469635625, "grad_norm": 0.20210447907447815, "learning_rate": 1.7979082321187585e-05, "loss": 1.5848, "mean_token_accuracy": 0.6247013151645661, "num_tokens": 3429760.0, "step": 600 }, { "entropy": 1.668788731098175, "epoch": 0.30870445344129555, "grad_norm": 0.23076701164245605, "learning_rate": 1.7945344129554658e-05, "loss": 1.7186, "mean_token_accuracy": 0.6151426732540131, "num_tokens": 3481424.0, "step": 610 }, { "entropy": 1.4967810273170472, "epoch": 0.31376518218623484, "grad_norm": 0.18658699095249176, "learning_rate": 1.791160593792173e-05, "loss": 1.5039, "mean_token_accuracy": 0.6395800650119782, "num_tokens": 3540974.0, "step": 620 }, { "entropy": 1.5561461091041564, "epoch": 0.3188259109311741, "grad_norm": 0.2277403026819229, "learning_rate": 1.78778677462888e-05, "loss": 1.5933, "mean_token_accuracy": 0.6238772809505463, "num_tokens": 3599247.0, "step": 630 }, { "entropy": 1.6327290773391723, "epoch": 0.32388663967611336, "grad_norm": 0.21525472402572632, "learning_rate": 1.784412955465587e-05, "loss": 1.6583, "mean_token_accuracy": 0.6137534499168396, "num_tokens": 3656797.0, "step": 640 }, { "entropy": 1.6783543229103088, "epoch": 0.32894736842105265, "grad_norm": 0.2178918868303299, "learning_rate": 1.7810391363022943e-05, "loss": 1.7236, "mean_token_accuracy": 0.6091976821422577, "num_tokens": 3712597.0, "step": 650 }, { "entropy": 1.458414077758789, "epoch": 0.3340080971659919, "grad_norm": 0.2724186182022095, "learning_rate": 1.7776653171390016e-05, "loss": 1.4712, "mean_token_accuracy": 0.6499071300029755, "num_tokens": 3769258.0, "step": 660 }, { "entropy": 1.4010087609291078, "epoch": 0.3390688259109312, "grad_norm": 0.24354684352874756, "learning_rate": 1.7742914979757085e-05, "loss": 1.4125, "mean_token_accuracy": 0.6563641846179962, "num_tokens": 3827461.0, "step": 670 }, { "entropy": 1.5870350360870362, "epoch": 0.3441295546558704, "grad_norm": 0.20323996245861053, "learning_rate": 1.7709176788124158e-05, "loss": 1.6215, "mean_token_accuracy": 0.624784529209137, "num_tokens": 3884189.0, "step": 680 }, { "entropy": 1.6935706734657288, "epoch": 0.3491902834008097, "grad_norm": 0.24285322427749634, "learning_rate": 1.767543859649123e-05, "loss": 1.7141, "mean_token_accuracy": 0.6097041130065918, "num_tokens": 3939148.0, "step": 690 }, { "entropy": 1.5216692566871644, "epoch": 0.354251012145749, "grad_norm": 0.24251361191272736, "learning_rate": 1.76417004048583e-05, "loss": 1.526, "mean_token_accuracy": 0.6344065189361572, "num_tokens": 3996059.0, "step": 700 }, { "entropy": 1.584353470802307, "epoch": 0.35931174089068824, "grad_norm": 0.22013038396835327, "learning_rate": 1.7607962213225373e-05, "loss": 1.5894, "mean_token_accuracy": 0.6244750499725342, "num_tokens": 4056179.0, "step": 710 }, { "entropy": 1.499224054813385, "epoch": 0.3643724696356275, "grad_norm": 0.22103145718574524, "learning_rate": 1.7574224021592443e-05, "loss": 1.5209, "mean_token_accuracy": 0.6322570383548737, "num_tokens": 4114329.0, "step": 720 }, { "entropy": 1.3952741980552674, "epoch": 0.3694331983805668, "grad_norm": 0.19164645671844482, "learning_rate": 1.7540485829959516e-05, "loss": 1.4095, "mean_token_accuracy": 0.6568491697311402, "num_tokens": 4167072.0, "step": 730 }, { "entropy": 1.484491491317749, "epoch": 0.37449392712550605, "grad_norm": 0.22778365015983582, "learning_rate": 1.7506747638326585e-05, "loss": 1.5054, "mean_token_accuracy": 0.6413045108318329, "num_tokens": 4225902.0, "step": 740 }, { "entropy": 1.5875544667243957, "epoch": 0.37955465587044535, "grad_norm": 0.22424441576004028, "learning_rate": 1.7473009446693658e-05, "loss": 1.6189, "mean_token_accuracy": 0.6218379735946655, "num_tokens": 4282716.0, "step": 750 }, { "entropy": 1.5914431929588317, "epoch": 0.38461538461538464, "grad_norm": 0.22598877549171448, "learning_rate": 1.743927125506073e-05, "loss": 1.629, "mean_token_accuracy": 0.6168800354003906, "num_tokens": 4341769.0, "step": 760 }, { "entropy": 1.588646697998047, "epoch": 0.3896761133603239, "grad_norm": 0.24020566046237946, "learning_rate": 1.74055330634278e-05, "loss": 1.5962, "mean_token_accuracy": 0.6249550580978394, "num_tokens": 4400298.0, "step": 770 }, { "entropy": 1.524513852596283, "epoch": 0.39473684210526316, "grad_norm": 0.19308218359947205, "learning_rate": 1.7371794871794873e-05, "loss": 1.5494, "mean_token_accuracy": 0.6319825410842895, "num_tokens": 4456170.0, "step": 780 }, { "entropy": 1.5187025308609008, "epoch": 0.39979757085020245, "grad_norm": 0.2745817303657532, "learning_rate": 1.7338056680161946e-05, "loss": 1.5286, "mean_token_accuracy": 0.641098040342331, "num_tokens": 4509439.0, "step": 790 }, { "entropy": 1.5416448593139649, "epoch": 0.4048582995951417, "grad_norm": 0.2520337998867035, "learning_rate": 1.7304318488529016e-05, "loss": 1.5522, "mean_token_accuracy": 0.6389577805995941, "num_tokens": 4568509.0, "step": 800 }, { "entropy": 1.622413122653961, "epoch": 0.409919028340081, "grad_norm": 0.20173849165439606, "learning_rate": 1.7270580296896085e-05, "loss": 1.6312, "mean_token_accuracy": 0.6254597425460815, "num_tokens": 4621157.0, "step": 810 }, { "entropy": 1.6604674816131593, "epoch": 0.4149797570850202, "grad_norm": 0.23679770529270172, "learning_rate": 1.723684210526316e-05, "loss": 1.6884, "mean_token_accuracy": 0.6142423152923584, "num_tokens": 4673418.0, "step": 820 }, { "entropy": 1.512584674358368, "epoch": 0.4200404858299595, "grad_norm": 0.22097937762737274, "learning_rate": 1.720310391363023e-05, "loss": 1.5394, "mean_token_accuracy": 0.6410868644714356, "num_tokens": 4731386.0, "step": 830 }, { "entropy": 1.4811100244522095, "epoch": 0.4251012145748988, "grad_norm": 0.1975807249546051, "learning_rate": 1.71693657219973e-05, "loss": 1.474, "mean_token_accuracy": 0.6388140618801117, "num_tokens": 4784928.0, "step": 840 }, { "entropy": 1.6224814653396606, "epoch": 0.43016194331983804, "grad_norm": 0.21695128083229065, "learning_rate": 1.7135627530364374e-05, "loss": 1.6465, "mean_token_accuracy": 0.6171948432922363, "num_tokens": 4844351.0, "step": 850 }, { "entropy": 1.4179201185703278, "epoch": 0.4352226720647773, "grad_norm": 0.2105616182088852, "learning_rate": 1.7101889338731447e-05, "loss": 1.4287, "mean_token_accuracy": 0.6521054327487945, "num_tokens": 4902506.0, "step": 860 }, { "entropy": 1.5097766757011413, "epoch": 0.4402834008097166, "grad_norm": 0.23443420231342316, "learning_rate": 1.7068151147098516e-05, "loss": 1.526, "mean_token_accuracy": 0.6371770858764648, "num_tokens": 4957377.0, "step": 870 }, { "entropy": 1.499946367740631, "epoch": 0.44534412955465585, "grad_norm": 0.1935402899980545, "learning_rate": 1.703441295546559e-05, "loss": 1.536, "mean_token_accuracy": 0.6418095469474793, "num_tokens": 5019057.0, "step": 880 }, { "entropy": 1.5840212941169738, "epoch": 0.45040485829959515, "grad_norm": 0.2871309518814087, "learning_rate": 1.7000674763832662e-05, "loss": 1.5944, "mean_token_accuracy": 0.6274131119251252, "num_tokens": 5072125.0, "step": 890 }, { "entropy": 1.6296968936920166, "epoch": 0.45546558704453444, "grad_norm": 0.19836841523647308, "learning_rate": 1.696693657219973e-05, "loss": 1.6328, "mean_token_accuracy": 0.621731948852539, "num_tokens": 5127397.0, "step": 900 }, { "entropy": 1.5373274445533753, "epoch": 0.4605263157894737, "grad_norm": 0.24680444598197937, "learning_rate": 1.69331983805668e-05, "loss": 1.5417, "mean_token_accuracy": 0.628632801771164, "num_tokens": 5179785.0, "step": 910 }, { "entropy": 1.4075371384620667, "epoch": 0.46558704453441296, "grad_norm": 0.23700740933418274, "learning_rate": 1.6899460188933874e-05, "loss": 1.4108, "mean_token_accuracy": 0.6608946800231934, "num_tokens": 5235573.0, "step": 920 }, { "entropy": 1.5140914797782898, "epoch": 0.4706477732793522, "grad_norm": 0.23013481497764587, "learning_rate": 1.6865721997300947e-05, "loss": 1.5085, "mean_token_accuracy": 0.6322543203830719, "num_tokens": 5294146.0, "step": 930 }, { "entropy": 1.5315414309501647, "epoch": 0.4757085020242915, "grad_norm": 0.27098962664604187, "learning_rate": 1.6831983805668016e-05, "loss": 1.5617, "mean_token_accuracy": 0.6292850613594055, "num_tokens": 5350989.0, "step": 940 }, { "entropy": 1.479003095626831, "epoch": 0.4807692307692308, "grad_norm": 0.1984509378671646, "learning_rate": 1.679824561403509e-05, "loss": 1.4812, "mean_token_accuracy": 0.6419821918010712, "num_tokens": 5407325.0, "step": 950 }, { "entropy": 1.5653569459915162, "epoch": 0.48582995951417, "grad_norm": 0.2867957353591919, "learning_rate": 1.6764507422402162e-05, "loss": 1.6079, "mean_token_accuracy": 0.6316430389881134, "num_tokens": 5460946.0, "step": 960 }, { "entropy": 1.476916539669037, "epoch": 0.4908906882591093, "grad_norm": 0.30787891149520874, "learning_rate": 1.673076923076923e-05, "loss": 1.48, "mean_token_accuracy": 0.6418763399124146, "num_tokens": 5521311.0, "step": 970 }, { "entropy": 1.521777379512787, "epoch": 0.4959514170040486, "grad_norm": 0.22446390986442566, "learning_rate": 1.6697031039136305e-05, "loss": 1.5293, "mean_token_accuracy": 0.6305320382118225, "num_tokens": 5586273.0, "step": 980 }, { "entropy": 1.5841980934143067, "epoch": 0.5010121457489879, "grad_norm": 0.24676790833473206, "learning_rate": 1.6663292847503377e-05, "loss": 1.6064, "mean_token_accuracy": 0.6247429788112641, "num_tokens": 5645607.0, "step": 990 }, { "entropy": 1.5454851269721985, "epoch": 0.5060728744939271, "grad_norm": 0.2086755633354187, "learning_rate": 1.6629554655870447e-05, "loss": 1.5715, "mean_token_accuracy": 0.6307513952255249, "num_tokens": 5703782.0, "step": 1000 }, { "entropy": 1.479654276371002, "epoch": 0.5111336032388664, "grad_norm": 0.21369728446006775, "learning_rate": 1.6595816464237517e-05, "loss": 1.4857, "mean_token_accuracy": 0.6503060281276702, "num_tokens": 5756050.0, "step": 1010 }, { "entropy": 1.5413155913352967, "epoch": 0.5161943319838057, "grad_norm": 0.29068905115127563, "learning_rate": 1.656207827260459e-05, "loss": 1.5625, "mean_token_accuracy": 0.6323202788829804, "num_tokens": 5813575.0, "step": 1020 }, { "entropy": 1.5284120678901671, "epoch": 0.521255060728745, "grad_norm": 0.26866260170936584, "learning_rate": 1.6528340080971662e-05, "loss": 1.5296, "mean_token_accuracy": 0.6371418595314026, "num_tokens": 5868292.0, "step": 1030 }, { "entropy": 1.5233107686042786, "epoch": 0.5263157894736842, "grad_norm": 0.2544384300708771, "learning_rate": 1.6494601889338732e-05, "loss": 1.5192, "mean_token_accuracy": 0.629064416885376, "num_tokens": 5926581.0, "step": 1040 }, { "entropy": 1.4819204211235046, "epoch": 0.5313765182186235, "grad_norm": 0.2691729962825775, "learning_rate": 1.6460863697705805e-05, "loss": 1.489, "mean_token_accuracy": 0.6453047692775726, "num_tokens": 5983604.0, "step": 1050 }, { "entropy": 1.343429481983185, "epoch": 0.5364372469635628, "grad_norm": 0.21679846942424774, "learning_rate": 1.6427125506072878e-05, "loss": 1.34, "mean_token_accuracy": 0.669063252210617, "num_tokens": 6040931.0, "step": 1060 }, { "entropy": 1.620318102836609, "epoch": 0.541497975708502, "grad_norm": 0.2846720516681671, "learning_rate": 1.6393387314439947e-05, "loss": 1.6464, "mean_token_accuracy": 0.625487893819809, "num_tokens": 6094196.0, "step": 1070 }, { "entropy": 1.6185827255249023, "epoch": 0.5465587044534413, "grad_norm": 0.24272854626178741, "learning_rate": 1.635964912280702e-05, "loss": 1.6422, "mean_token_accuracy": 0.6269473850727081, "num_tokens": 6150350.0, "step": 1080 }, { "entropy": 1.5225468039512635, "epoch": 0.5516194331983806, "grad_norm": 0.2274954468011856, "learning_rate": 1.632591093117409e-05, "loss": 1.5128, "mean_token_accuracy": 0.6347517490386962, "num_tokens": 6203671.0, "step": 1090 }, { "entropy": 1.4914517521858215, "epoch": 0.5566801619433198, "grad_norm": 0.20096349716186523, "learning_rate": 1.6292172739541163e-05, "loss": 1.5056, "mean_token_accuracy": 0.6353028774261474, "num_tokens": 6264849.0, "step": 1100 }, { "entropy": 1.5485971808433532, "epoch": 0.5617408906882592, "grad_norm": 0.24010322988033295, "learning_rate": 1.6258434547908232e-05, "loss": 1.5398, "mean_token_accuracy": 0.6348303139209748, "num_tokens": 6321385.0, "step": 1110 }, { "entropy": 1.5580425620079041, "epoch": 0.5668016194331984, "grad_norm": 0.21382348239421844, "learning_rate": 1.6224696356275305e-05, "loss": 1.5824, "mean_token_accuracy": 0.6257192850112915, "num_tokens": 6377498.0, "step": 1120 }, { "entropy": 1.5573256254196166, "epoch": 0.5718623481781376, "grad_norm": 0.24488642811775208, "learning_rate": 1.6190958164642378e-05, "loss": 1.5628, "mean_token_accuracy": 0.6246356785297393, "num_tokens": 6432433.0, "step": 1130 }, { "entropy": 1.4993282079696655, "epoch": 0.5769230769230769, "grad_norm": 0.2223263829946518, "learning_rate": 1.6157219973009447e-05, "loss": 1.5111, "mean_token_accuracy": 0.6382519125938415, "num_tokens": 6492547.0, "step": 1140 }, { "entropy": 1.4915427923202516, "epoch": 0.5819838056680162, "grad_norm": 0.232344850897789, "learning_rate": 1.612348178137652e-05, "loss": 1.5079, "mean_token_accuracy": 0.6373468995094299, "num_tokens": 6545726.0, "step": 1150 }, { "entropy": 1.561433982849121, "epoch": 0.5870445344129555, "grad_norm": 0.2586466073989868, "learning_rate": 1.6089743589743593e-05, "loss": 1.5638, "mean_token_accuracy": 0.6296638369560241, "num_tokens": 6606186.0, "step": 1160 }, { "entropy": 1.4575641989707946, "epoch": 0.5921052631578947, "grad_norm": 0.23262882232666016, "learning_rate": 1.6056005398110663e-05, "loss": 1.4734, "mean_token_accuracy": 0.646199643611908, "num_tokens": 6666588.0, "step": 1170 }, { "entropy": 1.5205667972564698, "epoch": 0.597165991902834, "grad_norm": 0.2673611044883728, "learning_rate": 1.6022267206477736e-05, "loss": 1.5302, "mean_token_accuracy": 0.6332932889461518, "num_tokens": 6728351.0, "step": 1180 }, { "entropy": 1.5646514296531677, "epoch": 0.6022267206477733, "grad_norm": 0.24620375037193298, "learning_rate": 1.5988529014844805e-05, "loss": 1.5848, "mean_token_accuracy": 0.6307655155658722, "num_tokens": 6789626.0, "step": 1190 }, { "entropy": 1.597201144695282, "epoch": 0.6072874493927125, "grad_norm": 0.28606894612312317, "learning_rate": 1.5954790823211878e-05, "loss": 1.5779, "mean_token_accuracy": 0.6305352866649627, "num_tokens": 6840612.0, "step": 1200 }, { "entropy": 1.6090473532676697, "epoch": 0.6123481781376519, "grad_norm": 0.26432231068611145, "learning_rate": 1.5921052631578948e-05, "loss": 1.6361, "mean_token_accuracy": 0.6166266143321991, "num_tokens": 6898491.0, "step": 1210 }, { "entropy": 1.5265617489814758, "epoch": 0.6174089068825911, "grad_norm": 0.24568380415439606, "learning_rate": 1.588731443994602e-05, "loss": 1.5239, "mean_token_accuracy": 0.6334243714809418, "num_tokens": 6953682.0, "step": 1220 }, { "entropy": 1.5580573081970215, "epoch": 0.6224696356275303, "grad_norm": 0.2606264650821686, "learning_rate": 1.5853576248313093e-05, "loss": 1.5652, "mean_token_accuracy": 0.6310077726840972, "num_tokens": 7009187.0, "step": 1230 }, { "entropy": 1.5315574645996093, "epoch": 0.6275303643724697, "grad_norm": 0.23248089849948883, "learning_rate": 1.5819838056680163e-05, "loss": 1.5515, "mean_token_accuracy": 0.635067343711853, "num_tokens": 7069568.0, "step": 1240 }, { "entropy": 1.4524394154548645, "epoch": 0.6325910931174089, "grad_norm": 0.20559658110141754, "learning_rate": 1.5786099865047236e-05, "loss": 1.4655, "mean_token_accuracy": 0.644383716583252, "num_tokens": 7132908.0, "step": 1250 }, { "entropy": 1.5834705471992492, "epoch": 0.6376518218623481, "grad_norm": 0.2312365472316742, "learning_rate": 1.5752361673414305e-05, "loss": 1.6107, "mean_token_accuracy": 0.6216763257980347, "num_tokens": 7193948.0, "step": 1260 }, { "entropy": 1.5223916292190551, "epoch": 0.6427125506072875, "grad_norm": 0.302206426858902, "learning_rate": 1.5718623481781378e-05, "loss": 1.5347, "mean_token_accuracy": 0.6370865941047669, "num_tokens": 7249715.0, "step": 1270 }, { "entropy": 1.68130704164505, "epoch": 0.6477732793522267, "grad_norm": 0.24234986305236816, "learning_rate": 1.5684885290148448e-05, "loss": 1.703, "mean_token_accuracy": 0.6063290297985077, "num_tokens": 7306114.0, "step": 1280 }, { "entropy": 1.6868065476417542, "epoch": 0.652834008097166, "grad_norm": 0.2558751702308655, "learning_rate": 1.565114709851552e-05, "loss": 1.7002, "mean_token_accuracy": 0.6132429718971253, "num_tokens": 7363295.0, "step": 1290 }, { "entropy": 1.4368155479431153, "epoch": 0.6578947368421053, "grad_norm": 0.3175618350505829, "learning_rate": 1.561740890688259e-05, "loss": 1.4368, "mean_token_accuracy": 0.6544794201850891, "num_tokens": 7415420.0, "step": 1300 }, { "entropy": 1.5615759491920471, "epoch": 0.6629554655870445, "grad_norm": 0.2953908443450928, "learning_rate": 1.5583670715249663e-05, "loss": 1.5617, "mean_token_accuracy": 0.6310927093029022, "num_tokens": 7475642.0, "step": 1310 }, { "entropy": 1.4271193981170653, "epoch": 0.6680161943319838, "grad_norm": 0.24695925414562225, "learning_rate": 1.5549932523616736e-05, "loss": 1.4189, "mean_token_accuracy": 0.6522926926612854, "num_tokens": 7538029.0, "step": 1320 }, { "entropy": 1.5500613093376159, "epoch": 0.6730769230769231, "grad_norm": 0.2324494868516922, "learning_rate": 1.5516194331983806e-05, "loss": 1.5641, "mean_token_accuracy": 0.626498419046402, "num_tokens": 7597460.0, "step": 1330 }, { "entropy": 1.476065456867218, "epoch": 0.6781376518218624, "grad_norm": 0.2418016493320465, "learning_rate": 1.548245614035088e-05, "loss": 1.4751, "mean_token_accuracy": 0.641443008184433, "num_tokens": 7652792.0, "step": 1340 }, { "entropy": 1.5325765252113341, "epoch": 0.6831983805668016, "grad_norm": 0.23513104021549225, "learning_rate": 1.544871794871795e-05, "loss": 1.5499, "mean_token_accuracy": 0.6278112173080445, "num_tokens": 7706166.0, "step": 1350 }, { "entropy": 1.5952306509017944, "epoch": 0.6882591093117408, "grad_norm": 0.22960874438285828, "learning_rate": 1.541497975708502e-05, "loss": 1.6124, "mean_token_accuracy": 0.623203706741333, "num_tokens": 7762524.0, "step": 1360 }, { "entropy": 1.4605698585510254, "epoch": 0.6933198380566802, "grad_norm": 0.2283059060573578, "learning_rate": 1.5381241565452094e-05, "loss": 1.4702, "mean_token_accuracy": 0.6456966698169708, "num_tokens": 7816597.0, "step": 1370 }, { "entropy": 1.3722566485404968, "epoch": 0.6983805668016194, "grad_norm": 0.24912376701831818, "learning_rate": 1.5347503373819163e-05, "loss": 1.3777, "mean_token_accuracy": 0.6624338209629059, "num_tokens": 7878907.0, "step": 1380 }, { "entropy": 1.5658705353736877, "epoch": 0.7034412955465587, "grad_norm": 0.26213786005973816, "learning_rate": 1.5313765182186236e-05, "loss": 1.5614, "mean_token_accuracy": 0.6281410813331604, "num_tokens": 7931234.0, "step": 1390 }, { "entropy": 1.4248001098632812, "epoch": 0.708502024291498, "grad_norm": 0.3189115822315216, "learning_rate": 1.5280026990553306e-05, "loss": 1.4343, "mean_token_accuracy": 0.6542839646339417, "num_tokens": 7983537.0, "step": 1400 }, { "entropy": 1.499564802646637, "epoch": 0.7135627530364372, "grad_norm": 0.24217011034488678, "learning_rate": 1.5246288798920379e-05, "loss": 1.5238, "mean_token_accuracy": 0.6327670216560364, "num_tokens": 8039434.0, "step": 1410 }, { "entropy": 1.5084555625915528, "epoch": 0.7186234817813765, "grad_norm": 0.21525943279266357, "learning_rate": 1.521255060728745e-05, "loss": 1.5051, "mean_token_accuracy": 0.6452975988388061, "num_tokens": 8095407.0, "step": 1420 }, { "entropy": 1.5463826656341553, "epoch": 0.7236842105263158, "grad_norm": 0.25616827607154846, "learning_rate": 1.5178812415654523e-05, "loss": 1.5526, "mean_token_accuracy": 0.6282021820545196, "num_tokens": 8150109.0, "step": 1430 }, { "entropy": 1.6857656121253968, "epoch": 0.728744939271255, "grad_norm": 0.25321727991104126, "learning_rate": 1.5145074224021594e-05, "loss": 1.7184, "mean_token_accuracy": 0.6112756371498108, "num_tokens": 8214438.0, "step": 1440 }, { "entropy": 1.5806215167045594, "epoch": 0.7338056680161943, "grad_norm": 0.21112073957920074, "learning_rate": 1.5111336032388665e-05, "loss": 1.5852, "mean_token_accuracy": 0.6202045798301696, "num_tokens": 8273743.0, "step": 1450 }, { "entropy": 1.4979040026664734, "epoch": 0.7388663967611336, "grad_norm": 0.22126545011997223, "learning_rate": 1.5077597840755738e-05, "loss": 1.5201, "mean_token_accuracy": 0.637889975309372, "num_tokens": 8334507.0, "step": 1460 }, { "entropy": 1.5463458061218263, "epoch": 0.7439271255060729, "grad_norm": 0.22952505946159363, "learning_rate": 1.5043859649122808e-05, "loss": 1.5498, "mean_token_accuracy": 0.6345071375370026, "num_tokens": 8390405.0, "step": 1470 }, { "entropy": 1.423577868938446, "epoch": 0.7489878542510121, "grad_norm": 0.2474886029958725, "learning_rate": 1.5010121457489879e-05, "loss": 1.4306, "mean_token_accuracy": 0.655298399925232, "num_tokens": 8452056.0, "step": 1480 }, { "entropy": 1.5899730324745178, "epoch": 0.7540485829959515, "grad_norm": 0.2736392021179199, "learning_rate": 1.497638326585695e-05, "loss": 1.581, "mean_token_accuracy": 0.6186748504638672, "num_tokens": 8511999.0, "step": 1490 }, { "entropy": 1.5433414101600647, "epoch": 0.7591093117408907, "grad_norm": 0.2836778163909912, "learning_rate": 1.4942645074224023e-05, "loss": 1.5544, "mean_token_accuracy": 0.6286308348178864, "num_tokens": 8566021.0, "step": 1500 }, { "entropy": 1.4887511134147644, "epoch": 0.7641700404858299, "grad_norm": 0.33601313829421997, "learning_rate": 1.4908906882591094e-05, "loss": 1.4994, "mean_token_accuracy": 0.6406654596328736, "num_tokens": 8622757.0, "step": 1510 }, { "entropy": 1.5212846279144288, "epoch": 0.7692307692307693, "grad_norm": 0.2853647470474243, "learning_rate": 1.4875168690958165e-05, "loss": 1.5409, "mean_token_accuracy": 0.6337429225444794, "num_tokens": 8677777.0, "step": 1520 }, { "entropy": 1.4735643148422242, "epoch": 0.7742914979757085, "grad_norm": 0.2369018942117691, "learning_rate": 1.4841430499325238e-05, "loss": 1.4812, "mean_token_accuracy": 0.6412514448165894, "num_tokens": 8735792.0, "step": 1530 }, { "entropy": 1.5245864272117615, "epoch": 0.7793522267206477, "grad_norm": 0.2317512333393097, "learning_rate": 1.480769230769231e-05, "loss": 1.5354, "mean_token_accuracy": 0.6362193703651429, "num_tokens": 8795324.0, "step": 1540 }, { "entropy": 1.487471914291382, "epoch": 0.7844129554655871, "grad_norm": 0.24812865257263184, "learning_rate": 1.477395411605938e-05, "loss": 1.487, "mean_token_accuracy": 0.6461592555046082, "num_tokens": 8848190.0, "step": 1550 }, { "entropy": 1.446857714653015, "epoch": 0.7894736842105263, "grad_norm": 0.23715689778327942, "learning_rate": 1.474021592442645e-05, "loss": 1.4494, "mean_token_accuracy": 0.654287850856781, "num_tokens": 8900078.0, "step": 1560 }, { "entropy": 1.6414281487464906, "epoch": 0.7945344129554656, "grad_norm": 0.26817786693573, "learning_rate": 1.4706477732793523e-05, "loss": 1.6536, "mean_token_accuracy": 0.6186295211315155, "num_tokens": 8955471.0, "step": 1570 }, { "entropy": 1.5608402729034423, "epoch": 0.7995951417004049, "grad_norm": 0.2652844190597534, "learning_rate": 1.4672739541160594e-05, "loss": 1.5787, "mean_token_accuracy": 0.62896608710289, "num_tokens": 9013912.0, "step": 1580 }, { "entropy": 1.5290770292282105, "epoch": 0.8046558704453441, "grad_norm": 0.25053921341896057, "learning_rate": 1.4639001349527666e-05, "loss": 1.543, "mean_token_accuracy": 0.6325620353221894, "num_tokens": 9073236.0, "step": 1590 }, { "entropy": 1.473749542236328, "epoch": 0.8097165991902834, "grad_norm": 0.2638007402420044, "learning_rate": 1.4605263157894739e-05, "loss": 1.4962, "mean_token_accuracy": 0.64018235206604, "num_tokens": 9130345.0, "step": 1600 }, { "entropy": 1.4807411432266235, "epoch": 0.8147773279352226, "grad_norm": 0.2131456434726715, "learning_rate": 1.457152496626181e-05, "loss": 1.4896, "mean_token_accuracy": 0.6396925866603851, "num_tokens": 9181695.0, "step": 1610 }, { "entropy": 1.4747131943702698, "epoch": 0.819838056680162, "grad_norm": 0.25145605206489563, "learning_rate": 1.4537786774628881e-05, "loss": 1.4513, "mean_token_accuracy": 0.6473784625530243, "num_tokens": 9237367.0, "step": 1620 }, { "entropy": 1.5602935075759887, "epoch": 0.8248987854251012, "grad_norm": 0.24879582226276398, "learning_rate": 1.4504048582995954e-05, "loss": 1.565, "mean_token_accuracy": 0.6289263606071472, "num_tokens": 9302101.0, "step": 1630 }, { "entropy": 1.4359328031539917, "epoch": 0.8299595141700404, "grad_norm": 0.21965323388576508, "learning_rate": 1.4470310391363025e-05, "loss": 1.4408, "mean_token_accuracy": 0.6550322711467743, "num_tokens": 9361115.0, "step": 1640 }, { "entropy": 1.5193968892097474, "epoch": 0.8350202429149798, "grad_norm": 0.27555471658706665, "learning_rate": 1.4436572199730096e-05, "loss": 1.5173, "mean_token_accuracy": 0.6335371434688568, "num_tokens": 9417109.0, "step": 1650 }, { "entropy": 1.5528843998908997, "epoch": 0.840080971659919, "grad_norm": 0.2689385414123535, "learning_rate": 1.4402834008097166e-05, "loss": 1.5668, "mean_token_accuracy": 0.6325760573148728, "num_tokens": 9473473.0, "step": 1660 }, { "entropy": 1.4268815875053407, "epoch": 0.8451417004048583, "grad_norm": 0.3029450476169586, "learning_rate": 1.4369095816464239e-05, "loss": 1.4197, "mean_token_accuracy": 0.6514874160289764, "num_tokens": 9530575.0, "step": 1670 }, { "entropy": 1.4315476655960082, "epoch": 0.8502024291497976, "grad_norm": 0.24891141057014465, "learning_rate": 1.433535762483131e-05, "loss": 1.4228, "mean_token_accuracy": 0.656501293182373, "num_tokens": 9590931.0, "step": 1680 }, { "entropy": 1.5360273122787476, "epoch": 0.8552631578947368, "grad_norm": 0.30486878752708435, "learning_rate": 1.4301619433198381e-05, "loss": 1.5474, "mean_token_accuracy": 0.6348777890205384, "num_tokens": 9644652.0, "step": 1690 }, { "entropy": 1.6101372838020325, "epoch": 0.8603238866396761, "grad_norm": 0.23739294707775116, "learning_rate": 1.4267881241565454e-05, "loss": 1.6222, "mean_token_accuracy": 0.6213286280632019, "num_tokens": 9697296.0, "step": 1700 }, { "entropy": 1.56304851770401, "epoch": 0.8653846153846154, "grad_norm": 0.2499363124370575, "learning_rate": 1.4234143049932525e-05, "loss": 1.5642, "mean_token_accuracy": 0.6282478511333466, "num_tokens": 9755265.0, "step": 1710 }, { "entropy": 1.4945539951324462, "epoch": 0.8704453441295547, "grad_norm": 0.24991373717784882, "learning_rate": 1.4200404858299596e-05, "loss": 1.5336, "mean_token_accuracy": 0.6350914716720581, "num_tokens": 9815899.0, "step": 1720 }, { "entropy": 1.5779843926429749, "epoch": 0.8755060728744939, "grad_norm": 0.24115176498889923, "learning_rate": 1.416666666666667e-05, "loss": 1.5933, "mean_token_accuracy": 0.6283825278282166, "num_tokens": 9872513.0, "step": 1730 }, { "entropy": 1.4204454302787781, "epoch": 0.8805668016194332, "grad_norm": 0.22373662889003754, "learning_rate": 1.413292847503374e-05, "loss": 1.4136, "mean_token_accuracy": 0.6557290494441986, "num_tokens": 9932083.0, "step": 1740 }, { "entropy": 1.636140561103821, "epoch": 0.8856275303643725, "grad_norm": 0.29674816131591797, "learning_rate": 1.409919028340081e-05, "loss": 1.662, "mean_token_accuracy": 0.6216094970703125, "num_tokens": 9988494.0, "step": 1750 }, { "entropy": 1.549510085582733, "epoch": 0.8906882591093117, "grad_norm": 0.24920591711997986, "learning_rate": 1.4065452091767881e-05, "loss": 1.5553, "mean_token_accuracy": 0.6351737916469574, "num_tokens": 10041605.0, "step": 1760 }, { "entropy": 1.5425897359848022, "epoch": 0.895748987854251, "grad_norm": 0.2719487249851227, "learning_rate": 1.4031713900134953e-05, "loss": 1.5457, "mean_token_accuracy": 0.6341227173805237, "num_tokens": 10097471.0, "step": 1770 }, { "entropy": 1.5892379999160766, "epoch": 0.9008097165991903, "grad_norm": 0.26108458638191223, "learning_rate": 1.3997975708502025e-05, "loss": 1.5846, "mean_token_accuracy": 0.6257834196090698, "num_tokens": 10157839.0, "step": 1780 }, { "entropy": 1.5164817094802856, "epoch": 0.9058704453441295, "grad_norm": 0.255862295627594, "learning_rate": 1.3964237516869097e-05, "loss": 1.5325, "mean_token_accuracy": 0.6302552342414856, "num_tokens": 10215568.0, "step": 1790 }, { "entropy": 1.5202425956726073, "epoch": 0.9109311740890689, "grad_norm": 0.2746359705924988, "learning_rate": 1.3930499325236168e-05, "loss": 1.5264, "mean_token_accuracy": 0.6395917236804962, "num_tokens": 10277752.0, "step": 1800 }, { "entropy": 1.5994849681854248, "epoch": 0.9159919028340081, "grad_norm": 0.259244441986084, "learning_rate": 1.389676113360324e-05, "loss": 1.6126, "mean_token_accuracy": 0.6206628024578095, "num_tokens": 10332436.0, "step": 1810 }, { "entropy": 1.5928335905075073, "epoch": 0.9210526315789473, "grad_norm": 0.30553993582725525, "learning_rate": 1.3863022941970312e-05, "loss": 1.604, "mean_token_accuracy": 0.6238301634788513, "num_tokens": 10385660.0, "step": 1820 }, { "entropy": 1.5503159523010255, "epoch": 0.9261133603238867, "grad_norm": 0.2695212662220001, "learning_rate": 1.3829284750337383e-05, "loss": 1.5727, "mean_token_accuracy": 0.6283754229545593, "num_tokens": 10440034.0, "step": 1830 }, { "entropy": 1.472425067424774, "epoch": 0.9311740890688259, "grad_norm": 0.26096370816230774, "learning_rate": 1.3795546558704453e-05, "loss": 1.4744, "mean_token_accuracy": 0.6468591213226318, "num_tokens": 10495586.0, "step": 1840 }, { "entropy": 1.4272591471672058, "epoch": 0.9362348178137652, "grad_norm": 0.2956947088241577, "learning_rate": 1.3761808367071526e-05, "loss": 1.4446, "mean_token_accuracy": 0.6488463521003723, "num_tokens": 10546414.0, "step": 1850 }, { "entropy": 1.4026084661483764, "epoch": 0.9412955465587044, "grad_norm": 0.24682804942131042, "learning_rate": 1.3728070175438597e-05, "loss": 1.3906, "mean_token_accuracy": 0.6536332130432129, "num_tokens": 10603382.0, "step": 1860 }, { "entropy": 1.585541033744812, "epoch": 0.9463562753036437, "grad_norm": 0.28304097056388855, "learning_rate": 1.3694331983805668e-05, "loss": 1.5972, "mean_token_accuracy": 0.6255220711231232, "num_tokens": 10666030.0, "step": 1870 }, { "entropy": 1.580546224117279, "epoch": 0.951417004048583, "grad_norm": 0.2616841793060303, "learning_rate": 1.3660593792172741e-05, "loss": 1.6051, "mean_token_accuracy": 0.6219939827919007, "num_tokens": 10725741.0, "step": 1880 }, { "entropy": 1.6499082326889039, "epoch": 0.9564777327935222, "grad_norm": 0.2620835304260254, "learning_rate": 1.3626855600539812e-05, "loss": 1.6969, "mean_token_accuracy": 0.6166241288185119, "num_tokens": 10787880.0, "step": 1890 }, { "entropy": 1.3888215899467469, "epoch": 0.9615384615384616, "grad_norm": 0.2680383324623108, "learning_rate": 1.3593117408906883e-05, "loss": 1.3917, "mean_token_accuracy": 0.6527835667133332, "num_tokens": 10844894.0, "step": 1900 }, { "entropy": 1.3836533963680266, "epoch": 0.9665991902834008, "grad_norm": 0.35761716961860657, "learning_rate": 1.3559379217273956e-05, "loss": 1.3895, "mean_token_accuracy": 0.6636650562286377, "num_tokens": 10900721.0, "step": 1910 }, { "entropy": 1.451544201374054, "epoch": 0.97165991902834, "grad_norm": 0.26495417952537537, "learning_rate": 1.3525641025641028e-05, "loss": 1.447, "mean_token_accuracy": 0.6403470158576965, "num_tokens": 10951838.0, "step": 1920 }, { "entropy": 1.5138379335403442, "epoch": 0.9767206477732794, "grad_norm": 0.23315957188606262, "learning_rate": 1.3491902834008099e-05, "loss": 1.5385, "mean_token_accuracy": 0.6303693652153015, "num_tokens": 11010569.0, "step": 1930 }, { "entropy": 1.5039880394935607, "epoch": 0.9817813765182186, "grad_norm": 0.26653018593788147, "learning_rate": 1.3458164642375168e-05, "loss": 1.515, "mean_token_accuracy": 0.6446199715137482, "num_tokens": 11068307.0, "step": 1940 }, { "entropy": 1.5039002180099488, "epoch": 0.9868421052631579, "grad_norm": 0.24144147336483002, "learning_rate": 1.3424426450742241e-05, "loss": 1.5012, "mean_token_accuracy": 0.6414350152015686, "num_tokens": 11131378.0, "step": 1950 }, { "entropy": 1.5108654856681825, "epoch": 0.9919028340080972, "grad_norm": 0.33613070845603943, "learning_rate": 1.3390688259109312e-05, "loss": 1.5229, "mean_token_accuracy": 0.6330624580383301, "num_tokens": 11189667.0, "step": 1960 }, { "entropy": 1.43316547870636, "epoch": 0.9969635627530364, "grad_norm": 0.27450039982795715, "learning_rate": 1.3356950067476384e-05, "loss": 1.4358, "mean_token_accuracy": 0.6528611719608307, "num_tokens": 11248100.0, "step": 1970 }, { "entropy": 1.5782551288604736, "epoch": 1.0020242914979758, "grad_norm": 0.2942919433116913, "learning_rate": 1.3323211875843457e-05, "loss": 1.5945, "mean_token_accuracy": 0.622716897726059, "num_tokens": 11301434.0, "step": 1980 }, { "entropy": 1.5513013124465942, "epoch": 1.007085020242915, "grad_norm": 0.4627493619918823, "learning_rate": 1.3289473684210528e-05, "loss": 1.5645, "mean_token_accuracy": 0.6323555290699006, "num_tokens": 11357709.0, "step": 1990 }, { "entropy": 1.5218539357185363, "epoch": 1.0121457489878543, "grad_norm": 0.29789215326309204, "learning_rate": 1.3255735492577599e-05, "loss": 1.5296, "mean_token_accuracy": 0.6385591834783554, "num_tokens": 11409081.0, "step": 2000 }, { "entropy": 1.5782111883163452, "epoch": 1.0172064777327936, "grad_norm": 0.3623863458633423, "learning_rate": 1.3221997300944672e-05, "loss": 1.5815, "mean_token_accuracy": 0.6234244406223297, "num_tokens": 11461701.0, "step": 2010 }, { "entropy": 1.478236198425293, "epoch": 1.0222672064777327, "grad_norm": 0.24126943945884705, "learning_rate": 1.3188259109311743e-05, "loss": 1.4773, "mean_token_accuracy": 0.6408190190792084, "num_tokens": 11522781.0, "step": 2020 }, { "entropy": 1.474450170993805, "epoch": 1.027327935222672, "grad_norm": 0.27630022168159485, "learning_rate": 1.3154520917678813e-05, "loss": 1.4777, "mean_token_accuracy": 0.6390757083892822, "num_tokens": 11577690.0, "step": 2030 }, { "entropy": 1.38731769323349, "epoch": 1.0323886639676114, "grad_norm": 0.2594892382621765, "learning_rate": 1.3120782726045884e-05, "loss": 1.4113, "mean_token_accuracy": 0.6555228769779206, "num_tokens": 11634378.0, "step": 2040 }, { "entropy": 1.4996397018432617, "epoch": 1.0374493927125505, "grad_norm": 0.29768475890159607, "learning_rate": 1.3087044534412957e-05, "loss": 1.5046, "mean_token_accuracy": 0.6385474681854248, "num_tokens": 11691197.0, "step": 2050 }, { "entropy": 1.7156208992004394, "epoch": 1.04251012145749, "grad_norm": 0.30838677287101746, "learning_rate": 1.3053306342780028e-05, "loss": 1.7196, "mean_token_accuracy": 0.6081624507904053, "num_tokens": 11744812.0, "step": 2060 }, { "entropy": 1.4287778735160828, "epoch": 1.0475708502024292, "grad_norm": 0.30164098739624023, "learning_rate": 1.3019568151147099e-05, "loss": 1.4251, "mean_token_accuracy": 0.6514438152313232, "num_tokens": 11798182.0, "step": 2070 }, { "entropy": 1.6277110576629639, "epoch": 1.0526315789473684, "grad_norm": 0.27688923478126526, "learning_rate": 1.2985829959514172e-05, "loss": 1.637, "mean_token_accuracy": 0.6167466878890991, "num_tokens": 11853640.0, "step": 2080 }, { "entropy": 1.3780420899391175, "epoch": 1.0576923076923077, "grad_norm": 0.2407483607530594, "learning_rate": 1.2952091767881243e-05, "loss": 1.3775, "mean_token_accuracy": 0.6617866694927216, "num_tokens": 11909613.0, "step": 2090 }, { "entropy": 1.4581809163093566, "epoch": 1.062753036437247, "grad_norm": 0.3337167203426361, "learning_rate": 1.2918353576248314e-05, "loss": 1.4533, "mean_token_accuracy": 0.6537846267223358, "num_tokens": 11967740.0, "step": 2100 }, { "entropy": 1.4676265239715576, "epoch": 1.0678137651821862, "grad_norm": 0.2601131796836853, "learning_rate": 1.2884615384615386e-05, "loss": 1.4607, "mean_token_accuracy": 0.6463825047016144, "num_tokens": 12020775.0, "step": 2110 }, { "entropy": 1.5144903063774109, "epoch": 1.0728744939271255, "grad_norm": 0.276044636964798, "learning_rate": 1.2850877192982459e-05, "loss": 1.5184, "mean_token_accuracy": 0.6339675188064575, "num_tokens": 12081273.0, "step": 2120 }, { "entropy": 1.5458029508590698, "epoch": 1.0779352226720649, "grad_norm": 0.3157075047492981, "learning_rate": 1.2817139001349528e-05, "loss": 1.5519, "mean_token_accuracy": 0.6369691550731659, "num_tokens": 12134672.0, "step": 2130 }, { "entropy": 1.4140147149562836, "epoch": 1.082995951417004, "grad_norm": 0.32847243547439575, "learning_rate": 1.27834008097166e-05, "loss": 1.4223, "mean_token_accuracy": 0.6571628749370575, "num_tokens": 12193683.0, "step": 2140 }, { "entropy": 1.5222583651542663, "epoch": 1.0880566801619433, "grad_norm": 0.2528051435947418, "learning_rate": 1.274966261808367e-05, "loss": 1.5229, "mean_token_accuracy": 0.6361405253410339, "num_tokens": 12249416.0, "step": 2150 }, { "entropy": 1.5322677731513976, "epoch": 1.0931174089068827, "grad_norm": 0.25397226214408875, "learning_rate": 1.2715924426450743e-05, "loss": 1.5353, "mean_token_accuracy": 0.6307880222797394, "num_tokens": 12312488.0, "step": 2160 }, { "entropy": 1.4451451063156129, "epoch": 1.0981781376518218, "grad_norm": 0.3207351565361023, "learning_rate": 1.2682186234817815e-05, "loss": 1.4532, "mean_token_accuracy": 0.6479784369468689, "num_tokens": 12365397.0, "step": 2170 }, { "entropy": 1.6216472387313843, "epoch": 1.1032388663967612, "grad_norm": 0.22639265656471252, "learning_rate": 1.2648448043184886e-05, "loss": 1.6331, "mean_token_accuracy": 0.6182599663734436, "num_tokens": 12426280.0, "step": 2180 }, { "entropy": 1.417205023765564, "epoch": 1.1082995951417005, "grad_norm": 0.31163787841796875, "learning_rate": 1.2614709851551959e-05, "loss": 1.4197, "mean_token_accuracy": 0.6472279012203217, "num_tokens": 12481836.0, "step": 2190 }, { "entropy": 1.5859375596046448, "epoch": 1.1133603238866396, "grad_norm": 0.2581881582736969, "learning_rate": 1.258097165991903e-05, "loss": 1.5947, "mean_token_accuracy": 0.6293219923973083, "num_tokens": 12537386.0, "step": 2200 }, { "entropy": 1.6218139290809632, "epoch": 1.118421052631579, "grad_norm": 0.27295926213264465, "learning_rate": 1.2547233468286101e-05, "loss": 1.6235, "mean_token_accuracy": 0.6221803069114685, "num_tokens": 12591068.0, "step": 2210 }, { "entropy": 1.5492971539497375, "epoch": 1.123481781376518, "grad_norm": 0.28580132126808167, "learning_rate": 1.251349527665317e-05, "loss": 1.5594, "mean_token_accuracy": 0.6253218352794647, "num_tokens": 12648353.0, "step": 2220 }, { "entropy": 1.649086058139801, "epoch": 1.1285425101214575, "grad_norm": 0.24511824548244476, "learning_rate": 1.2479757085020244e-05, "loss": 1.6621, "mean_token_accuracy": 0.6179795920848846, "num_tokens": 12700443.0, "step": 2230 }, { "entropy": 1.4533384203910829, "epoch": 1.1336032388663968, "grad_norm": 0.3033972382545471, "learning_rate": 1.2446018893387315e-05, "loss": 1.4451, "mean_token_accuracy": 0.6498919248580932, "num_tokens": 12748990.0, "step": 2240 }, { "entropy": 1.5436882257461548, "epoch": 1.1386639676113361, "grad_norm": 0.2811788022518158, "learning_rate": 1.2412280701754386e-05, "loss": 1.5508, "mean_token_accuracy": 0.6290224313735961, "num_tokens": 12807477.0, "step": 2250 }, { "entropy": 1.448672115802765, "epoch": 1.1437246963562753, "grad_norm": 0.29944077134132385, "learning_rate": 1.2378542510121459e-05, "loss": 1.4598, "mean_token_accuracy": 0.6483164548873901, "num_tokens": 12869123.0, "step": 2260 }, { "entropy": 1.3786328792572022, "epoch": 1.1487854251012146, "grad_norm": 0.27392685413360596, "learning_rate": 1.234480431848853e-05, "loss": 1.3767, "mean_token_accuracy": 0.6597134828567505, "num_tokens": 12924572.0, "step": 2270 }, { "entropy": 1.5663957238197326, "epoch": 1.1538461538461537, "grad_norm": 0.3136812150478363, "learning_rate": 1.2311066126855601e-05, "loss": 1.5661, "mean_token_accuracy": 0.6283589959144592, "num_tokens": 12982715.0, "step": 2280 }, { "entropy": 1.4102508783340455, "epoch": 1.158906882591093, "grad_norm": 0.33586448431015015, "learning_rate": 1.2277327935222674e-05, "loss": 1.4242, "mean_token_accuracy": 0.6464997053146362, "num_tokens": 13035535.0, "step": 2290 }, { "entropy": 1.4415246963500976, "epoch": 1.1639676113360324, "grad_norm": 0.24208928644657135, "learning_rate": 1.2243589743589746e-05, "loss": 1.4572, "mean_token_accuracy": 0.6485124588012695, "num_tokens": 13098688.0, "step": 2300 }, { "entropy": 1.490816557407379, "epoch": 1.1690283400809718, "grad_norm": 0.27268052101135254, "learning_rate": 1.2209851551956815e-05, "loss": 1.4841, "mean_token_accuracy": 0.6446694970130921, "num_tokens": 13155516.0, "step": 2310 }, { "entropy": 1.41130930185318, "epoch": 1.174089068825911, "grad_norm": 0.3298867642879486, "learning_rate": 1.2176113360323886e-05, "loss": 1.4114, "mean_token_accuracy": 0.6582064151763916, "num_tokens": 13208333.0, "step": 2320 }, { "entropy": 1.6078017115592957, "epoch": 1.1791497975708503, "grad_norm": 0.2950042188167572, "learning_rate": 1.214237516869096e-05, "loss": 1.6164, "mean_token_accuracy": 0.6207537829875946, "num_tokens": 13264796.0, "step": 2330 }, { "entropy": 1.500421929359436, "epoch": 1.1842105263157894, "grad_norm": 0.2659217417240143, "learning_rate": 1.210863697705803e-05, "loss": 1.5125, "mean_token_accuracy": 0.6368428528308868, "num_tokens": 13325761.0, "step": 2340 }, { "entropy": 1.511633825302124, "epoch": 1.1892712550607287, "grad_norm": 0.2882932722568512, "learning_rate": 1.2074898785425102e-05, "loss": 1.5265, "mean_token_accuracy": 0.6347347319126129, "num_tokens": 13381225.0, "step": 2350 }, { "entropy": 1.4531208157539368, "epoch": 1.194331983805668, "grad_norm": 0.2595268487930298, "learning_rate": 1.2041160593792175e-05, "loss": 1.4615, "mean_token_accuracy": 0.6477943778038024, "num_tokens": 13443099.0, "step": 2360 }, { "entropy": 1.4483809113502502, "epoch": 1.1993927125506072, "grad_norm": 0.31083598732948303, "learning_rate": 1.2007422402159246e-05, "loss": 1.4345, "mean_token_accuracy": 0.6418466746807099, "num_tokens": 13492349.0, "step": 2370 }, { "entropy": 1.4612587809562683, "epoch": 1.2044534412955465, "grad_norm": 0.3023878037929535, "learning_rate": 1.1973684210526317e-05, "loss": 1.4644, "mean_token_accuracy": 0.6457450866699219, "num_tokens": 13553635.0, "step": 2380 }, { "entropy": 1.503815734386444, "epoch": 1.209514170040486, "grad_norm": 0.2668578326702118, "learning_rate": 1.193994601889339e-05, "loss": 1.5031, "mean_token_accuracy": 0.6367665946483612, "num_tokens": 13610860.0, "step": 2390 }, { "entropy": 1.5158751249313354, "epoch": 1.214574898785425, "grad_norm": 0.22731706500053406, "learning_rate": 1.1906207827260461e-05, "loss": 1.5166, "mean_token_accuracy": 0.6408901572227478, "num_tokens": 13671500.0, "step": 2400 }, { "entropy": 1.4245959162712096, "epoch": 1.2196356275303644, "grad_norm": 0.23208104074001312, "learning_rate": 1.187246963562753e-05, "loss": 1.4395, "mean_token_accuracy": 0.650111585855484, "num_tokens": 13732700.0, "step": 2410 }, { "entropy": 1.5526673555374146, "epoch": 1.2246963562753037, "grad_norm": 0.3204510807991028, "learning_rate": 1.1838731443994602e-05, "loss": 1.5659, "mean_token_accuracy": 0.6272344350814819, "num_tokens": 13792638.0, "step": 2420 }, { "entropy": 1.4588525891304016, "epoch": 1.2297570850202428, "grad_norm": 0.2778925895690918, "learning_rate": 1.1804993252361675e-05, "loss": 1.4745, "mean_token_accuracy": 0.6453329682350158, "num_tokens": 13848701.0, "step": 2430 }, { "entropy": 1.3035455107688905, "epoch": 1.2348178137651822, "grad_norm": 0.26574888825416565, "learning_rate": 1.1771255060728746e-05, "loss": 1.3013, "mean_token_accuracy": 0.680269593000412, "num_tokens": 13903243.0, "step": 2440 }, { "entropy": 1.5677086472511292, "epoch": 1.2398785425101215, "grad_norm": 0.2806277573108673, "learning_rate": 1.1737516869095817e-05, "loss": 1.5653, "mean_token_accuracy": 0.6303077161312103, "num_tokens": 13962439.0, "step": 2450 }, { "entropy": 1.4167581439018249, "epoch": 1.2449392712550607, "grad_norm": 0.2721521258354187, "learning_rate": 1.1703778677462888e-05, "loss": 1.4122, "mean_token_accuracy": 0.6505212604999542, "num_tokens": 14017529.0, "step": 2460 }, { "entropy": 1.5344619274139404, "epoch": 1.25, "grad_norm": 0.2629392445087433, "learning_rate": 1.1670040485829961e-05, "loss": 1.5489, "mean_token_accuracy": 0.6296425819396972, "num_tokens": 14074333.0, "step": 2470 }, { "entropy": 1.4288833916187287, "epoch": 1.2550607287449393, "grad_norm": 0.28045085072517395, "learning_rate": 1.1636302294197033e-05, "loss": 1.4332, "mean_token_accuracy": 0.6531016409397126, "num_tokens": 14131260.0, "step": 2480 }, { "entropy": 1.4341704964637756, "epoch": 1.2601214574898785, "grad_norm": 0.27869343757629395, "learning_rate": 1.1602564102564104e-05, "loss": 1.4245, "mean_token_accuracy": 0.6531503915786743, "num_tokens": 14187704.0, "step": 2490 }, { "entropy": 1.5492194533348083, "epoch": 1.2651821862348178, "grad_norm": 0.3610108494758606, "learning_rate": 1.1568825910931173e-05, "loss": 1.5493, "mean_token_accuracy": 0.6251341938972473, "num_tokens": 14244227.0, "step": 2500 }, { "entropy": 1.4428314566612244, "epoch": 1.2702429149797572, "grad_norm": 0.2730664908885956, "learning_rate": 1.1535087719298246e-05, "loss": 1.4481, "mean_token_accuracy": 0.6439902603626251, "num_tokens": 14301363.0, "step": 2510 }, { "entropy": 1.6202573895454406, "epoch": 1.2753036437246963, "grad_norm": 0.2632329761981964, "learning_rate": 1.1501349527665317e-05, "loss": 1.6394, "mean_token_accuracy": 0.6166090041399002, "num_tokens": 14358360.0, "step": 2520 }, { "entropy": 1.4789348363876342, "epoch": 1.2803643724696356, "grad_norm": 0.31635069847106934, "learning_rate": 1.1467611336032389e-05, "loss": 1.4909, "mean_token_accuracy": 0.6398876368999481, "num_tokens": 14414169.0, "step": 2530 }, { "entropy": 1.5108978629112244, "epoch": 1.285425101214575, "grad_norm": 0.32884782552719116, "learning_rate": 1.1433873144399461e-05, "loss": 1.5177, "mean_token_accuracy": 0.6348686575889587, "num_tokens": 14475715.0, "step": 2540 }, { "entropy": 1.419902467727661, "epoch": 1.290485829959514, "grad_norm": 0.2587096095085144, "learning_rate": 1.1400134952766533e-05, "loss": 1.4162, "mean_token_accuracy": 0.6549311280250549, "num_tokens": 14534625.0, "step": 2550 }, { "entropy": 1.4202989101409913, "epoch": 1.2955465587044535, "grad_norm": 0.3693634271621704, "learning_rate": 1.1366396761133604e-05, "loss": 1.4086, "mean_token_accuracy": 0.6512441515922547, "num_tokens": 14587225.0, "step": 2560 }, { "entropy": 1.646610152721405, "epoch": 1.3006072874493926, "grad_norm": 0.2674924433231354, "learning_rate": 1.1332658569500677e-05, "loss": 1.6561, "mean_token_accuracy": 0.6122437655925751, "num_tokens": 14645474.0, "step": 2570 }, { "entropy": 1.5521462559700012, "epoch": 1.305668016194332, "grad_norm": 0.2970985770225525, "learning_rate": 1.1298920377867748e-05, "loss": 1.5528, "mean_token_accuracy": 0.6343778431415558, "num_tokens": 14702700.0, "step": 2580 }, { "entropy": 1.556568205356598, "epoch": 1.3107287449392713, "grad_norm": 0.2645126283168793, "learning_rate": 1.1265182186234818e-05, "loss": 1.5629, "mean_token_accuracy": 0.6288919091224671, "num_tokens": 14757931.0, "step": 2590 }, { "entropy": 1.4472679018974304, "epoch": 1.3157894736842106, "grad_norm": 0.2335396409034729, "learning_rate": 1.1231443994601889e-05, "loss": 1.4551, "mean_token_accuracy": 0.6467409670352936, "num_tokens": 14814936.0, "step": 2600 }, { "entropy": 1.5397544741630553, "epoch": 1.3208502024291497, "grad_norm": 0.2709454298019409, "learning_rate": 1.1197705802968962e-05, "loss": 1.5446, "mean_token_accuracy": 0.6299596786499023, "num_tokens": 14875733.0, "step": 2610 }, { "entropy": 1.4886646032333375, "epoch": 1.325910931174089, "grad_norm": 0.35333138704299927, "learning_rate": 1.1163967611336033e-05, "loss": 1.4863, "mean_token_accuracy": 0.63787921667099, "num_tokens": 14929765.0, "step": 2620 }, { "entropy": 1.453588593006134, "epoch": 1.3309716599190282, "grad_norm": 0.27809369564056396, "learning_rate": 1.1130229419703104e-05, "loss": 1.4697, "mean_token_accuracy": 0.6435807704925537, "num_tokens": 14990890.0, "step": 2630 }, { "entropy": 1.5616032361984253, "epoch": 1.3360323886639676, "grad_norm": 0.30011820793151855, "learning_rate": 1.1096491228070177e-05, "loss": 1.5712, "mean_token_accuracy": 0.6284253001213074, "num_tokens": 15051025.0, "step": 2640 }, { "entropy": 1.5883211970329285, "epoch": 1.341093117408907, "grad_norm": 0.2934761345386505, "learning_rate": 1.1062753036437248e-05, "loss": 1.5979, "mean_token_accuracy": 0.6245935201644898, "num_tokens": 15108003.0, "step": 2650 }, { "entropy": 1.6321449398994445, "epoch": 1.3461538461538463, "grad_norm": 0.2740587890148163, "learning_rate": 1.102901484480432e-05, "loss": 1.6281, "mean_token_accuracy": 0.6189014375209808, "num_tokens": 15164177.0, "step": 2660 }, { "entropy": 1.5974322438240052, "epoch": 1.3512145748987854, "grad_norm": 0.26599040627479553, "learning_rate": 1.0995276653171392e-05, "loss": 1.5986, "mean_token_accuracy": 0.6264194548130035, "num_tokens": 15219699.0, "step": 2670 }, { "entropy": 1.6627001881599426, "epoch": 1.3562753036437247, "grad_norm": 0.35696741938591003, "learning_rate": 1.0961538461538464e-05, "loss": 1.6903, "mean_token_accuracy": 0.6077935576438904, "num_tokens": 15275379.0, "step": 2680 }, { "entropy": 1.4766412138938905, "epoch": 1.3613360323886639, "grad_norm": 0.32456570863723755, "learning_rate": 1.0927800269905533e-05, "loss": 1.473, "mean_token_accuracy": 0.6439019083976746, "num_tokens": 15334742.0, "step": 2690 }, { "entropy": 1.534727895259857, "epoch": 1.3663967611336032, "grad_norm": 0.30418887734413147, "learning_rate": 1.0894062078272604e-05, "loss": 1.5354, "mean_token_accuracy": 0.6321536242961884, "num_tokens": 15384467.0, "step": 2700 }, { "entropy": 1.4464212298393249, "epoch": 1.3714574898785425, "grad_norm": 0.2574264407157898, "learning_rate": 1.0860323886639677e-05, "loss": 1.4397, "mean_token_accuracy": 0.6525548756122589, "num_tokens": 15446519.0, "step": 2710 }, { "entropy": 1.5958146333694458, "epoch": 1.376518218623482, "grad_norm": 0.28892847895622253, "learning_rate": 1.0826585695006748e-05, "loss": 1.5915, "mean_token_accuracy": 0.6221986651420593, "num_tokens": 15505401.0, "step": 2720 }, { "entropy": 1.6138377904891967, "epoch": 1.381578947368421, "grad_norm": 0.2827686667442322, "learning_rate": 1.079284750337382e-05, "loss": 1.6357, "mean_token_accuracy": 0.622738265991211, "num_tokens": 15563279.0, "step": 2730 }, { "entropy": 1.54440039396286, "epoch": 1.3866396761133604, "grad_norm": 0.2887682318687439, "learning_rate": 1.0759109311740893e-05, "loss": 1.5273, "mean_token_accuracy": 0.6313063859939575, "num_tokens": 15618960.0, "step": 2740 }, { "entropy": 1.4138375759124755, "epoch": 1.3917004048582995, "grad_norm": 0.36498573422431946, "learning_rate": 1.0725371120107964e-05, "loss": 1.4189, "mean_token_accuracy": 0.6591821730136871, "num_tokens": 15670366.0, "step": 2750 }, { "entropy": 1.4741955041885375, "epoch": 1.3967611336032388, "grad_norm": 0.3496224284172058, "learning_rate": 1.0691632928475035e-05, "loss": 1.4764, "mean_token_accuracy": 0.6401580095291137, "num_tokens": 15724201.0, "step": 2760 }, { "entropy": 1.5501426219940186, "epoch": 1.4018218623481782, "grad_norm": 0.26639312505722046, "learning_rate": 1.0657894736842108e-05, "loss": 1.546, "mean_token_accuracy": 0.6274727523326874, "num_tokens": 15783636.0, "step": 2770 }, { "entropy": 1.557990849018097, "epoch": 1.4068825910931175, "grad_norm": 0.34502512216567993, "learning_rate": 1.0624156545209177e-05, "loss": 1.5781, "mean_token_accuracy": 0.6323262035846711, "num_tokens": 15841694.0, "step": 2780 }, { "entropy": 1.455728328227997, "epoch": 1.4119433198380567, "grad_norm": 0.2952381372451782, "learning_rate": 1.0590418353576249e-05, "loss": 1.4547, "mean_token_accuracy": 0.6422502875328064, "num_tokens": 15896343.0, "step": 2790 }, { "entropy": 1.6696131229400635, "epoch": 1.417004048582996, "grad_norm": 0.2534728944301605, "learning_rate": 1.055668016194332e-05, "loss": 1.6816, "mean_token_accuracy": 0.607040387392044, "num_tokens": 15952655.0, "step": 2800 }, { "entropy": 1.3929704070091247, "epoch": 1.4220647773279351, "grad_norm": 0.2545351982116699, "learning_rate": 1.0522941970310391e-05, "loss": 1.3815, "mean_token_accuracy": 0.6631879568099975, "num_tokens": 16009017.0, "step": 2810 }, { "entropy": 1.4616627931594848, "epoch": 1.4271255060728745, "grad_norm": 0.29235726594924927, "learning_rate": 1.0489203778677464e-05, "loss": 1.469, "mean_token_accuracy": 0.6424239039421081, "num_tokens": 16064974.0, "step": 2820 }, { "entropy": 1.5947192907333374, "epoch": 1.4321862348178138, "grad_norm": 0.4684313237667084, "learning_rate": 1.0455465587044535e-05, "loss": 1.6334, "mean_token_accuracy": 0.620320850610733, "num_tokens": 16121438.0, "step": 2830 }, { "entropy": 1.5052786350250245, "epoch": 1.4372469635627532, "grad_norm": 0.2901478707790375, "learning_rate": 1.0421727395411606e-05, "loss": 1.5228, "mean_token_accuracy": 0.6432769238948822, "num_tokens": 16177348.0, "step": 2840 }, { "entropy": 1.5745530486106873, "epoch": 1.4423076923076923, "grad_norm": 0.4461107552051544, "learning_rate": 1.038798920377868e-05, "loss": 1.5797, "mean_token_accuracy": 0.628256207704544, "num_tokens": 16233533.0, "step": 2850 }, { "entropy": 1.6525885105133056, "epoch": 1.4473684210526316, "grad_norm": 0.30729052424430847, "learning_rate": 1.035425101214575e-05, "loss": 1.659, "mean_token_accuracy": 0.6156529784202576, "num_tokens": 16288984.0, "step": 2860 }, { "entropy": 1.4813060998916625, "epoch": 1.4524291497975708, "grad_norm": 0.26118186116218567, "learning_rate": 1.0320512820512822e-05, "loss": 1.4694, "mean_token_accuracy": 0.6394685864448547, "num_tokens": 16347312.0, "step": 2870 }, { "entropy": 1.3725073099136353, "epoch": 1.45748987854251, "grad_norm": 0.24992327392101288, "learning_rate": 1.0286774628879891e-05, "loss": 1.3778, "mean_token_accuracy": 0.6600593090057373, "num_tokens": 16401182.0, "step": 2880 }, { "entropy": 1.5925581932067872, "epoch": 1.4625506072874495, "grad_norm": 0.3013634979724884, "learning_rate": 1.0253036437246964e-05, "loss": 1.5989, "mean_token_accuracy": 0.6274087786674499, "num_tokens": 16463180.0, "step": 2890 }, { "entropy": 1.395955240726471, "epoch": 1.4676113360323888, "grad_norm": 0.2821931540966034, "learning_rate": 1.0219298245614035e-05, "loss": 1.3955, "mean_token_accuracy": 0.6572477340698242, "num_tokens": 16524984.0, "step": 2900 }, { "entropy": 1.493795931339264, "epoch": 1.472672064777328, "grad_norm": 0.27723386883735657, "learning_rate": 1.0185560053981107e-05, "loss": 1.4988, "mean_token_accuracy": 0.6318200826644897, "num_tokens": 16585454.0, "step": 2910 }, { "entropy": 1.608326256275177, "epoch": 1.4777327935222673, "grad_norm": 0.24880221486091614, "learning_rate": 1.015182186234818e-05, "loss": 1.6037, "mean_token_accuracy": 0.6237947404384613, "num_tokens": 16642878.0, "step": 2920 }, { "entropy": 1.4563136458396913, "epoch": 1.4827935222672064, "grad_norm": 0.2714000940322876, "learning_rate": 1.011808367071525e-05, "loss": 1.4609, "mean_token_accuracy": 0.6409155547618866, "num_tokens": 16697425.0, "step": 2930 }, { "entropy": 1.4760780036449432, "epoch": 1.4878542510121457, "grad_norm": 0.3031882047653198, "learning_rate": 1.0084345479082322e-05, "loss": 1.4802, "mean_token_accuracy": 0.6450917005538941, "num_tokens": 16760118.0, "step": 2940 }, { "entropy": 1.493908405303955, "epoch": 1.492914979757085, "grad_norm": 0.2621052861213684, "learning_rate": 1.0050607287449395e-05, "loss": 1.4918, "mean_token_accuracy": 0.6401423692703248, "num_tokens": 16813749.0, "step": 2950 }, { "entropy": 1.6856267690658568, "epoch": 1.4979757085020242, "grad_norm": 0.26623499393463135, "learning_rate": 1.0016869095816466e-05, "loss": 1.6777, "mean_token_accuracy": 0.6135709464550019, "num_tokens": 16874289.0, "step": 2960 }, { "entropy": 1.4696342468261718, "epoch": 1.5030364372469636, "grad_norm": 0.2687808871269226, "learning_rate": 9.983130904183537e-06, "loss": 1.4727, "mean_token_accuracy": 0.6447311758995056, "num_tokens": 16930145.0, "step": 2970 }, { "entropy": 1.4744965791702271, "epoch": 1.5080971659919027, "grad_norm": 0.23845624923706055, "learning_rate": 9.949392712550608e-06, "loss": 1.4721, "mean_token_accuracy": 0.645156466960907, "num_tokens": 16984053.0, "step": 2980 }, { "entropy": 1.4356729149818421, "epoch": 1.513157894736842, "grad_norm": 0.3086620271205902, "learning_rate": 9.91565452091768e-06, "loss": 1.4271, "mean_token_accuracy": 0.6454346477985382, "num_tokens": 17040474.0, "step": 2990 }, { "entropy": 1.4318643450736999, "epoch": 1.5182186234817814, "grad_norm": 0.31296011805534363, "learning_rate": 9.881916329284751e-06, "loss": 1.4284, "mean_token_accuracy": 0.6570405125617981, "num_tokens": 17091443.0, "step": 3000 }, { "entropy": 1.486535382270813, "epoch": 1.5232793522267207, "grad_norm": 0.24280501902103424, "learning_rate": 9.848178137651822e-06, "loss": 1.4782, "mean_token_accuracy": 0.6421392917633056, "num_tokens": 17145789.0, "step": 3010 }, { "entropy": 1.3536667227745056, "epoch": 1.52834008097166, "grad_norm": 0.3393391966819763, "learning_rate": 9.814439946018895e-06, "loss": 1.3665, "mean_token_accuracy": 0.659710270166397, "num_tokens": 17200045.0, "step": 3020 }, { "entropy": 1.477955400943756, "epoch": 1.5334008097165992, "grad_norm": 0.2695980668067932, "learning_rate": 9.780701754385966e-06, "loss": 1.4773, "mean_token_accuracy": 0.6442347228527069, "num_tokens": 17253382.0, "step": 3030 }, { "entropy": 1.4808340609073638, "epoch": 1.5384615384615383, "grad_norm": 0.32629549503326416, "learning_rate": 9.746963562753037e-06, "loss": 1.487, "mean_token_accuracy": 0.6431676924228669, "num_tokens": 17306138.0, "step": 3040 }, { "entropy": 1.4529295325279237, "epoch": 1.5435222672064777, "grad_norm": 0.2536776661872864, "learning_rate": 9.713225371120109e-06, "loss": 1.4591, "mean_token_accuracy": 0.6488350391387939, "num_tokens": 17368255.0, "step": 3050 }, { "entropy": 1.438970947265625, "epoch": 1.548582995951417, "grad_norm": 0.26340344548225403, "learning_rate": 9.67948717948718e-06, "loss": 1.4513, "mean_token_accuracy": 0.6449286341667175, "num_tokens": 17426575.0, "step": 3060 }, { "entropy": 1.709315264225006, "epoch": 1.5536437246963564, "grad_norm": 0.31817519664764404, "learning_rate": 9.645748987854253e-06, "loss": 1.7215, "mean_token_accuracy": 0.6018387496471405, "num_tokens": 17488131.0, "step": 3070 }, { "entropy": 1.5443035364151, "epoch": 1.5587044534412957, "grad_norm": 0.3266107141971588, "learning_rate": 9.612010796221324e-06, "loss": 1.5511, "mean_token_accuracy": 0.63644158244133, "num_tokens": 17546517.0, "step": 3080 }, { "entropy": 1.6439727783203124, "epoch": 1.5637651821862348, "grad_norm": 0.25957760214805603, "learning_rate": 9.578272604588395e-06, "loss": 1.6584, "mean_token_accuracy": 0.6159623801708222, "num_tokens": 17605712.0, "step": 3090 }, { "entropy": 1.5427301168441772, "epoch": 1.568825910931174, "grad_norm": 0.27618587017059326, "learning_rate": 9.544534412955466e-06, "loss": 1.5451, "mean_token_accuracy": 0.6260794997215271, "num_tokens": 17659721.0, "step": 3100 }, { "entropy": 1.383056926727295, "epoch": 1.5738866396761133, "grad_norm": 0.3027380406856537, "learning_rate": 9.510796221322538e-06, "loss": 1.3743, "mean_token_accuracy": 0.6610878467559814, "num_tokens": 17714100.0, "step": 3110 }, { "entropy": 1.503647792339325, "epoch": 1.5789473684210527, "grad_norm": 0.29517048597335815, "learning_rate": 9.47705802968961e-06, "loss": 1.5053, "mean_token_accuracy": 0.6365588068962097, "num_tokens": 17771308.0, "step": 3120 }, { "entropy": 1.548321044445038, "epoch": 1.584008097165992, "grad_norm": 0.255573570728302, "learning_rate": 9.44331983805668e-06, "loss": 1.5523, "mean_token_accuracy": 0.6278903543949127, "num_tokens": 17828285.0, "step": 3130 }, { "entropy": 1.5210648417472838, "epoch": 1.5890688259109311, "grad_norm": 0.3004836142063141, "learning_rate": 9.409581646423753e-06, "loss": 1.5331, "mean_token_accuracy": 0.6306875169277191, "num_tokens": 17888745.0, "step": 3140 }, { "entropy": 1.4695003390312196, "epoch": 1.5941295546558705, "grad_norm": 0.2813291549682617, "learning_rate": 9.375843454790824e-06, "loss": 1.4673, "mean_token_accuracy": 0.6395570158958435, "num_tokens": 17949494.0, "step": 3150 }, { "entropy": 1.4948333382606507, "epoch": 1.5991902834008096, "grad_norm": 0.3244977593421936, "learning_rate": 9.342105263157895e-06, "loss": 1.5044, "mean_token_accuracy": 0.6397220313549041, "num_tokens": 18006803.0, "step": 3160 }, { "entropy": 1.4767319202423095, "epoch": 1.604251012145749, "grad_norm": 0.2612328827381134, "learning_rate": 9.308367071524967e-06, "loss": 1.4795, "mean_token_accuracy": 0.6446912109851837, "num_tokens": 18062396.0, "step": 3170 }, { "entropy": 1.5266436815261841, "epoch": 1.6093117408906883, "grad_norm": 0.3239694833755493, "learning_rate": 9.274628879892038e-06, "loss": 1.5418, "mean_token_accuracy": 0.6299160838127136, "num_tokens": 18110883.0, "step": 3180 }, { "entropy": 1.4701735734939576, "epoch": 1.6143724696356276, "grad_norm": 0.2672860324382782, "learning_rate": 9.240890688259109e-06, "loss": 1.4503, "mean_token_accuracy": 0.6537446200847625, "num_tokens": 18159767.0, "step": 3190 }, { "entropy": 1.4009694814682008, "epoch": 1.6194331983805668, "grad_norm": 0.29456961154937744, "learning_rate": 9.207152496626182e-06, "loss": 1.4045, "mean_token_accuracy": 0.6546292185783387, "num_tokens": 18217624.0, "step": 3200 }, { "entropy": 1.6173853397369384, "epoch": 1.624493927125506, "grad_norm": 0.30044859647750854, "learning_rate": 9.173414304993253e-06, "loss": 1.6125, "mean_token_accuracy": 0.6170080423355102, "num_tokens": 18277255.0, "step": 3210 }, { "entropy": 1.5926665306091308, "epoch": 1.6295546558704452, "grad_norm": 0.29986920952796936, "learning_rate": 9.139676113360324e-06, "loss": 1.6003, "mean_token_accuracy": 0.6278518795967102, "num_tokens": 18335766.0, "step": 3220 }, { "entropy": 1.4131199598312378, "epoch": 1.6346153846153846, "grad_norm": 0.33528971672058105, "learning_rate": 9.105937921727396e-06, "loss": 1.4244, "mean_token_accuracy": 0.6553650915622711, "num_tokens": 18392231.0, "step": 3230 }, { "entropy": 1.5079811573028565, "epoch": 1.639676113360324, "grad_norm": 0.32541990280151367, "learning_rate": 9.072199730094467e-06, "loss": 1.5182, "mean_token_accuracy": 0.6279927968978882, "num_tokens": 18447207.0, "step": 3240 }, { "entropy": 1.5234640002250672, "epoch": 1.6447368421052633, "grad_norm": 0.2562153935432434, "learning_rate": 9.03846153846154e-06, "loss": 1.5149, "mean_token_accuracy": 0.6328702330589294, "num_tokens": 18505601.0, "step": 3250 }, { "entropy": 1.6198287844657897, "epoch": 1.6497975708502024, "grad_norm": 0.3361916244029999, "learning_rate": 9.004723346828611e-06, "loss": 1.6255, "mean_token_accuracy": 0.61873180270195, "num_tokens": 18558902.0, "step": 3260 }, { "entropy": 1.4600189566612243, "epoch": 1.6548582995951417, "grad_norm": 0.304756760597229, "learning_rate": 8.970985155195682e-06, "loss": 1.4471, "mean_token_accuracy": 0.6477857530117035, "num_tokens": 18619635.0, "step": 3270 }, { "entropy": 1.5648074269294738, "epoch": 1.6599190283400809, "grad_norm": 0.30415093898773193, "learning_rate": 8.937246963562753e-06, "loss": 1.5767, "mean_token_accuracy": 0.6242169559001922, "num_tokens": 18674088.0, "step": 3280 }, { "entropy": 1.5605645179748535, "epoch": 1.6649797570850202, "grad_norm": 0.26909834146499634, "learning_rate": 8.903508771929825e-06, "loss": 1.5605, "mean_token_accuracy": 0.635642808675766, "num_tokens": 18734453.0, "step": 3290 }, { "entropy": 1.5742518544197082, "epoch": 1.6700404858299596, "grad_norm": 0.2826893925666809, "learning_rate": 8.869770580296898e-06, "loss": 1.5643, "mean_token_accuracy": 0.6317550718784333, "num_tokens": 18793346.0, "step": 3300 }, { "entropy": 1.6354934215545653, "epoch": 1.675101214574899, "grad_norm": 0.2833310067653656, "learning_rate": 8.836032388663969e-06, "loss": 1.6417, "mean_token_accuracy": 0.6152911841869354, "num_tokens": 18845698.0, "step": 3310 }, { "entropy": 1.5844687461853026, "epoch": 1.680161943319838, "grad_norm": 0.3369496762752533, "learning_rate": 8.80229419703104e-06, "loss": 1.59, "mean_token_accuracy": 0.6237683236598969, "num_tokens": 18903498.0, "step": 3320 }, { "entropy": 1.570583975315094, "epoch": 1.6852226720647774, "grad_norm": 0.36443012952804565, "learning_rate": 8.768556005398111e-06, "loss": 1.5757, "mean_token_accuracy": 0.6224404633045196, "num_tokens": 18957652.0, "step": 3330 }, { "entropy": 1.6000779747962952, "epoch": 1.6902834008097165, "grad_norm": 0.32085222005844116, "learning_rate": 8.734817813765182e-06, "loss": 1.6067, "mean_token_accuracy": 0.6194514989852905, "num_tokens": 19018526.0, "step": 3340 }, { "entropy": 1.4028130412101745, "epoch": 1.6953441295546559, "grad_norm": 0.2869940996170044, "learning_rate": 8.701079622132255e-06, "loss": 1.41, "mean_token_accuracy": 0.6613210141658783, "num_tokens": 19073801.0, "step": 3350 }, { "entropy": 1.5723829984664917, "epoch": 1.7004048582995952, "grad_norm": 0.3251384496688843, "learning_rate": 8.667341430499327e-06, "loss": 1.5647, "mean_token_accuracy": 0.6301065504550933, "num_tokens": 19128923.0, "step": 3360 }, { "entropy": 1.5013223052024842, "epoch": 1.7054655870445345, "grad_norm": 0.30307453870773315, "learning_rate": 8.633603238866398e-06, "loss": 1.4962, "mean_token_accuracy": 0.6359362661838531, "num_tokens": 19186932.0, "step": 3370 }, { "entropy": 1.602057731151581, "epoch": 1.7105263157894737, "grad_norm": 0.369747132062912, "learning_rate": 8.599865047233469e-06, "loss": 1.5956, "mean_token_accuracy": 0.6288884073495865, "num_tokens": 19245687.0, "step": 3380 }, { "entropy": 1.5921403288841247, "epoch": 1.7155870445344128, "grad_norm": 0.2498423159122467, "learning_rate": 8.56612685560054e-06, "loss": 1.5971, "mean_token_accuracy": 0.621882963180542, "num_tokens": 19307247.0, "step": 3390 }, { "entropy": 1.5030475974082946, "epoch": 1.7206477732793521, "grad_norm": 0.3407726585865021, "learning_rate": 8.532388663967613e-06, "loss": 1.5109, "mean_token_accuracy": 0.6346028625965119, "num_tokens": 19367320.0, "step": 3400 }, { "entropy": 1.4825459122657776, "epoch": 1.7257085020242915, "grad_norm": 0.27978742122650146, "learning_rate": 8.498650472334684e-06, "loss": 1.4831, "mean_token_accuracy": 0.6394042372703552, "num_tokens": 19429919.0, "step": 3410 }, { "entropy": 1.4931324481964112, "epoch": 1.7307692307692308, "grad_norm": 0.288116455078125, "learning_rate": 8.464912280701755e-06, "loss": 1.4957, "mean_token_accuracy": 0.6380782008171082, "num_tokens": 19485577.0, "step": 3420 }, { "entropy": 1.4949661374092102, "epoch": 1.7358299595141702, "grad_norm": 0.31869447231292725, "learning_rate": 8.431174089068827e-06, "loss": 1.4926, "mean_token_accuracy": 0.6403613984584808, "num_tokens": 19541077.0, "step": 3430 }, { "entropy": 1.596668827533722, "epoch": 1.7408906882591093, "grad_norm": 0.28910359740257263, "learning_rate": 8.397435897435898e-06, "loss": 1.6038, "mean_token_accuracy": 0.6189565002918244, "num_tokens": 19605457.0, "step": 3440 }, { "entropy": 1.4728464007377624, "epoch": 1.7459514170040484, "grad_norm": 0.27498626708984375, "learning_rate": 8.36369770580297e-06, "loss": 1.485, "mean_token_accuracy": 0.6360372960567474, "num_tokens": 19667915.0, "step": 3450 }, { "entropy": 1.5345482110977173, "epoch": 1.7510121457489878, "grad_norm": 0.2618876099586487, "learning_rate": 8.32995951417004e-06, "loss": 1.5381, "mean_token_accuracy": 0.6326977252960205, "num_tokens": 19725030.0, "step": 3460 }, { "entropy": 1.393745517730713, "epoch": 1.7560728744939271, "grad_norm": 0.28456103801727295, "learning_rate": 8.296221322537113e-06, "loss": 1.3836, "mean_token_accuracy": 0.6631475329399109, "num_tokens": 19784982.0, "step": 3470 }, { "entropy": 1.3884447813034058, "epoch": 1.7611336032388665, "grad_norm": 0.27543261647224426, "learning_rate": 8.262483130904184e-06, "loss": 1.3847, "mean_token_accuracy": 0.6586002767086029, "num_tokens": 19848002.0, "step": 3480 }, { "entropy": 1.4454799056053163, "epoch": 1.7661943319838058, "grad_norm": 0.36814162135124207, "learning_rate": 8.228744939271256e-06, "loss": 1.455, "mean_token_accuracy": 0.6476415753364563, "num_tokens": 19906592.0, "step": 3490 }, { "entropy": 1.3266122221946717, "epoch": 1.771255060728745, "grad_norm": 0.2580831050872803, "learning_rate": 8.195006747638327e-06, "loss": 1.329, "mean_token_accuracy": 0.6699170589447021, "num_tokens": 19963138.0, "step": 3500 }, { "entropy": 1.670899212360382, "epoch": 1.776315789473684, "grad_norm": 0.29895538091659546, "learning_rate": 8.161268556005398e-06, "loss": 1.6755, "mean_token_accuracy": 0.613396269083023, "num_tokens": 20021345.0, "step": 3510 }, { "entropy": 1.354777181148529, "epoch": 1.7813765182186234, "grad_norm": 0.34177860617637634, "learning_rate": 8.12753036437247e-06, "loss": 1.3456, "mean_token_accuracy": 0.6686203420162201, "num_tokens": 20072875.0, "step": 3520 }, { "entropy": 1.5505508065223694, "epoch": 1.7864372469635628, "grad_norm": 0.2592535614967346, "learning_rate": 8.093792172739542e-06, "loss": 1.5535, "mean_token_accuracy": 0.6263529658317566, "num_tokens": 20132207.0, "step": 3530 }, { "entropy": 1.485759150981903, "epoch": 1.791497975708502, "grad_norm": 0.2742493450641632, "learning_rate": 8.060053981106613e-06, "loss": 1.4964, "mean_token_accuracy": 0.635893827676773, "num_tokens": 20195010.0, "step": 3540 }, { "entropy": 1.5584338903427124, "epoch": 1.7965587044534415, "grad_norm": 0.2946804463863373, "learning_rate": 8.026315789473685e-06, "loss": 1.5553, "mean_token_accuracy": 0.6236848413944245, "num_tokens": 20257540.0, "step": 3550 }, { "entropy": 1.5169085144996644, "epoch": 1.8016194331983806, "grad_norm": 0.26114436984062195, "learning_rate": 7.992577597840756e-06, "loss": 1.5138, "mean_token_accuracy": 0.6321025729179383, "num_tokens": 20318045.0, "step": 3560 }, { "entropy": 1.337432289123535, "epoch": 1.8066801619433197, "grad_norm": 0.29184892773628235, "learning_rate": 7.958839406207827e-06, "loss": 1.3471, "mean_token_accuracy": 0.6645301103591919, "num_tokens": 20373587.0, "step": 3570 }, { "entropy": 1.5995257258415223, "epoch": 1.811740890688259, "grad_norm": 0.3016499876976013, "learning_rate": 7.9251012145749e-06, "loss": 1.5929, "mean_token_accuracy": 0.6236974120140075, "num_tokens": 20431451.0, "step": 3580 }, { "entropy": 1.623330581188202, "epoch": 1.8168016194331984, "grad_norm": 0.35448580980300903, "learning_rate": 7.891363022941971e-06, "loss": 1.6129, "mean_token_accuracy": 0.6200532436370849, "num_tokens": 20487984.0, "step": 3590 }, { "entropy": 1.5125982403755187, "epoch": 1.8218623481781377, "grad_norm": 0.32799309492111206, "learning_rate": 7.857624831309042e-06, "loss": 1.5025, "mean_token_accuracy": 0.6384036839008331, "num_tokens": 20541725.0, "step": 3600 }, { "entropy": 1.53478661775589, "epoch": 1.8269230769230769, "grad_norm": 0.32730069756507874, "learning_rate": 7.823886639676114e-06, "loss": 1.5294, "mean_token_accuracy": 0.6311649143695831, "num_tokens": 20600145.0, "step": 3610 }, { "entropy": 1.5410036087036132, "epoch": 1.8319838056680162, "grad_norm": 0.3669460117816925, "learning_rate": 7.790148448043185e-06, "loss": 1.5537, "mean_token_accuracy": 0.6282461225986481, "num_tokens": 20655732.0, "step": 3620 }, { "entropy": 1.453836238384247, "epoch": 1.8370445344129553, "grad_norm": 0.31468528509140015, "learning_rate": 7.756410256410258e-06, "loss": 1.4568, "mean_token_accuracy": 0.6447117567062378, "num_tokens": 20712525.0, "step": 3630 }, { "entropy": 1.475819957256317, "epoch": 1.8421052631578947, "grad_norm": 0.29064053297042847, "learning_rate": 7.722672064777329e-06, "loss": 1.4821, "mean_token_accuracy": 0.6439218640327453, "num_tokens": 20768387.0, "step": 3640 }, { "entropy": 1.4041451275348664, "epoch": 1.847165991902834, "grad_norm": 0.2812243700027466, "learning_rate": 7.6889338731444e-06, "loss": 1.4044, "mean_token_accuracy": 0.6594688057899475, "num_tokens": 20826082.0, "step": 3650 }, { "entropy": 1.6357195615768432, "epoch": 1.8522267206477734, "grad_norm": 0.2777828276157379, "learning_rate": 7.655195681511471e-06, "loss": 1.6412, "mean_token_accuracy": 0.6139614999294281, "num_tokens": 20882894.0, "step": 3660 }, { "entropy": 1.535701298713684, "epoch": 1.8572874493927125, "grad_norm": 0.3234771490097046, "learning_rate": 7.6214574898785435e-06, "loss": 1.5333, "mean_token_accuracy": 0.6341882109642029, "num_tokens": 20938122.0, "step": 3670 }, { "entropy": 1.5881774067878722, "epoch": 1.8623481781376519, "grad_norm": 0.3148040175437927, "learning_rate": 7.587719298245615e-06, "loss": 1.6014, "mean_token_accuracy": 0.6275585472583771, "num_tokens": 20995824.0, "step": 3680 }, { "entropy": 1.3315507769584656, "epoch": 1.867408906882591, "grad_norm": 0.327178031206131, "learning_rate": 7.553981106612687e-06, "loss": 1.3346, "mean_token_accuracy": 0.668057644367218, "num_tokens": 21050607.0, "step": 3690 }, { "entropy": 1.5097809910774231, "epoch": 1.8724696356275303, "grad_norm": 0.29023247957229614, "learning_rate": 7.520242914979757e-06, "loss": 1.5045, "mean_token_accuracy": 0.6351129233837127, "num_tokens": 21106066.0, "step": 3700 }, { "entropy": 1.4823681235313415, "epoch": 1.8775303643724697, "grad_norm": 0.3215828537940979, "learning_rate": 7.486504723346829e-06, "loss": 1.4818, "mean_token_accuracy": 0.6453963398933411, "num_tokens": 21164981.0, "step": 3710 }, { "entropy": 1.4289534091949463, "epoch": 1.882591093117409, "grad_norm": 0.3170277178287506, "learning_rate": 7.452766531713901e-06, "loss": 1.446, "mean_token_accuracy": 0.6434959769248962, "num_tokens": 21223777.0, "step": 3720 }, { "entropy": 1.5202300190925597, "epoch": 1.8876518218623481, "grad_norm": 0.2913142740726471, "learning_rate": 7.4190283400809725e-06, "loss": 1.5349, "mean_token_accuracy": 0.6304753959178925, "num_tokens": 21279646.0, "step": 3730 }, { "entropy": 1.6774636268615724, "epoch": 1.8927125506072875, "grad_norm": 0.33726078271865845, "learning_rate": 7.385290148448044e-06, "loss": 1.6783, "mean_token_accuracy": 0.6076300263404846, "num_tokens": 21335265.0, "step": 3740 }, { "entropy": 1.5423774600028992, "epoch": 1.8977732793522266, "grad_norm": 0.27264466881752014, "learning_rate": 7.351551956815115e-06, "loss": 1.5533, "mean_token_accuracy": 0.6308148026466369, "num_tokens": 21396070.0, "step": 3750 }, { "entropy": 1.4624953866004944, "epoch": 1.902834008097166, "grad_norm": 0.35332223773002625, "learning_rate": 7.317813765182187e-06, "loss": 1.4655, "mean_token_accuracy": 0.641634488105774, "num_tokens": 21452996.0, "step": 3760 }, { "entropy": 1.4817042350769043, "epoch": 1.9078947368421053, "grad_norm": 0.3333725035190582, "learning_rate": 7.284075573549258e-06, "loss": 1.4903, "mean_token_accuracy": 0.6411226511001586, "num_tokens": 21508906.0, "step": 3770 }, { "entropy": 1.5518399238586427, "epoch": 1.9129554655870447, "grad_norm": 0.2960481643676758, "learning_rate": 7.25033738191633e-06, "loss": 1.5445, "mean_token_accuracy": 0.6266183733940125, "num_tokens": 21568967.0, "step": 3780 }, { "entropy": 1.5748514771461486, "epoch": 1.9180161943319838, "grad_norm": 0.31355923414230347, "learning_rate": 7.216599190283401e-06, "loss": 1.5716, "mean_token_accuracy": 0.6256311893463135, "num_tokens": 21631403.0, "step": 3790 }, { "entropy": 1.450837540626526, "epoch": 1.9230769230769231, "grad_norm": 0.27845069766044617, "learning_rate": 7.182860998650473e-06, "loss": 1.4611, "mean_token_accuracy": 0.650393956899643, "num_tokens": 21688727.0, "step": 3800 }, { "entropy": 1.611345076560974, "epoch": 1.9281376518218623, "grad_norm": 0.2685949206352234, "learning_rate": 7.149122807017545e-06, "loss": 1.6126, "mean_token_accuracy": 0.6262206137180328, "num_tokens": 21742484.0, "step": 3810 }, { "entropy": 1.3427100419998168, "epoch": 1.9331983805668016, "grad_norm": 0.41044095158576965, "learning_rate": 7.115384615384616e-06, "loss": 1.3418, "mean_token_accuracy": 0.663384473323822, "num_tokens": 21799804.0, "step": 3820 }, { "entropy": 1.4495494306087493, "epoch": 1.938259109311741, "grad_norm": 0.5138364434242249, "learning_rate": 7.081646423751688e-06, "loss": 1.4481, "mean_token_accuracy": 0.6450947999954224, "num_tokens": 21858581.0, "step": 3830 }, { "entropy": 1.3911212921142577, "epoch": 1.9433198380566803, "grad_norm": 0.29537278413772583, "learning_rate": 7.047908232118758e-06, "loss": 1.3992, "mean_token_accuracy": 0.6531029522418976, "num_tokens": 21915585.0, "step": 3840 }, { "entropy": 1.4535645723342896, "epoch": 1.9483805668016194, "grad_norm": 0.25756731629371643, "learning_rate": 7.0141700404858304e-06, "loss": 1.4401, "mean_token_accuracy": 0.6463619887828826, "num_tokens": 21976079.0, "step": 3850 }, { "entropy": 1.4952040553092956, "epoch": 1.9534412955465585, "grad_norm": 0.3046974539756775, "learning_rate": 6.9804318488529025e-06, "loss": 1.5097, "mean_token_accuracy": 0.6341541647911072, "num_tokens": 22035025.0, "step": 3860 }, { "entropy": 1.5177082777023316, "epoch": 1.958502024291498, "grad_norm": 0.3251610994338989, "learning_rate": 6.946693657219974e-06, "loss": 1.5163, "mean_token_accuracy": 0.6359520852565765, "num_tokens": 22092788.0, "step": 3870 }, { "entropy": 1.4667699456214904, "epoch": 1.9635627530364372, "grad_norm": 0.3152090311050415, "learning_rate": 6.912955465587045e-06, "loss": 1.4715, "mean_token_accuracy": 0.6418612182140351, "num_tokens": 22153745.0, "step": 3880 }, { "entropy": 1.6101324200630187, "epoch": 1.9686234817813766, "grad_norm": 0.340584933757782, "learning_rate": 6.879217273954116e-06, "loss": 1.6212, "mean_token_accuracy": 0.6180540084838867, "num_tokens": 22211817.0, "step": 3890 }, { "entropy": 1.459115242958069, "epoch": 1.973684210526316, "grad_norm": 0.2879182696342468, "learning_rate": 6.845479082321188e-06, "loss": 1.4419, "mean_token_accuracy": 0.6466407418251038, "num_tokens": 22265817.0, "step": 3900 }, { "entropy": 1.4101441740989684, "epoch": 1.978744939271255, "grad_norm": 0.3250649571418762, "learning_rate": 6.81174089068826e-06, "loss": 1.4063, "mean_token_accuracy": 0.6551910638809204, "num_tokens": 22324629.0, "step": 3910 }, { "entropy": 1.6089503526687623, "epoch": 1.9838056680161942, "grad_norm": 0.3786233961582184, "learning_rate": 6.7780026990553315e-06, "loss": 1.6147, "mean_token_accuracy": 0.6272029399871826, "num_tokens": 22381691.0, "step": 3920 }, { "entropy": 1.3815577149391174, "epoch": 1.9888663967611335, "grad_norm": 0.304582417011261, "learning_rate": 6.744264507422402e-06, "loss": 1.3759, "mean_token_accuracy": 0.657072639465332, "num_tokens": 22432987.0, "step": 3930 }, { "entropy": 1.6114310383796693, "epoch": 1.9939271255060729, "grad_norm": 0.3556569218635559, "learning_rate": 6.710526315789474e-06, "loss": 1.6089, "mean_token_accuracy": 0.6203605115413666, "num_tokens": 22491567.0, "step": 3940 }, { "entropy": 1.5013386726379394, "epoch": 1.9989878542510122, "grad_norm": 0.3433378040790558, "learning_rate": 6.676788124156546e-06, "loss": 1.497, "mean_token_accuracy": 0.6365504443645478, "num_tokens": 22548351.0, "step": 3950 }, { "entropy": 1.4863505601882934, "epoch": 2.0040485829959516, "grad_norm": 0.348243772983551, "learning_rate": 6.643049932523617e-06, "loss": 1.4864, "mean_token_accuracy": 0.6374901950359344, "num_tokens": 22596557.0, "step": 3960 }, { "entropy": 1.5316878080368042, "epoch": 2.0091093117408905, "grad_norm": 0.32034119963645935, "learning_rate": 6.609311740890689e-06, "loss": 1.538, "mean_token_accuracy": 0.6406886577606201, "num_tokens": 22656578.0, "step": 3970 }, { "entropy": 1.422401201725006, "epoch": 2.01417004048583, "grad_norm": 0.2935118079185486, "learning_rate": 6.57557354925776e-06, "loss": 1.4232, "mean_token_accuracy": 0.6517488479614257, "num_tokens": 22715169.0, "step": 3980 }, { "entropy": 1.4487539887428285, "epoch": 2.019230769230769, "grad_norm": 0.311564177274704, "learning_rate": 6.541835357624832e-06, "loss": 1.4388, "mean_token_accuracy": 0.6472173929214478, "num_tokens": 22772089.0, "step": 3990 }, { "entropy": 1.5003145456314086, "epoch": 2.0242914979757085, "grad_norm": 0.2912486493587494, "learning_rate": 6.508097165991904e-06, "loss": 1.5015, "mean_token_accuracy": 0.6321758210659028, "num_tokens": 22834505.0, "step": 4000 }, { "entropy": 1.4098521590232849, "epoch": 2.029352226720648, "grad_norm": 0.29250964522361755, "learning_rate": 6.474358974358975e-06, "loss": 1.4107, "mean_token_accuracy": 0.6528907954692841, "num_tokens": 22889105.0, "step": 4010 }, { "entropy": 1.4532611846923829, "epoch": 2.034412955465587, "grad_norm": 0.34667733311653137, "learning_rate": 6.440620782726047e-06, "loss": 1.4581, "mean_token_accuracy": 0.6446337521076202, "num_tokens": 22942406.0, "step": 4020 }, { "entropy": 1.5700780391693114, "epoch": 2.039473684210526, "grad_norm": 0.3028770685195923, "learning_rate": 6.406882591093117e-06, "loss": 1.5643, "mean_token_accuracy": 0.6249816060066223, "num_tokens": 22996028.0, "step": 4030 }, { "entropy": 1.6611987948417664, "epoch": 2.0445344129554655, "grad_norm": 0.30681440234184265, "learning_rate": 6.3731443994601894e-06, "loss": 1.6827, "mean_token_accuracy": 0.6147861301898956, "num_tokens": 23051645.0, "step": 4040 }, { "entropy": 1.4732018947601317, "epoch": 2.049595141700405, "grad_norm": 0.26491233706474304, "learning_rate": 6.3394062078272615e-06, "loss": 1.466, "mean_token_accuracy": 0.6404920816421509, "num_tokens": 23105066.0, "step": 4050 }, { "entropy": 1.5172441840171813, "epoch": 2.054655870445344, "grad_norm": 0.3094307780265808, "learning_rate": 6.305668016194333e-06, "loss": 1.5004, "mean_token_accuracy": 0.6372400879859924, "num_tokens": 23157352.0, "step": 4060 }, { "entropy": 1.422630524635315, "epoch": 2.0597165991902835, "grad_norm": 0.29695579409599304, "learning_rate": 6.271929824561404e-06, "loss": 1.428, "mean_token_accuracy": 0.6465956628322601, "num_tokens": 23212465.0, "step": 4070 }, { "entropy": 1.4499358654022216, "epoch": 2.064777327935223, "grad_norm": 0.3413025438785553, "learning_rate": 6.238191632928475e-06, "loss": 1.4555, "mean_token_accuracy": 0.6432287812232971, "num_tokens": 23268400.0, "step": 4080 }, { "entropy": 1.433293628692627, "epoch": 2.0698380566801617, "grad_norm": 0.27788856625556946, "learning_rate": 6.204453441295547e-06, "loss": 1.4404, "mean_token_accuracy": 0.6448906004428864, "num_tokens": 23330858.0, "step": 4090 }, { "entropy": 1.527322268486023, "epoch": 2.074898785425101, "grad_norm": 0.28372228145599365, "learning_rate": 6.170715249662618e-06, "loss": 1.5369, "mean_token_accuracy": 0.6296894669532775, "num_tokens": 23388049.0, "step": 4100 }, { "entropy": 1.654162836074829, "epoch": 2.0799595141700404, "grad_norm": 0.3283277451992035, "learning_rate": 6.1369770580296905e-06, "loss": 1.6652, "mean_token_accuracy": 0.6081342697143555, "num_tokens": 23450327.0, "step": 4110 }, { "entropy": 1.5552624464035034, "epoch": 2.08502024291498, "grad_norm": 0.3101661205291748, "learning_rate": 6.103238866396761e-06, "loss": 1.5571, "mean_token_accuracy": 0.6288932502269745, "num_tokens": 23507582.0, "step": 4120 }, { "entropy": 1.5187462210655212, "epoch": 2.090080971659919, "grad_norm": 0.26190704107284546, "learning_rate": 6.069500674763833e-06, "loss": 1.5231, "mean_token_accuracy": 0.6347708106040955, "num_tokens": 23570085.0, "step": 4130 }, { "entropy": 1.4180486440658568, "epoch": 2.0951417004048585, "grad_norm": 0.24935229122638702, "learning_rate": 6.035762483130905e-06, "loss": 1.4134, "mean_token_accuracy": 0.6535919070243835, "num_tokens": 23629729.0, "step": 4140 }, { "entropy": 1.5712830781936646, "epoch": 2.1002024291497974, "grad_norm": 0.28485989570617676, "learning_rate": 6.002024291497976e-06, "loss": 1.5661, "mean_token_accuracy": 0.6283676266670227, "num_tokens": 23686822.0, "step": 4150 }, { "entropy": 1.487233829498291, "epoch": 2.1052631578947367, "grad_norm": 0.3802538812160492, "learning_rate": 5.968286099865048e-06, "loss": 1.5071, "mean_token_accuracy": 0.636066097021103, "num_tokens": 23743196.0, "step": 4160 }, { "entropy": 1.485396420955658, "epoch": 2.110323886639676, "grad_norm": 0.37386566400527954, "learning_rate": 5.934547908232119e-06, "loss": 1.4772, "mean_token_accuracy": 0.6422532796859741, "num_tokens": 23798229.0, "step": 4170 }, { "entropy": 1.535237228870392, "epoch": 2.1153846153846154, "grad_norm": 0.26898157596588135, "learning_rate": 5.900809716599191e-06, "loss": 1.5333, "mean_token_accuracy": 0.6358494937419892, "num_tokens": 23852408.0, "step": 4180 }, { "entropy": 1.5727092146873474, "epoch": 2.1204453441295548, "grad_norm": 0.3571448028087616, "learning_rate": 5.867071524966263e-06, "loss": 1.5678, "mean_token_accuracy": 0.6239661037921905, "num_tokens": 23902266.0, "step": 4190 }, { "entropy": 1.5237385392189027, "epoch": 2.125506072874494, "grad_norm": 0.28321143984794617, "learning_rate": 5.833333333333334e-06, "loss": 1.5365, "mean_token_accuracy": 0.6352564930915833, "num_tokens": 23959815.0, "step": 4200 }, { "entropy": 1.5299026012420653, "epoch": 2.130566801619433, "grad_norm": 0.3400108218193054, "learning_rate": 5.799595141700405e-06, "loss": 1.519, "mean_token_accuracy": 0.6339640021324158, "num_tokens": 24012133.0, "step": 4210 }, { "entropy": 1.657011294364929, "epoch": 2.1356275303643724, "grad_norm": 0.3595241606235504, "learning_rate": 5.765856950067476e-06, "loss": 1.668, "mean_token_accuracy": 0.6125568807125091, "num_tokens": 24063677.0, "step": 4220 }, { "entropy": 1.5003764629364014, "epoch": 2.1406882591093117, "grad_norm": 0.32139450311660767, "learning_rate": 5.7321187584345484e-06, "loss": 1.4876, "mean_token_accuracy": 0.6435904741287232, "num_tokens": 24120380.0, "step": 4230 }, { "entropy": 1.6574489951133728, "epoch": 2.145748987854251, "grad_norm": 0.30065852403640747, "learning_rate": 5.6983805668016205e-06, "loss": 1.6782, "mean_token_accuracy": 0.6093615233898163, "num_tokens": 24181603.0, "step": 4240 }, { "entropy": 1.4604612827301025, "epoch": 2.1508097165991904, "grad_norm": 0.28791046142578125, "learning_rate": 5.664642375168692e-06, "loss": 1.4376, "mean_token_accuracy": 0.6457455456256866, "num_tokens": 24239096.0, "step": 4250 }, { "entropy": 1.4780054807662963, "epoch": 2.1558704453441297, "grad_norm": 0.2827425003051758, "learning_rate": 5.630904183535763e-06, "loss": 1.4805, "mean_token_accuracy": 0.6447736561298371, "num_tokens": 24295397.0, "step": 4260 }, { "entropy": 1.4344088315963746, "epoch": 2.1609311740890687, "grad_norm": 0.3887704908847809, "learning_rate": 5.597165991902834e-06, "loss": 1.4266, "mean_token_accuracy": 0.6494575679302216, "num_tokens": 24345669.0, "step": 4270 }, { "entropy": 1.5128828644752503, "epoch": 2.165991902834008, "grad_norm": 0.34420716762542725, "learning_rate": 5.563427800269906e-06, "loss": 1.5186, "mean_token_accuracy": 0.6373259782791137, "num_tokens": 24403704.0, "step": 4280 }, { "entropy": 1.3984260201454162, "epoch": 2.1710526315789473, "grad_norm": 0.33548930287361145, "learning_rate": 5.5296896086369774e-06, "loss": 1.381, "mean_token_accuracy": 0.6609737515449524, "num_tokens": 24457935.0, "step": 4290 }, { "entropy": 1.4911738991737367, "epoch": 2.1761133603238867, "grad_norm": 0.2852116823196411, "learning_rate": 5.4959514170040495e-06, "loss": 1.4799, "mean_token_accuracy": 0.6415831744670868, "num_tokens": 24511977.0, "step": 4300 }, { "entropy": 1.4702451825141907, "epoch": 2.181174089068826, "grad_norm": 0.28457802534103394, "learning_rate": 5.46221322537112e-06, "loss": 1.4768, "mean_token_accuracy": 0.6372047007083893, "num_tokens": 24569954.0, "step": 4310 }, { "entropy": 1.4613691449165345, "epoch": 2.1862348178137654, "grad_norm": 0.31222304701805115, "learning_rate": 5.428475033738192e-06, "loss": 1.4692, "mean_token_accuracy": 0.6442633271217346, "num_tokens": 24625268.0, "step": 4320 }, { "entropy": 1.466537070274353, "epoch": 2.1912955465587043, "grad_norm": 0.2962714433670044, "learning_rate": 5.394736842105264e-06, "loss": 1.4664, "mean_token_accuracy": 0.6492825329303742, "num_tokens": 24688289.0, "step": 4330 }, { "entropy": 1.5810052037239075, "epoch": 2.1963562753036436, "grad_norm": 0.30552032589912415, "learning_rate": 5.360998650472335e-06, "loss": 1.5811, "mean_token_accuracy": 0.6259881913661957, "num_tokens": 24746697.0, "step": 4340 }, { "entropy": 1.4260846734046937, "epoch": 2.201417004048583, "grad_norm": 0.2985803484916687, "learning_rate": 5.327260458839406e-06, "loss": 1.4137, "mean_token_accuracy": 0.6532795548439025, "num_tokens": 24810772.0, "step": 4350 }, { "entropy": 1.5106618881225586, "epoch": 2.2064777327935223, "grad_norm": 0.33830076456069946, "learning_rate": 5.293522267206478e-06, "loss": 1.522, "mean_token_accuracy": 0.6390328884124756, "num_tokens": 24870122.0, "step": 4360 }, { "entropy": 1.527205801010132, "epoch": 2.2115384615384617, "grad_norm": 0.444986492395401, "learning_rate": 5.25978407557355e-06, "loss": 1.5237, "mean_token_accuracy": 0.6333723068237305, "num_tokens": 24929676.0, "step": 4370 }, { "entropy": 1.571653914451599, "epoch": 2.216599190283401, "grad_norm": 0.27972137928009033, "learning_rate": 5.226045883940622e-06, "loss": 1.5782, "mean_token_accuracy": 0.62519211769104, "num_tokens": 24984648.0, "step": 4380 }, { "entropy": 1.579957866668701, "epoch": 2.22165991902834, "grad_norm": 0.35601162910461426, "learning_rate": 5.192307692307693e-06, "loss": 1.5916, "mean_token_accuracy": 0.6265009582042694, "num_tokens": 25039282.0, "step": 4390 }, { "entropy": 1.590737247467041, "epoch": 2.2267206477732793, "grad_norm": 0.3328033685684204, "learning_rate": 5.158569500674764e-06, "loss": 1.5942, "mean_token_accuracy": 0.6266931772232056, "num_tokens": 25084698.0, "step": 4400 }, { "entropy": 1.4461635231971741, "epoch": 2.2317813765182186, "grad_norm": 0.3073853850364685, "learning_rate": 5.124831309041835e-06, "loss": 1.4532, "mean_token_accuracy": 0.6430659115314483, "num_tokens": 25145917.0, "step": 4410 }, { "entropy": 1.6023080706596375, "epoch": 2.236842105263158, "grad_norm": 0.38999930024147034, "learning_rate": 5.0910931174089075e-06, "loss": 1.6065, "mean_token_accuracy": 0.6303758680820465, "num_tokens": 25200499.0, "step": 4420 }, { "entropy": 1.403742289543152, "epoch": 2.2419028340080973, "grad_norm": 0.3020265996456146, "learning_rate": 5.057354925775979e-06, "loss": 1.3936, "mean_token_accuracy": 0.6550646901130677, "num_tokens": 25253626.0, "step": 4430 }, { "entropy": 1.5970208644866943, "epoch": 2.246963562753036, "grad_norm": 0.34803110361099243, "learning_rate": 5.023616734143051e-06, "loss": 1.6128, "mean_token_accuracy": 0.6253244817256928, "num_tokens": 25315718.0, "step": 4440 }, { "entropy": 1.4895619392395019, "epoch": 2.2520242914979756, "grad_norm": 0.295636385679245, "learning_rate": 4.989878542510122e-06, "loss": 1.4976, "mean_token_accuracy": 0.6415492594242096, "num_tokens": 25378490.0, "step": 4450 }, { "entropy": 1.500291097164154, "epoch": 2.257085020242915, "grad_norm": 0.29003915190696716, "learning_rate": 4.956140350877193e-06, "loss": 1.4741, "mean_token_accuracy": 0.6455156445503235, "num_tokens": 25435125.0, "step": 4460 }, { "entropy": 1.5137645125389099, "epoch": 2.2621457489878543, "grad_norm": 0.345222145318985, "learning_rate": 4.922402159244265e-06, "loss": 1.5106, "mean_token_accuracy": 0.6373549580574036, "num_tokens": 25492838.0, "step": 4470 }, { "entropy": 1.4126244068145752, "epoch": 2.2672064777327936, "grad_norm": 0.43444496393203735, "learning_rate": 4.8886639676113364e-06, "loss": 1.402, "mean_token_accuracy": 0.6513433575630188, "num_tokens": 25552113.0, "step": 4480 }, { "entropy": 1.5574785828590394, "epoch": 2.272267206477733, "grad_norm": 0.28663352131843567, "learning_rate": 4.854925775978408e-06, "loss": 1.5719, "mean_token_accuracy": 0.6330413460731507, "num_tokens": 25604938.0, "step": 4490 }, { "entropy": 1.5517175793647766, "epoch": 2.2773279352226723, "grad_norm": 0.3585723042488098, "learning_rate": 4.82118758434548e-06, "loss": 1.5492, "mean_token_accuracy": 0.6311025798320771, "num_tokens": 25663827.0, "step": 4500 }, { "entropy": 1.7192303657531738, "epoch": 2.282388663967611, "grad_norm": 0.3171631395816803, "learning_rate": 4.787449392712551e-06, "loss": 1.7084, "mean_token_accuracy": 0.5979065060615539, "num_tokens": 25718627.0, "step": 4510 }, { "entropy": 1.4433665156364441, "epoch": 2.2874493927125505, "grad_norm": 0.31859585642814636, "learning_rate": 4.753711201079623e-06, "loss": 1.431, "mean_token_accuracy": 0.6453494548797607, "num_tokens": 25779859.0, "step": 4520 }, { "entropy": 1.493071937561035, "epoch": 2.29251012145749, "grad_norm": 0.3323538303375244, "learning_rate": 4.719973009446694e-06, "loss": 1.5016, "mean_token_accuracy": 0.6344216048717499, "num_tokens": 25835705.0, "step": 4530 }, { "entropy": 1.5348315596580506, "epoch": 2.2975708502024292, "grad_norm": 0.29418283700942993, "learning_rate": 4.686234817813765e-06, "loss": 1.5299, "mean_token_accuracy": 0.6337445557117463, "num_tokens": 25896484.0, "step": 4540 }, { "entropy": 1.4027626633644104, "epoch": 2.3026315789473686, "grad_norm": 0.3454079031944275, "learning_rate": 4.652496626180837e-06, "loss": 1.3954, "mean_token_accuracy": 0.6570545434951782, "num_tokens": 25946989.0, "step": 4550 }, { "entropy": 1.4810479283332825, "epoch": 2.3076923076923075, "grad_norm": 0.30555200576782227, "learning_rate": 4.618758434547909e-06, "loss": 1.4935, "mean_token_accuracy": 0.6418456912040711, "num_tokens": 26005212.0, "step": 4560 }, { "entropy": 1.5501378655433655, "epoch": 2.312753036437247, "grad_norm": 0.2936731278896332, "learning_rate": 4.585020242914981e-06, "loss": 1.5493, "mean_token_accuracy": 0.6311659216880798, "num_tokens": 26061206.0, "step": 4570 }, { "entropy": 1.5965832471847534, "epoch": 2.317813765182186, "grad_norm": 0.3174577057361603, "learning_rate": 4.551282051282052e-06, "loss": 1.5986, "mean_token_accuracy": 0.6272948026657105, "num_tokens": 26117314.0, "step": 4580 }, { "entropy": 1.497817873954773, "epoch": 2.3228744939271255, "grad_norm": 0.3074813485145569, "learning_rate": 4.517543859649123e-06, "loss": 1.5177, "mean_token_accuracy": 0.639699399471283, "num_tokens": 26177625.0, "step": 4590 }, { "entropy": 1.398792815208435, "epoch": 2.327935222672065, "grad_norm": 0.3233450949192047, "learning_rate": 4.483805668016194e-06, "loss": 1.3972, "mean_token_accuracy": 0.6578422546386719, "num_tokens": 26229108.0, "step": 4600 }, { "entropy": 1.3582614064216614, "epoch": 2.332995951417004, "grad_norm": 0.3194423019886017, "learning_rate": 4.4500674763832665e-06, "loss": 1.3473, "mean_token_accuracy": 0.6627348363399506, "num_tokens": 26281682.0, "step": 4610 }, { "entropy": 1.4663148880004884, "epoch": 2.3380566801619436, "grad_norm": 0.317622572183609, "learning_rate": 4.416329284750338e-06, "loss": 1.4749, "mean_token_accuracy": 0.6402939558029175, "num_tokens": 26343090.0, "step": 4620 }, { "entropy": 1.4386041164398193, "epoch": 2.3431174089068825, "grad_norm": 0.37403181195259094, "learning_rate": 4.382591093117409e-06, "loss": 1.4399, "mean_token_accuracy": 0.6470987558364868, "num_tokens": 26398372.0, "step": 4630 }, { "entropy": 1.591576099395752, "epoch": 2.348178137651822, "grad_norm": 0.27833235263824463, "learning_rate": 4.348852901484481e-06, "loss": 1.6015, "mean_token_accuracy": 0.6296046376228333, "num_tokens": 26458865.0, "step": 4640 }, { "entropy": 1.4324705123901367, "epoch": 2.353238866396761, "grad_norm": 0.3234311044216156, "learning_rate": 4.315114709851552e-06, "loss": 1.4182, "mean_token_accuracy": 0.6525469720363617, "num_tokens": 26514094.0, "step": 4650 }, { "entropy": 1.5859549045562744, "epoch": 2.3582995951417005, "grad_norm": 0.31048783659935, "learning_rate": 4.281376518218624e-06, "loss": 1.6055, "mean_token_accuracy": 0.6206431567668915, "num_tokens": 26573568.0, "step": 4660 }, { "entropy": 1.4157851219177247, "epoch": 2.36336032388664, "grad_norm": 0.27004745602607727, "learning_rate": 4.2476383265856954e-06, "loss": 1.4191, "mean_token_accuracy": 0.6526973366737365, "num_tokens": 26628281.0, "step": 4670 }, { "entropy": 1.4219112515449523, "epoch": 2.3684210526315788, "grad_norm": 0.3162846863269806, "learning_rate": 4.213900134952767e-06, "loss": 1.4237, "mean_token_accuracy": 0.6481447339057922, "num_tokens": 26683329.0, "step": 4680 }, { "entropy": 1.474673593044281, "epoch": 2.373481781376518, "grad_norm": 0.2558523714542389, "learning_rate": 4.180161943319838e-06, "loss": 1.4789, "mean_token_accuracy": 0.644309651851654, "num_tokens": 26741726.0, "step": 4690 }, { "entropy": 1.545168387889862, "epoch": 2.3785425101214575, "grad_norm": 0.3100733160972595, "learning_rate": 4.14642375168691e-06, "loss": 1.5585, "mean_token_accuracy": 0.6251280426979064, "num_tokens": 26801987.0, "step": 4700 }, { "entropy": 1.4952475309371949, "epoch": 2.383603238866397, "grad_norm": 0.2840896546840668, "learning_rate": 4.112685560053982e-06, "loss": 1.4928, "mean_token_accuracy": 0.6407946467399597, "num_tokens": 26862449.0, "step": 4710 }, { "entropy": 1.3853577494621276, "epoch": 2.388663967611336, "grad_norm": 0.315100759267807, "learning_rate": 4.078947368421053e-06, "loss": 1.3891, "mean_token_accuracy": 0.6517343044281005, "num_tokens": 26923528.0, "step": 4720 }, { "entropy": 1.5327417492866515, "epoch": 2.3937246963562755, "grad_norm": 0.3072359561920166, "learning_rate": 4.0452091767881244e-06, "loss": 1.5438, "mean_token_accuracy": 0.638210940361023, "num_tokens": 26976129.0, "step": 4730 }, { "entropy": 1.6007991313934327, "epoch": 2.3987854251012144, "grad_norm": 0.28095099329948425, "learning_rate": 4.011470985155196e-06, "loss": 1.6025, "mean_token_accuracy": 0.6204523742198944, "num_tokens": 27030769.0, "step": 4740 }, { "entropy": 1.5538129091262818, "epoch": 2.4038461538461537, "grad_norm": 0.3622888922691345, "learning_rate": 3.977732793522268e-06, "loss": 1.5497, "mean_token_accuracy": 0.6246297895908356, "num_tokens": 27085119.0, "step": 4750 }, { "entropy": 1.4716430306434631, "epoch": 2.408906882591093, "grad_norm": 0.2776808738708496, "learning_rate": 3.943994601889339e-06, "loss": 1.4715, "mean_token_accuracy": 0.6430730044841766, "num_tokens": 27146308.0, "step": 4760 }, { "entropy": 1.4779613852500915, "epoch": 2.4139676113360324, "grad_norm": 0.30735519528388977, "learning_rate": 3.910256410256411e-06, "loss": 1.481, "mean_token_accuracy": 0.6421349704265594, "num_tokens": 27204236.0, "step": 4770 }, { "entropy": 1.6263086080551148, "epoch": 2.419028340080972, "grad_norm": 0.3509717881679535, "learning_rate": 3.876518218623482e-06, "loss": 1.6306, "mean_token_accuracy": 0.6189518332481384, "num_tokens": 27253795.0, "step": 4780 }, { "entropy": 1.5051485419273376, "epoch": 2.4240890688259107, "grad_norm": 0.36502060294151306, "learning_rate": 3.842780026990553e-06, "loss": 1.5045, "mean_token_accuracy": 0.6390359103679657, "num_tokens": 27311173.0, "step": 4790 }, { "entropy": 1.5122657060623168, "epoch": 2.42914979757085, "grad_norm": 0.35788798332214355, "learning_rate": 3.8090418353576255e-06, "loss": 1.4811, "mean_token_accuracy": 0.6367557644844055, "num_tokens": 27366839.0, "step": 4800 }, { "entropy": 1.5352485537528993, "epoch": 2.4342105263157894, "grad_norm": 0.2877010107040405, "learning_rate": 3.7753036437246967e-06, "loss": 1.5402, "mean_token_accuracy": 0.6323030471801758, "num_tokens": 27423988.0, "step": 4810 }, { "entropy": 1.329223895072937, "epoch": 2.4392712550607287, "grad_norm": 0.27826353907585144, "learning_rate": 3.7415654520917683e-06, "loss": 1.3322, "mean_token_accuracy": 0.6661195576190948, "num_tokens": 27482284.0, "step": 4820 }, { "entropy": 1.460306990146637, "epoch": 2.444331983805668, "grad_norm": 0.2664757966995239, "learning_rate": 3.7078272604588395e-06, "loss": 1.4645, "mean_token_accuracy": 0.6439946055412292, "num_tokens": 27542235.0, "step": 4830 }, { "entropy": 1.420573878288269, "epoch": 2.4493927125506074, "grad_norm": 0.3187576234340668, "learning_rate": 3.674089068825911e-06, "loss": 1.4271, "mean_token_accuracy": 0.6494402289390564, "num_tokens": 27606521.0, "step": 4840 }, { "entropy": 1.5605995893478393, "epoch": 2.4544534412955468, "grad_norm": 0.3589235842227936, "learning_rate": 3.640350877192983e-06, "loss": 1.5464, "mean_token_accuracy": 0.636710187792778, "num_tokens": 27655589.0, "step": 4850 }, { "entropy": 1.5500049710273742, "epoch": 2.4595141700404857, "grad_norm": 0.42818954586982727, "learning_rate": 3.606612685560054e-06, "loss": 1.5422, "mean_token_accuracy": 0.6335929155349731, "num_tokens": 27707321.0, "step": 4860 }, { "entropy": 1.5264463543891906, "epoch": 2.464574898785425, "grad_norm": 0.30446869134902954, "learning_rate": 3.572874493927126e-06, "loss": 1.5354, "mean_token_accuracy": 0.6377040445804596, "num_tokens": 27766680.0, "step": 4870 }, { "entropy": 1.5602357268333436, "epoch": 2.4696356275303644, "grad_norm": 0.31952470541000366, "learning_rate": 3.5391363022941973e-06, "loss": 1.563, "mean_token_accuracy": 0.6299617826938629, "num_tokens": 27825128.0, "step": 4880 }, { "entropy": 1.4979919075965882, "epoch": 2.4746963562753037, "grad_norm": 0.3032040596008301, "learning_rate": 3.505398110661269e-06, "loss": 1.5194, "mean_token_accuracy": 0.6328540325164795, "num_tokens": 27886765.0, "step": 4890 }, { "entropy": 1.571874487400055, "epoch": 2.479757085020243, "grad_norm": 0.3398491144180298, "learning_rate": 3.47165991902834e-06, "loss": 1.568, "mean_token_accuracy": 0.6220065712928772, "num_tokens": 27942690.0, "step": 4900 }, { "entropy": 1.4888028264045716, "epoch": 2.484817813765182, "grad_norm": 0.2785778343677521, "learning_rate": 3.437921727395412e-06, "loss": 1.482, "mean_token_accuracy": 0.6426316261291504, "num_tokens": 28001045.0, "step": 4910 }, { "entropy": 1.4304234504699707, "epoch": 2.4898785425101213, "grad_norm": 0.36416903138160706, "learning_rate": 3.4041835357624834e-06, "loss": 1.4412, "mean_token_accuracy": 0.6480507373809814, "num_tokens": 28060050.0, "step": 4920 }, { "entropy": 1.4549246668815612, "epoch": 2.4949392712550607, "grad_norm": 0.3209365904331207, "learning_rate": 3.3704453441295546e-06, "loss": 1.4444, "mean_token_accuracy": 0.6485071182250977, "num_tokens": 28119937.0, "step": 4930 }, { "entropy": 1.6035995841026307, "epoch": 2.5, "grad_norm": 0.3263776898384094, "learning_rate": 3.3367071524966267e-06, "loss": 1.596, "mean_token_accuracy": 0.6212283372879028, "num_tokens": 28176098.0, "step": 4940 }, { "entropy": 1.3420706629753112, "epoch": 2.5050607287449393, "grad_norm": 0.29616400599479675, "learning_rate": 3.302968960863698e-06, "loss": 1.3361, "mean_token_accuracy": 0.6657415688037872, "num_tokens": 28232898.0, "step": 4950 }, { "entropy": 1.5731253027915955, "epoch": 2.5101214574898787, "grad_norm": 0.2652728259563446, "learning_rate": 3.2692307692307696e-06, "loss": 1.569, "mean_token_accuracy": 0.6270411610603333, "num_tokens": 28289187.0, "step": 4960 }, { "entropy": 1.4383020401000977, "epoch": 2.515182186234818, "grad_norm": 0.3313502371311188, "learning_rate": 3.2354925775978408e-06, "loss": 1.4301, "mean_token_accuracy": 0.6567471146583557, "num_tokens": 28345870.0, "step": 4970 }, { "entropy": 1.4449619054794312, "epoch": 2.520242914979757, "grad_norm": 0.299467533826828, "learning_rate": 3.2017543859649124e-06, "loss": 1.4596, "mean_token_accuracy": 0.6480660021305085, "num_tokens": 28401335.0, "step": 4980 }, { "entropy": 1.407576084136963, "epoch": 2.5253036437246963, "grad_norm": 0.33703747391700745, "learning_rate": 3.168016194331984e-06, "loss": 1.4026, "mean_token_accuracy": 0.6588316440582276, "num_tokens": 28451027.0, "step": 4990 }, { "entropy": 1.6358988881111145, "epoch": 2.5303643724696356, "grad_norm": 0.3531615138053894, "learning_rate": 3.1342780026990553e-06, "loss": 1.6387, "mean_token_accuracy": 0.6192252457141876, "num_tokens": 28508717.0, "step": 5000 }, { "entropy": 1.530623769760132, "epoch": 2.535425101214575, "grad_norm": 0.2998420000076294, "learning_rate": 3.1005398110661273e-06, "loss": 1.5209, "mean_token_accuracy": 0.6354014992713928, "num_tokens": 28566256.0, "step": 5010 }, { "entropy": 1.5933383703231812, "epoch": 2.5404858299595143, "grad_norm": 0.3689696192741394, "learning_rate": 3.0668016194331985e-06, "loss": 1.5881, "mean_token_accuracy": 0.6318571925163269, "num_tokens": 28618249.0, "step": 5020 }, { "entropy": 1.4564833164215087, "epoch": 2.5455465587044532, "grad_norm": 0.30524808168411255, "learning_rate": 3.03306342780027e-06, "loss": 1.4375, "mean_token_accuracy": 0.6440569698810578, "num_tokens": 28674342.0, "step": 5030 }, { "entropy": 1.510752511024475, "epoch": 2.5506072874493926, "grad_norm": 0.3323598802089691, "learning_rate": 2.999325236167342e-06, "loss": 1.5278, "mean_token_accuracy": 0.6354637145996094, "num_tokens": 28731622.0, "step": 5040 }, { "entropy": 1.4739052295684814, "epoch": 2.555668016194332, "grad_norm": 0.31869643926620483, "learning_rate": 2.965587044534413e-06, "loss": 1.4649, "mean_token_accuracy": 0.6425871312618255, "num_tokens": 28791133.0, "step": 5050 }, { "entropy": 1.5100542187690735, "epoch": 2.5607287449392713, "grad_norm": 0.3328213095664978, "learning_rate": 2.931848852901485e-06, "loss": 1.5045, "mean_token_accuracy": 0.6392671585083007, "num_tokens": 28847713.0, "step": 5060 }, { "entropy": 1.4085248589515686, "epoch": 2.5657894736842106, "grad_norm": 0.281522661447525, "learning_rate": 2.8981106612685563e-06, "loss": 1.3982, "mean_token_accuracy": 0.6513190269470215, "num_tokens": 28910189.0, "step": 5070 }, { "entropy": 1.397442674636841, "epoch": 2.57085020242915, "grad_norm": 0.3210408091545105, "learning_rate": 2.864372469635628e-06, "loss": 1.3977, "mean_token_accuracy": 0.6574838936328888, "num_tokens": 28966241.0, "step": 5080 }, { "entropy": 1.5165488362312316, "epoch": 2.5759109311740893, "grad_norm": 0.31288620829582214, "learning_rate": 2.830634278002699e-06, "loss": 1.5124, "mean_token_accuracy": 0.6387628674507141, "num_tokens": 29026210.0, "step": 5090 }, { "entropy": 1.5974119186401368, "epoch": 2.580971659919028, "grad_norm": 0.3497001826763153, "learning_rate": 2.796896086369771e-06, "loss": 1.61, "mean_token_accuracy": 0.6236252367496491, "num_tokens": 29083556.0, "step": 5100 }, { "entropy": 1.5403811931610107, "epoch": 2.5860323886639676, "grad_norm": 0.31958791613578796, "learning_rate": 2.7631578947368424e-06, "loss": 1.5418, "mean_token_accuracy": 0.634338253736496, "num_tokens": 29142090.0, "step": 5110 }, { "entropy": 1.4701064825057983, "epoch": 2.591093117408907, "grad_norm": 0.28594285249710083, "learning_rate": 2.7294197031039137e-06, "loss": 1.4693, "mean_token_accuracy": 0.6509437322616577, "num_tokens": 29198039.0, "step": 5120 }, { "entropy": 1.508654534816742, "epoch": 2.5961538461538463, "grad_norm": 0.28295132517814636, "learning_rate": 2.6956815114709857e-06, "loss": 1.5107, "mean_token_accuracy": 0.6393173456192016, "num_tokens": 29258240.0, "step": 5130 }, { "entropy": 1.573255705833435, "epoch": 2.601214574898785, "grad_norm": 0.2459454983472824, "learning_rate": 2.661943319838057e-06, "loss": 1.5903, "mean_token_accuracy": 0.6283860564231872, "num_tokens": 29318879.0, "step": 5140 }, { "entropy": 1.5287572503089906, "epoch": 2.6062753036437245, "grad_norm": 0.31771403551101685, "learning_rate": 2.6282051282051286e-06, "loss": 1.5452, "mean_token_accuracy": 0.6344579041004181, "num_tokens": 29379919.0, "step": 5150 }, { "entropy": 1.3615296483039856, "epoch": 2.611336032388664, "grad_norm": 0.28625616431236267, "learning_rate": 2.5944669365721998e-06, "loss": 1.349, "mean_token_accuracy": 0.6637236177921295, "num_tokens": 29438959.0, "step": 5160 }, { "entropy": 1.4767539501190186, "epoch": 2.616396761133603, "grad_norm": 0.2911388874053955, "learning_rate": 2.5607287449392714e-06, "loss": 1.4775, "mean_token_accuracy": 0.6405583918094635, "num_tokens": 29495248.0, "step": 5170 }, { "entropy": 1.4118461966514588, "epoch": 2.6214574898785425, "grad_norm": 0.3035772442817688, "learning_rate": 2.526990553306343e-06, "loss": 1.4266, "mean_token_accuracy": 0.6568454921245575, "num_tokens": 29549374.0, "step": 5180 }, { "entropy": 1.3858314156532288, "epoch": 2.626518218623482, "grad_norm": 0.28831735253334045, "learning_rate": 2.4932523616734143e-06, "loss": 1.3659, "mean_token_accuracy": 0.6626292169094086, "num_tokens": 29608335.0, "step": 5190 }, { "entropy": 1.5293712258338927, "epoch": 2.6315789473684212, "grad_norm": 0.33819642663002014, "learning_rate": 2.459514170040486e-06, "loss": 1.5299, "mean_token_accuracy": 0.629097181558609, "num_tokens": 29666401.0, "step": 5200 }, { "entropy": 1.5522411942481995, "epoch": 2.6366396761133606, "grad_norm": 0.37447431683540344, "learning_rate": 2.4257759784075576e-06, "loss": 1.5546, "mean_token_accuracy": 0.6252642631530761, "num_tokens": 29722977.0, "step": 5210 }, { "entropy": 1.5046650171279907, "epoch": 2.6417004048582995, "grad_norm": 0.32877567410469055, "learning_rate": 2.392037786774629e-06, "loss": 1.4941, "mean_token_accuracy": 0.6403312921524048, "num_tokens": 29777693.0, "step": 5220 }, { "entropy": 1.4904412388801576, "epoch": 2.646761133603239, "grad_norm": 0.30846232175827026, "learning_rate": 2.358299595141701e-06, "loss": 1.4874, "mean_token_accuracy": 0.6401443660259247, "num_tokens": 29841451.0, "step": 5230 }, { "entropy": 1.4474842250347137, "epoch": 2.651821862348178, "grad_norm": 0.3371650278568268, "learning_rate": 2.324561403508772e-06, "loss": 1.4514, "mean_token_accuracy": 0.6537328362464905, "num_tokens": 29900142.0, "step": 5240 }, { "entropy": 1.5490441560745238, "epoch": 2.6568825910931175, "grad_norm": 0.28833135962486267, "learning_rate": 2.2908232118758437e-06, "loss": 1.5525, "mean_token_accuracy": 0.6344904005527496, "num_tokens": 29965665.0, "step": 5250 }, { "entropy": 1.4371688961982727, "epoch": 2.6619433198380564, "grad_norm": 0.27346664667129517, "learning_rate": 2.257085020242915e-06, "loss": 1.4386, "mean_token_accuracy": 0.6554741203784943, "num_tokens": 30020063.0, "step": 5260 }, { "entropy": 1.57616069316864, "epoch": 2.667004048582996, "grad_norm": 0.31261205673217773, "learning_rate": 2.2233468286099865e-06, "loss": 1.5878, "mean_token_accuracy": 0.6287827432155609, "num_tokens": 30079648.0, "step": 5270 }, { "entropy": 1.6309450030326844, "epoch": 2.672064777327935, "grad_norm": 0.36513420939445496, "learning_rate": 2.189608636977058e-06, "loss": 1.6362, "mean_token_accuracy": 0.6139590203762054, "num_tokens": 30139557.0, "step": 5280 }, { "entropy": 1.6020007967948913, "epoch": 2.6771255060728745, "grad_norm": 0.3361331522464752, "learning_rate": 2.15587044534413e-06, "loss": 1.5899, "mean_token_accuracy": 0.623996788263321, "num_tokens": 30194644.0, "step": 5290 }, { "entropy": 1.4187337517738343, "epoch": 2.682186234817814, "grad_norm": 0.3711530864238739, "learning_rate": 2.1221322537112015e-06, "loss": 1.4225, "mean_token_accuracy": 0.6517966687679291, "num_tokens": 30249182.0, "step": 5300 }, { "entropy": 1.4419126749038695, "epoch": 2.687246963562753, "grad_norm": 0.34213292598724365, "learning_rate": 2.0883940620782727e-06, "loss": 1.4502, "mean_token_accuracy": 0.6504493892192841, "num_tokens": 30307151.0, "step": 5310 }, { "entropy": 1.593650794029236, "epoch": 2.6923076923076925, "grad_norm": 0.2626771032810211, "learning_rate": 2.0546558704453443e-06, "loss": 1.5977, "mean_token_accuracy": 0.6253896594047547, "num_tokens": 30363799.0, "step": 5320 }, { "entropy": 1.505863094329834, "epoch": 2.6973684210526314, "grad_norm": 0.31610244512557983, "learning_rate": 2.020917678812416e-06, "loss": 1.507, "mean_token_accuracy": 0.6344715654850006, "num_tokens": 30420544.0, "step": 5330 }, { "entropy": 1.5584728479385377, "epoch": 2.7024291497975708, "grad_norm": 0.3088075518608093, "learning_rate": 1.987179487179487e-06, "loss": 1.5504, "mean_token_accuracy": 0.6318127393722535, "num_tokens": 30479027.0, "step": 5340 }, { "entropy": 1.464753222465515, "epoch": 2.70748987854251, "grad_norm": 0.4019823372364044, "learning_rate": 1.953441295546559e-06, "loss": 1.4567, "mean_token_accuracy": 0.6482177615165711, "num_tokens": 30534454.0, "step": 5350 }, { "entropy": 1.5660398364067079, "epoch": 2.7125506072874495, "grad_norm": 0.2922350764274597, "learning_rate": 1.9197031039136304e-06, "loss": 1.5742, "mean_token_accuracy": 0.6296724855899811, "num_tokens": 30591311.0, "step": 5360 }, { "entropy": 1.513328456878662, "epoch": 2.717611336032389, "grad_norm": 0.34194323420524597, "learning_rate": 1.8859649122807019e-06, "loss": 1.5109, "mean_token_accuracy": 0.6368795096874237, "num_tokens": 30648991.0, "step": 5370 }, { "entropy": 1.517847204208374, "epoch": 2.7226720647773277, "grad_norm": 0.35915765166282654, "learning_rate": 1.8522267206477735e-06, "loss": 1.5111, "mean_token_accuracy": 0.6370461285114288, "num_tokens": 30702765.0, "step": 5380 }, { "entropy": 1.4219279885292053, "epoch": 2.727732793522267, "grad_norm": 0.31105926632881165, "learning_rate": 1.818488529014845e-06, "loss": 1.416, "mean_token_accuracy": 0.6535437107086182, "num_tokens": 30759049.0, "step": 5390 }, { "entropy": 1.4244774222373962, "epoch": 2.7327935222672064, "grad_norm": 0.3058363199234009, "learning_rate": 1.7847503373819164e-06, "loss": 1.4116, "mean_token_accuracy": 0.6499510526657104, "num_tokens": 30815038.0, "step": 5400 }, { "entropy": 1.6432706594467164, "epoch": 2.7378542510121457, "grad_norm": 0.33452996611595154, "learning_rate": 1.7510121457489878e-06, "loss": 1.6396, "mean_token_accuracy": 0.6203866958618164, "num_tokens": 30871076.0, "step": 5410 }, { "entropy": 1.6394325613975524, "epoch": 2.742914979757085, "grad_norm": 0.283194363117218, "learning_rate": 1.7172739541160596e-06, "loss": 1.6447, "mean_token_accuracy": 0.6192583978176117, "num_tokens": 30931499.0, "step": 5420 }, { "entropy": 1.613875186443329, "epoch": 2.7479757085020244, "grad_norm": 0.3175935745239258, "learning_rate": 1.683535762483131e-06, "loss": 1.616, "mean_token_accuracy": 0.6225574970245361, "num_tokens": 30993640.0, "step": 5430 }, { "entropy": 1.6437927842140199, "epoch": 2.753036437246964, "grad_norm": 0.2761462926864624, "learning_rate": 1.6497975708502027e-06, "loss": 1.6461, "mean_token_accuracy": 0.6168906092643738, "num_tokens": 31046563.0, "step": 5440 }, { "entropy": 1.3887561798095702, "epoch": 2.7580971659919027, "grad_norm": 0.3212042450904846, "learning_rate": 1.6160593792172741e-06, "loss": 1.3872, "mean_token_accuracy": 0.66387038230896, "num_tokens": 31100699.0, "step": 5450 }, { "entropy": 1.5592396020889283, "epoch": 2.763157894736842, "grad_norm": 0.28648391366004944, "learning_rate": 1.5823211875843455e-06, "loss": 1.5583, "mean_token_accuracy": 0.6273209810256958, "num_tokens": 31164910.0, "step": 5460 }, { "entropy": 1.546663898229599, "epoch": 2.7682186234817814, "grad_norm": 0.3598899841308594, "learning_rate": 1.548582995951417e-06, "loss": 1.5324, "mean_token_accuracy": 0.6319786071777344, "num_tokens": 31220029.0, "step": 5470 }, { "entropy": 1.5003357887268067, "epoch": 2.7732793522267207, "grad_norm": 0.2860889732837677, "learning_rate": 1.5148448043184886e-06, "loss": 1.4952, "mean_token_accuracy": 0.6411886811256409, "num_tokens": 31279401.0, "step": 5480 }, { "entropy": 1.418429398536682, "epoch": 2.77834008097166, "grad_norm": 0.2821556627750397, "learning_rate": 1.4811066126855602e-06, "loss": 1.421, "mean_token_accuracy": 0.6593441128730774, "num_tokens": 31334950.0, "step": 5490 }, { "entropy": 1.5357149362564086, "epoch": 2.783400809716599, "grad_norm": 0.3190230131149292, "learning_rate": 1.4473684210526317e-06, "loss": 1.5381, "mean_token_accuracy": 0.6347779989242553, "num_tokens": 31392814.0, "step": 5500 }, { "entropy": 1.4718513011932373, "epoch": 2.7884615384615383, "grad_norm": 0.2940792441368103, "learning_rate": 1.4136302294197033e-06, "loss": 1.4801, "mean_token_accuracy": 0.6389244079589844, "num_tokens": 31449900.0, "step": 5510 }, { "entropy": 1.3709456086158753, "epoch": 2.7935222672064777, "grad_norm": 0.30266401171684265, "learning_rate": 1.3798920377867747e-06, "loss": 1.3599, "mean_token_accuracy": 0.6677371621131897, "num_tokens": 31503636.0, "step": 5520 }, { "entropy": 1.4986552715301513, "epoch": 2.798582995951417, "grad_norm": 0.35532623529434204, "learning_rate": 1.3461538461538462e-06, "loss": 1.5069, "mean_token_accuracy": 0.6403627216815948, "num_tokens": 31563188.0, "step": 5530 }, { "entropy": 1.53000727891922, "epoch": 2.8036437246963564, "grad_norm": 0.3287500739097595, "learning_rate": 1.3124156545209176e-06, "loss": 1.5289, "mean_token_accuracy": 0.6303663849830627, "num_tokens": 31622655.0, "step": 5540 }, { "entropy": 1.399080502986908, "epoch": 2.8087044534412957, "grad_norm": 0.2796313762664795, "learning_rate": 1.2786774628879894e-06, "loss": 1.3962, "mean_token_accuracy": 0.6611886739730835, "num_tokens": 31678734.0, "step": 5550 }, { "entropy": 1.4993074774742126, "epoch": 2.813765182186235, "grad_norm": 0.2762647867202759, "learning_rate": 1.2449392712550609e-06, "loss": 1.5019, "mean_token_accuracy": 0.6430062472820282, "num_tokens": 31738238.0, "step": 5560 }, { "entropy": 1.5500144839286805, "epoch": 2.818825910931174, "grad_norm": 0.4136376678943634, "learning_rate": 1.2112010796221325e-06, "loss": 1.5483, "mean_token_accuracy": 0.6305223643779755, "num_tokens": 31794084.0, "step": 5570 }, { "entropy": 1.378357458114624, "epoch": 2.8238866396761133, "grad_norm": 0.2796184718608856, "learning_rate": 1.177462887989204e-06, "loss": 1.3879, "mean_token_accuracy": 0.6623157143592835, "num_tokens": 31850160.0, "step": 5580 }, { "entropy": 1.5122020602226258, "epoch": 2.8289473684210527, "grad_norm": 0.3030454218387604, "learning_rate": 1.1437246963562754e-06, "loss": 1.5336, "mean_token_accuracy": 0.6398816347122193, "num_tokens": 31908958.0, "step": 5590 }, { "entropy": 1.4396753072738648, "epoch": 2.834008097165992, "grad_norm": 0.329406201839447, "learning_rate": 1.109986504723347e-06, "loss": 1.4524, "mean_token_accuracy": 0.6439715623855591, "num_tokens": 31963221.0, "step": 5600 }, { "entropy": 1.4529581308364867, "epoch": 2.839068825910931, "grad_norm": 0.30805808305740356, "learning_rate": 1.0762483130904184e-06, "loss": 1.4535, "mean_token_accuracy": 0.6503434360027314, "num_tokens": 32024028.0, "step": 5610 }, { "entropy": 1.4559079647064208, "epoch": 2.8441295546558703, "grad_norm": 0.2905729115009308, "learning_rate": 1.0425101214574899e-06, "loss": 1.4595, "mean_token_accuracy": 0.6404688119888305, "num_tokens": 32079570.0, "step": 5620 }, { "entropy": 1.566865622997284, "epoch": 2.8491902834008096, "grad_norm": 0.3712847828865051, "learning_rate": 1.0087719298245615e-06, "loss": 1.5897, "mean_token_accuracy": 0.6281434834003449, "num_tokens": 32140749.0, "step": 5630 }, { "entropy": 1.3378282070159913, "epoch": 2.854251012145749, "grad_norm": 0.34094497561454773, "learning_rate": 9.750337381916331e-07, "loss": 1.3271, "mean_token_accuracy": 0.6688917458057404, "num_tokens": 32197723.0, "step": 5640 }, { "entropy": 1.5701539039611816, "epoch": 2.8593117408906883, "grad_norm": 0.3105640113353729, "learning_rate": 9.412955465587046e-07, "loss": 1.5691, "mean_token_accuracy": 0.631563925743103, "num_tokens": 32253364.0, "step": 5650 }, { "entropy": 1.3988579750061034, "epoch": 2.8643724696356276, "grad_norm": 0.33697089552879333, "learning_rate": 9.07557354925776e-07, "loss": 1.3774, "mean_token_accuracy": 0.6580399334430694, "num_tokens": 32307759.0, "step": 5660 }, { "entropy": 1.5067651271820068, "epoch": 2.869433198380567, "grad_norm": 0.4209248721599579, "learning_rate": 8.738191632928476e-07, "loss": 1.5117, "mean_token_accuracy": 0.6395208477973938, "num_tokens": 32361901.0, "step": 5670 }, { "entropy": 1.4897794604301453, "epoch": 2.8744939271255063, "grad_norm": 0.26533105969429016, "learning_rate": 8.400809716599192e-07, "loss": 1.492, "mean_token_accuracy": 0.6359480619430542, "num_tokens": 32421074.0, "step": 5680 }, { "entropy": 1.5117918133735657, "epoch": 2.8795546558704452, "grad_norm": 0.2814977169036865, "learning_rate": 8.063427800269906e-07, "loss": 1.5099, "mean_token_accuracy": 0.6378594696521759, "num_tokens": 32475249.0, "step": 5690 }, { "entropy": 1.5758733749389648, "epoch": 2.8846153846153846, "grad_norm": 0.3215586543083191, "learning_rate": 7.726045883940621e-07, "loss": 1.5742, "mean_token_accuracy": 0.6247093558311463, "num_tokens": 32533856.0, "step": 5700 }, { "entropy": 1.5476130247116089, "epoch": 2.889676113360324, "grad_norm": 0.3249874413013458, "learning_rate": 7.388663967611337e-07, "loss": 1.5596, "mean_token_accuracy": 0.630809611082077, "num_tokens": 32590748.0, "step": 5710 }, { "entropy": 1.6522730588912964, "epoch": 2.8947368421052633, "grad_norm": 0.30724644660949707, "learning_rate": 7.051282051282052e-07, "loss": 1.6494, "mean_token_accuracy": 0.6230604112148285, "num_tokens": 32646249.0, "step": 5720 }, { "entropy": 1.544532060623169, "epoch": 2.899797570850202, "grad_norm": 0.2921552062034607, "learning_rate": 6.713900134952767e-07, "loss": 1.5418, "mean_token_accuracy": 0.6278964817523957, "num_tokens": 32705968.0, "step": 5730 }, { "entropy": 1.6792848348617553, "epoch": 2.9048582995951415, "grad_norm": 0.35362908244132996, "learning_rate": 6.376518218623482e-07, "loss": 1.6863, "mean_token_accuracy": 0.606015944480896, "num_tokens": 32764283.0, "step": 5740 }, { "entropy": 1.3941245913505553, "epoch": 2.909919028340081, "grad_norm": 0.3051432967185974, "learning_rate": 6.039136302294198e-07, "loss": 1.3916, "mean_token_accuracy": 0.655018413066864, "num_tokens": 32821604.0, "step": 5750 }, { "entropy": 1.3279268383979796, "epoch": 2.91497975708502, "grad_norm": 0.28279706835746765, "learning_rate": 5.701754385964912e-07, "loss": 1.3252, "mean_token_accuracy": 0.6721781909465789, "num_tokens": 32882742.0, "step": 5760 }, { "entropy": 1.40997371673584, "epoch": 2.9200404858299596, "grad_norm": 0.27882078289985657, "learning_rate": 5.364372469635628e-07, "loss": 1.4014, "mean_token_accuracy": 0.653525573015213, "num_tokens": 32940852.0, "step": 5770 }, { "entropy": 1.4527880668640136, "epoch": 2.925101214574899, "grad_norm": 0.34039291739463806, "learning_rate": 5.026990553306344e-07, "loss": 1.4668, "mean_token_accuracy": 0.6496657609939576, "num_tokens": 32993007.0, "step": 5780 }, { "entropy": 1.4252225756645203, "epoch": 2.9301619433198383, "grad_norm": 0.33022540807724, "learning_rate": 4.6896086369770585e-07, "loss": 1.4163, "mean_token_accuracy": 0.6603101253509521, "num_tokens": 33052311.0, "step": 5790 }, { "entropy": 1.3463350296020509, "epoch": 2.9352226720647776, "grad_norm": 0.3052782416343689, "learning_rate": 4.352226720647774e-07, "loss": 1.3402, "mean_token_accuracy": 0.6646123170852661, "num_tokens": 33107681.0, "step": 5800 }, { "entropy": 1.4629117488861083, "epoch": 2.9402834008097165, "grad_norm": 0.3405231535434723, "learning_rate": 4.0148448043184886e-07, "loss": 1.4697, "mean_token_accuracy": 0.6421536803245544, "num_tokens": 33160562.0, "step": 5810 }, { "entropy": 1.5115989089012145, "epoch": 2.945344129554656, "grad_norm": 0.25086501240730286, "learning_rate": 3.677462887989204e-07, "loss": 1.5254, "mean_token_accuracy": 0.636814546585083, "num_tokens": 33219166.0, "step": 5820 }, { "entropy": 1.5050144791603088, "epoch": 2.950404858299595, "grad_norm": 0.30874550342559814, "learning_rate": 3.34008097165992e-07, "loss": 1.5059, "mean_token_accuracy": 0.6406992137432098, "num_tokens": 33275918.0, "step": 5830 }, { "entropy": 1.4055772423744202, "epoch": 2.9554655870445345, "grad_norm": 0.37710893154144287, "learning_rate": 3.0026990553306346e-07, "loss": 1.4007, "mean_token_accuracy": 0.6559918403625489, "num_tokens": 33330271.0, "step": 5840 }, { "entropy": 1.514758288860321, "epoch": 2.9605263157894735, "grad_norm": 0.2986261248588562, "learning_rate": 2.66531713900135e-07, "loss": 1.5106, "mean_token_accuracy": 0.6324501454830169, "num_tokens": 33390790.0, "step": 5850 }, { "entropy": 1.4632804989814758, "epoch": 2.965587044534413, "grad_norm": 0.297039657831192, "learning_rate": 2.327935222672065e-07, "loss": 1.4562, "mean_token_accuracy": 0.652844125032425, "num_tokens": 33451092.0, "step": 5860 }, { "entropy": 1.619661772251129, "epoch": 2.970647773279352, "grad_norm": 0.32788631319999695, "learning_rate": 1.9905533063427803e-07, "loss": 1.6222, "mean_token_accuracy": 0.6278849899768829, "num_tokens": 33509293.0, "step": 5870 }, { "entropy": 1.6123327970504762, "epoch": 2.9757085020242915, "grad_norm": 0.30364230275154114, "learning_rate": 1.6531713900134953e-07, "loss": 1.62, "mean_token_accuracy": 0.6268645524978638, "num_tokens": 33567748.0, "step": 5880 }, { "entropy": 1.465350294113159, "epoch": 2.980769230769231, "grad_norm": 0.271182119846344, "learning_rate": 1.3157894736842107e-07, "loss": 1.4767, "mean_token_accuracy": 0.6428022742271423, "num_tokens": 33626678.0, "step": 5890 }, { "entropy": 1.5106886863708495, "epoch": 2.98582995951417, "grad_norm": 0.30039140582084656, "learning_rate": 9.784075573549259e-08, "loss": 1.501, "mean_token_accuracy": 0.6413563072681427, "num_tokens": 33681713.0, "step": 5900 }, { "entropy": 1.6268660426139832, "epoch": 2.9908906882591095, "grad_norm": 0.30086028575897217, "learning_rate": 6.41025641025641e-08, "loss": 1.6387, "mean_token_accuracy": 0.6218379974365235, "num_tokens": 33738372.0, "step": 5910 }, { "entropy": 1.5718028783798217, "epoch": 2.9959514170040484, "grad_norm": 0.3744632601737976, "learning_rate": 3.036437246963563e-08, "loss": 1.5594, "mean_token_accuracy": 0.6297330737113953, "num_tokens": 33794850.0, "step": 5920 } ], "logging_steps": 10, "max_steps": 5928, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2845340765506765e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }