| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 5928, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.3182146310806275, | |
| "epoch": 0.005060728744939271, | |
| "grad_norm": 0.36533281207084656, | |
| "learning_rate": 1.9969635627530365e-05, | |
| "loss": 2.0792, | |
| "mean_token_accuracy": 0.5860633730888367, | |
| "num_tokens": 59233.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 1.2874402403831482, | |
| "epoch": 0.010121457489878543, | |
| "grad_norm": 0.47160375118255615, | |
| "learning_rate": 1.9935897435897437e-05, | |
| "loss": 2.0417, | |
| "mean_token_accuracy": 0.5939164876937866, | |
| "num_tokens": 114581.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 1.1730836629867554, | |
| "epoch": 0.015182186234817813, | |
| "grad_norm": 0.3855545222759247, | |
| "learning_rate": 1.990215924426451e-05, | |
| "loss": 1.8448, | |
| "mean_token_accuracy": 0.6225896775722504, | |
| "num_tokens": 170799.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 1.328663158416748, | |
| "epoch": 0.020242914979757085, | |
| "grad_norm": 0.3587476313114166, | |
| "learning_rate": 1.986842105263158e-05, | |
| "loss": 2.0267, | |
| "mean_token_accuracy": 0.5886577606201172, | |
| "num_tokens": 224687.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 1.3244597673416139, | |
| "epoch": 0.025303643724696356, | |
| "grad_norm": 0.4023756980895996, | |
| "learning_rate": 1.9834682860998653e-05, | |
| "loss": 1.9579, | |
| "mean_token_accuracy": 0.5918201506137848, | |
| "num_tokens": 282567.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 1.3242129743099214, | |
| "epoch": 0.030364372469635626, | |
| "grad_norm": 0.4889814257621765, | |
| "learning_rate": 1.9800944669365722e-05, | |
| "loss": 1.9161, | |
| "mean_token_accuracy": 0.6058241128921509, | |
| "num_tokens": 336705.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 1.353874671459198, | |
| "epoch": 0.0354251012145749, | |
| "grad_norm": 0.43763861060142517, | |
| "learning_rate": 1.9767206477732795e-05, | |
| "loss": 1.8797, | |
| "mean_token_accuracy": 0.6002139091491699, | |
| "num_tokens": 395328.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 1.2600490927696228, | |
| "epoch": 0.04048582995951417, | |
| "grad_norm": 0.6143773198127747, | |
| "learning_rate": 1.9733468286099865e-05, | |
| "loss": 1.7122, | |
| "mean_token_accuracy": 0.6283527314662933, | |
| "num_tokens": 448766.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 1.318302822113037, | |
| "epoch": 0.04554655870445344, | |
| "grad_norm": 0.37836670875549316, | |
| "learning_rate": 1.9699730094466938e-05, | |
| "loss": 1.7054, | |
| "mean_token_accuracy": 0.6172383666038513, | |
| "num_tokens": 502153.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 1.395500862598419, | |
| "epoch": 0.05060728744939271, | |
| "grad_norm": 0.42456531524658203, | |
| "learning_rate": 1.966599190283401e-05, | |
| "loss": 1.7468, | |
| "mean_token_accuracy": 0.6102555096149445, | |
| "num_tokens": 559789.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 1.2470922768115997, | |
| "epoch": 0.05566801619433198, | |
| "grad_norm": 0.3632870316505432, | |
| "learning_rate": 1.963225371120108e-05, | |
| "loss": 1.5353, | |
| "mean_token_accuracy": 0.6436724066734314, | |
| "num_tokens": 620090.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 1.4007501482963562, | |
| "epoch": 0.06072874493927125, | |
| "grad_norm": 0.36180490255355835, | |
| "learning_rate": 1.9598515519568153e-05, | |
| "loss": 1.7042, | |
| "mean_token_accuracy": 0.6169813573360443, | |
| "num_tokens": 674400.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 1.509167194366455, | |
| "epoch": 0.06578947368421052, | |
| "grad_norm": 0.33648917078971863, | |
| "learning_rate": 1.9564777327935226e-05, | |
| "loss": 1.8149, | |
| "mean_token_accuracy": 0.6050768792629242, | |
| "num_tokens": 732008.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 1.406454861164093, | |
| "epoch": 0.0708502024291498, | |
| "grad_norm": 0.2894444167613983, | |
| "learning_rate": 1.9531039136302295e-05, | |
| "loss": 1.6417, | |
| "mean_token_accuracy": 0.6224378228187561, | |
| "num_tokens": 790088.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 1.5152673125267029, | |
| "epoch": 0.07591093117408906, | |
| "grad_norm": 0.28545448184013367, | |
| "learning_rate": 1.949730094466937e-05, | |
| "loss": 1.7609, | |
| "mean_token_accuracy": 0.6055684983730316, | |
| "num_tokens": 846716.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 1.3967717170715332, | |
| "epoch": 0.08097165991902834, | |
| "grad_norm": 0.26414811611175537, | |
| "learning_rate": 1.9463562753036438e-05, | |
| "loss": 1.5725, | |
| "mean_token_accuracy": 0.6291777551174164, | |
| "num_tokens": 909154.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 1.5347481608390807, | |
| "epoch": 0.0860323886639676, | |
| "grad_norm": 0.27282679080963135, | |
| "learning_rate": 1.942982456140351e-05, | |
| "loss": 1.7069, | |
| "mean_token_accuracy": 0.6087481796741485, | |
| "num_tokens": 967058.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 1.4826232314109802, | |
| "epoch": 0.09109311740890688, | |
| "grad_norm": 0.23245945572853088, | |
| "learning_rate": 1.939608636977058e-05, | |
| "loss": 1.636, | |
| "mean_token_accuracy": 0.6207191824913025, | |
| "num_tokens": 1022407.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 1.5127532601356506, | |
| "epoch": 0.09615384615384616, | |
| "grad_norm": 0.2711787223815918, | |
| "learning_rate": 1.9362348178137653e-05, | |
| "loss": 1.6767, | |
| "mean_token_accuracy": 0.615806394815445, | |
| "num_tokens": 1079738.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 1.6158159017562865, | |
| "epoch": 0.10121457489878542, | |
| "grad_norm": 0.29755550622940063, | |
| "learning_rate": 1.9328609986504726e-05, | |
| "loss": 1.7642, | |
| "mean_token_accuracy": 0.6007438480854035, | |
| "num_tokens": 1140680.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 1.4902734279632568, | |
| "epoch": 0.1062753036437247, | |
| "grad_norm": 0.24520562589168549, | |
| "learning_rate": 1.9294871794871796e-05, | |
| "loss": 1.5893, | |
| "mean_token_accuracy": 0.6293672084808349, | |
| "num_tokens": 1194492.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 1.6347212672233582, | |
| "epoch": 0.11133603238866396, | |
| "grad_norm": 0.3082791566848755, | |
| "learning_rate": 1.926113360323887e-05, | |
| "loss": 1.7482, | |
| "mean_token_accuracy": 0.602141198515892, | |
| "num_tokens": 1252053.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 1.5750308752059936, | |
| "epoch": 0.11639676113360324, | |
| "grad_norm": 0.23394237458705902, | |
| "learning_rate": 1.922739541160594e-05, | |
| "loss": 1.6651, | |
| "mean_token_accuracy": 0.6140229105949402, | |
| "num_tokens": 1308749.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 1.5293641209602356, | |
| "epoch": 0.1214574898785425, | |
| "grad_norm": 0.22243493795394897, | |
| "learning_rate": 1.919365721997301e-05, | |
| "loss": 1.5962, | |
| "mean_token_accuracy": 0.6277494192123413, | |
| "num_tokens": 1371806.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 1.5892576217651366, | |
| "epoch": 0.12651821862348178, | |
| "grad_norm": 0.23461221158504486, | |
| "learning_rate": 1.915991902834008e-05, | |
| "loss": 1.6669, | |
| "mean_token_accuracy": 0.6198325097560883, | |
| "num_tokens": 1427210.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 1.6534079551696776, | |
| "epoch": 0.13157894736842105, | |
| "grad_norm": 0.2797304391860962, | |
| "learning_rate": 1.9126180836707153e-05, | |
| "loss": 1.7432, | |
| "mean_token_accuracy": 0.6030194580554962, | |
| "num_tokens": 1485664.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 1.6070345997810365, | |
| "epoch": 0.13663967611336034, | |
| "grad_norm": 0.22065305709838867, | |
| "learning_rate": 1.9092442645074226e-05, | |
| "loss": 1.677, | |
| "mean_token_accuracy": 0.6108350694179535, | |
| "num_tokens": 1544169.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 1.658397912979126, | |
| "epoch": 0.1417004048582996, | |
| "grad_norm": 0.17878006398677826, | |
| "learning_rate": 1.9058704453441296e-05, | |
| "loss": 1.7484, | |
| "mean_token_accuracy": 0.6061225473880768, | |
| "num_tokens": 1607852.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 1.5805446743965148, | |
| "epoch": 0.14676113360323886, | |
| "grad_norm": 0.20498958230018616, | |
| "learning_rate": 1.902496626180837e-05, | |
| "loss": 1.6358, | |
| "mean_token_accuracy": 0.6217520833015442, | |
| "num_tokens": 1667280.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 1.553944504261017, | |
| "epoch": 0.15182186234817813, | |
| "grad_norm": 0.2072789967060089, | |
| "learning_rate": 1.899122807017544e-05, | |
| "loss": 1.6016, | |
| "mean_token_accuracy": 0.6277549624443054, | |
| "num_tokens": 1722987.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 1.6091766953468323, | |
| "epoch": 0.15688259109311742, | |
| "grad_norm": 0.25766435265541077, | |
| "learning_rate": 1.895748987854251e-05, | |
| "loss": 1.6603, | |
| "mean_token_accuracy": 0.6145843267440796, | |
| "num_tokens": 1777611.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 1.4922061681747436, | |
| "epoch": 0.16194331983805668, | |
| "grad_norm": 0.23709791898727417, | |
| "learning_rate": 1.8923751686909584e-05, | |
| "loss": 1.5237, | |
| "mean_token_accuracy": 0.638946932554245, | |
| "num_tokens": 1833335.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 1.5376826167106628, | |
| "epoch": 0.16700404858299595, | |
| "grad_norm": 0.24256624281406403, | |
| "learning_rate": 1.8890013495276657e-05, | |
| "loss": 1.5813, | |
| "mean_token_accuracy": 0.625263386964798, | |
| "num_tokens": 1883303.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 1.5892139554023743, | |
| "epoch": 0.1720647773279352, | |
| "grad_norm": 0.20020949840545654, | |
| "learning_rate": 1.8856275303643726e-05, | |
| "loss": 1.6522, | |
| "mean_token_accuracy": 0.6241094172000885, | |
| "num_tokens": 1937007.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 1.5604022860527038, | |
| "epoch": 0.1771255060728745, | |
| "grad_norm": 0.2134305238723755, | |
| "learning_rate": 1.8822537112010796e-05, | |
| "loss": 1.6036, | |
| "mean_token_accuracy": 0.617218679189682, | |
| "num_tokens": 1996601.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 1.6344910740852356, | |
| "epoch": 0.18218623481781376, | |
| "grad_norm": 0.2528083622455597, | |
| "learning_rate": 1.878879892037787e-05, | |
| "loss": 1.6885, | |
| "mean_token_accuracy": 0.6135373294353486, | |
| "num_tokens": 2051920.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 1.6087428450584411, | |
| "epoch": 0.18724696356275303, | |
| "grad_norm": 0.3239048421382904, | |
| "learning_rate": 1.8755060728744942e-05, | |
| "loss": 1.687, | |
| "mean_token_accuracy": 0.6124200880527496, | |
| "num_tokens": 2108082.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 1.5165512919425965, | |
| "epoch": 0.19230769230769232, | |
| "grad_norm": 0.21001844108104706, | |
| "learning_rate": 1.872132253711201e-05, | |
| "loss": 1.5231, | |
| "mean_token_accuracy": 0.6357404530048371, | |
| "num_tokens": 2164317.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 1.4784175634384156, | |
| "epoch": 0.19736842105263158, | |
| "grad_norm": 0.21521133184432983, | |
| "learning_rate": 1.8687584345479084e-05, | |
| "loss": 1.5055, | |
| "mean_token_accuracy": 0.6435703456401825, | |
| "num_tokens": 2222720.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 1.6346607565879823, | |
| "epoch": 0.20242914979757085, | |
| "grad_norm": 0.24888823926448822, | |
| "learning_rate": 1.8653846153846157e-05, | |
| "loss": 1.6701, | |
| "mean_token_accuracy": 0.6212630808353424, | |
| "num_tokens": 2284597.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 1.6906925320625306, | |
| "epoch": 0.2074898785425101, | |
| "grad_norm": 0.21836967766284943, | |
| "learning_rate": 1.8620107962213227e-05, | |
| "loss": 1.7238, | |
| "mean_token_accuracy": 0.606383764743805, | |
| "num_tokens": 2340566.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 1.4906925320625306, | |
| "epoch": 0.2125506072874494, | |
| "grad_norm": 0.21778637170791626, | |
| "learning_rate": 1.85863697705803e-05, | |
| "loss": 1.5135, | |
| "mean_token_accuracy": 0.6399281203746796, | |
| "num_tokens": 2397861.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 1.6318996787071227, | |
| "epoch": 0.21761133603238866, | |
| "grad_norm": 0.2725844979286194, | |
| "learning_rate": 1.8552631578947373e-05, | |
| "loss": 1.707, | |
| "mean_token_accuracy": 0.6183918356895447, | |
| "num_tokens": 2453175.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 1.530117428302765, | |
| "epoch": 0.22267206477732793, | |
| "grad_norm": 0.20461727678775787, | |
| "learning_rate": 1.8518893387314442e-05, | |
| "loss": 1.5423, | |
| "mean_token_accuracy": 0.6370847761631012, | |
| "num_tokens": 2511372.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 1.5733375310897828, | |
| "epoch": 0.22773279352226722, | |
| "grad_norm": 0.2394452542066574, | |
| "learning_rate": 1.848515519568151e-05, | |
| "loss": 1.5909, | |
| "mean_token_accuracy": 0.6229382216930389, | |
| "num_tokens": 2571400.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 1.4420257091522217, | |
| "epoch": 0.23279352226720648, | |
| "grad_norm": 0.23069949448108673, | |
| "learning_rate": 1.8451417004048584e-05, | |
| "loss": 1.4763, | |
| "mean_token_accuracy": 0.6464443206787109, | |
| "num_tokens": 2630220.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 1.5972508192062378, | |
| "epoch": 0.23785425101214575, | |
| "grad_norm": 0.22586746513843536, | |
| "learning_rate": 1.8417678812415657e-05, | |
| "loss": 1.6251, | |
| "mean_token_accuracy": 0.6198283314704895, | |
| "num_tokens": 2688923.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 1.4670196294784545, | |
| "epoch": 0.242914979757085, | |
| "grad_norm": 0.23567302525043488, | |
| "learning_rate": 1.8383940620782727e-05, | |
| "loss": 1.4654, | |
| "mean_token_accuracy": 0.6410723388195038, | |
| "num_tokens": 2745340.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 1.5074788808822632, | |
| "epoch": 0.2479757085020243, | |
| "grad_norm": 0.2870822548866272, | |
| "learning_rate": 1.83502024291498e-05, | |
| "loss": 1.532, | |
| "mean_token_accuracy": 0.6343021392822266, | |
| "num_tokens": 2796794.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 1.5003413558006287, | |
| "epoch": 0.25303643724696356, | |
| "grad_norm": 0.19105228781700134, | |
| "learning_rate": 1.8316464237516873e-05, | |
| "loss": 1.5132, | |
| "mean_token_accuracy": 0.643144553899765, | |
| "num_tokens": 2853606.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 1.4724809288978578, | |
| "epoch": 0.25809716599190285, | |
| "grad_norm": 0.2321540117263794, | |
| "learning_rate": 1.8282726045883942e-05, | |
| "loss": 1.499, | |
| "mean_token_accuracy": 0.6394071221351624, | |
| "num_tokens": 2910686.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 1.5466928601264953, | |
| "epoch": 0.2631578947368421, | |
| "grad_norm": 0.2588091790676117, | |
| "learning_rate": 1.8248987854251015e-05, | |
| "loss": 1.5745, | |
| "mean_token_accuracy": 0.6278865933418274, | |
| "num_tokens": 2969316.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 1.4397364735603333, | |
| "epoch": 0.2682186234817814, | |
| "grad_norm": 0.22344444692134857, | |
| "learning_rate": 1.8215249662618085e-05, | |
| "loss": 1.4459, | |
| "mean_token_accuracy": 0.6503956913948059, | |
| "num_tokens": 3021973.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 1.5749887704849244, | |
| "epoch": 0.2732793522267207, | |
| "grad_norm": 0.20665939152240753, | |
| "learning_rate": 1.8181511470985158e-05, | |
| "loss": 1.6023, | |
| "mean_token_accuracy": 0.622279840707779, | |
| "num_tokens": 3081510.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 1.519134557247162, | |
| "epoch": 0.2783400809716599, | |
| "grad_norm": 0.20693500339984894, | |
| "learning_rate": 1.8147773279352227e-05, | |
| "loss": 1.5253, | |
| "mean_token_accuracy": 0.6335927963256835, | |
| "num_tokens": 3139438.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 1.4850042462348938, | |
| "epoch": 0.2834008097165992, | |
| "grad_norm": 0.20233699679374695, | |
| "learning_rate": 1.81140350877193e-05, | |
| "loss": 1.5081, | |
| "mean_token_accuracy": 0.6392342805862427, | |
| "num_tokens": 3194184.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 1.6362142205238341, | |
| "epoch": 0.28846153846153844, | |
| "grad_norm": 0.19187521934509277, | |
| "learning_rate": 1.808029689608637e-05, | |
| "loss": 1.6497, | |
| "mean_token_accuracy": 0.6131653010845184, | |
| "num_tokens": 3253449.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 1.5829873204231262, | |
| "epoch": 0.2935222672064777, | |
| "grad_norm": 0.21769073605537415, | |
| "learning_rate": 1.8046558704453442e-05, | |
| "loss": 1.6063, | |
| "mean_token_accuracy": 0.6185498893260956, | |
| "num_tokens": 3309330.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 1.5365434288978577, | |
| "epoch": 0.298582995951417, | |
| "grad_norm": 0.20103144645690918, | |
| "learning_rate": 1.8012820512820515e-05, | |
| "loss": 1.5559, | |
| "mean_token_accuracy": 0.6354671478271484, | |
| "num_tokens": 3368237.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 1.5507975578308106, | |
| "epoch": 0.30364372469635625, | |
| "grad_norm": 0.20210447907447815, | |
| "learning_rate": 1.7979082321187585e-05, | |
| "loss": 1.5848, | |
| "mean_token_accuracy": 0.6247013151645661, | |
| "num_tokens": 3429760.0, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 1.668788731098175, | |
| "epoch": 0.30870445344129555, | |
| "grad_norm": 0.23076701164245605, | |
| "learning_rate": 1.7945344129554658e-05, | |
| "loss": 1.7186, | |
| "mean_token_accuracy": 0.6151426732540131, | |
| "num_tokens": 3481424.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 1.4967810273170472, | |
| "epoch": 0.31376518218623484, | |
| "grad_norm": 0.18658699095249176, | |
| "learning_rate": 1.791160593792173e-05, | |
| "loss": 1.5039, | |
| "mean_token_accuracy": 0.6395800650119782, | |
| "num_tokens": 3540974.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 1.5561461091041564, | |
| "epoch": 0.3188259109311741, | |
| "grad_norm": 0.2277403026819229, | |
| "learning_rate": 1.78778677462888e-05, | |
| "loss": 1.5933, | |
| "mean_token_accuracy": 0.6238772809505463, | |
| "num_tokens": 3599247.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 1.6327290773391723, | |
| "epoch": 0.32388663967611336, | |
| "grad_norm": 0.21525472402572632, | |
| "learning_rate": 1.784412955465587e-05, | |
| "loss": 1.6583, | |
| "mean_token_accuracy": 0.6137534499168396, | |
| "num_tokens": 3656797.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 1.6783543229103088, | |
| "epoch": 0.32894736842105265, | |
| "grad_norm": 0.2178918868303299, | |
| "learning_rate": 1.7810391363022943e-05, | |
| "loss": 1.7236, | |
| "mean_token_accuracy": 0.6091976821422577, | |
| "num_tokens": 3712597.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 1.458414077758789, | |
| "epoch": 0.3340080971659919, | |
| "grad_norm": 0.2724186182022095, | |
| "learning_rate": 1.7776653171390016e-05, | |
| "loss": 1.4712, | |
| "mean_token_accuracy": 0.6499071300029755, | |
| "num_tokens": 3769258.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 1.4010087609291078, | |
| "epoch": 0.3390688259109312, | |
| "grad_norm": 0.24354684352874756, | |
| "learning_rate": 1.7742914979757085e-05, | |
| "loss": 1.4125, | |
| "mean_token_accuracy": 0.6563641846179962, | |
| "num_tokens": 3827461.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 1.5870350360870362, | |
| "epoch": 0.3441295546558704, | |
| "grad_norm": 0.20323996245861053, | |
| "learning_rate": 1.7709176788124158e-05, | |
| "loss": 1.6215, | |
| "mean_token_accuracy": 0.624784529209137, | |
| "num_tokens": 3884189.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 1.6935706734657288, | |
| "epoch": 0.3491902834008097, | |
| "grad_norm": 0.24285322427749634, | |
| "learning_rate": 1.767543859649123e-05, | |
| "loss": 1.7141, | |
| "mean_token_accuracy": 0.6097041130065918, | |
| "num_tokens": 3939148.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 1.5216692566871644, | |
| "epoch": 0.354251012145749, | |
| "grad_norm": 0.24251361191272736, | |
| "learning_rate": 1.76417004048583e-05, | |
| "loss": 1.526, | |
| "mean_token_accuracy": 0.6344065189361572, | |
| "num_tokens": 3996059.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 1.584353470802307, | |
| "epoch": 0.35931174089068824, | |
| "grad_norm": 0.22013038396835327, | |
| "learning_rate": 1.7607962213225373e-05, | |
| "loss": 1.5894, | |
| "mean_token_accuracy": 0.6244750499725342, | |
| "num_tokens": 4056179.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 1.499224054813385, | |
| "epoch": 0.3643724696356275, | |
| "grad_norm": 0.22103145718574524, | |
| "learning_rate": 1.7574224021592443e-05, | |
| "loss": 1.5209, | |
| "mean_token_accuracy": 0.6322570383548737, | |
| "num_tokens": 4114329.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 1.3952741980552674, | |
| "epoch": 0.3694331983805668, | |
| "grad_norm": 0.19164645671844482, | |
| "learning_rate": 1.7540485829959516e-05, | |
| "loss": 1.4095, | |
| "mean_token_accuracy": 0.6568491697311402, | |
| "num_tokens": 4167072.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 1.484491491317749, | |
| "epoch": 0.37449392712550605, | |
| "grad_norm": 0.22778365015983582, | |
| "learning_rate": 1.7506747638326585e-05, | |
| "loss": 1.5054, | |
| "mean_token_accuracy": 0.6413045108318329, | |
| "num_tokens": 4225902.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 1.5875544667243957, | |
| "epoch": 0.37955465587044535, | |
| "grad_norm": 0.22424441576004028, | |
| "learning_rate": 1.7473009446693658e-05, | |
| "loss": 1.6189, | |
| "mean_token_accuracy": 0.6218379735946655, | |
| "num_tokens": 4282716.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 1.5914431929588317, | |
| "epoch": 0.38461538461538464, | |
| "grad_norm": 0.22598877549171448, | |
| "learning_rate": 1.743927125506073e-05, | |
| "loss": 1.629, | |
| "mean_token_accuracy": 0.6168800354003906, | |
| "num_tokens": 4341769.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 1.588646697998047, | |
| "epoch": 0.3896761133603239, | |
| "grad_norm": 0.24020566046237946, | |
| "learning_rate": 1.74055330634278e-05, | |
| "loss": 1.5962, | |
| "mean_token_accuracy": 0.6249550580978394, | |
| "num_tokens": 4400298.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 1.524513852596283, | |
| "epoch": 0.39473684210526316, | |
| "grad_norm": 0.19308218359947205, | |
| "learning_rate": 1.7371794871794873e-05, | |
| "loss": 1.5494, | |
| "mean_token_accuracy": 0.6319825410842895, | |
| "num_tokens": 4456170.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 1.5187025308609008, | |
| "epoch": 0.39979757085020245, | |
| "grad_norm": 0.2745817303657532, | |
| "learning_rate": 1.7338056680161946e-05, | |
| "loss": 1.5286, | |
| "mean_token_accuracy": 0.641098040342331, | |
| "num_tokens": 4509439.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 1.5416448593139649, | |
| "epoch": 0.4048582995951417, | |
| "grad_norm": 0.2520337998867035, | |
| "learning_rate": 1.7304318488529016e-05, | |
| "loss": 1.5522, | |
| "mean_token_accuracy": 0.6389577805995941, | |
| "num_tokens": 4568509.0, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 1.622413122653961, | |
| "epoch": 0.409919028340081, | |
| "grad_norm": 0.20173849165439606, | |
| "learning_rate": 1.7270580296896085e-05, | |
| "loss": 1.6312, | |
| "mean_token_accuracy": 0.6254597425460815, | |
| "num_tokens": 4621157.0, | |
| "step": 810 | |
| }, | |
| { | |
| "entropy": 1.6604674816131593, | |
| "epoch": 0.4149797570850202, | |
| "grad_norm": 0.23679770529270172, | |
| "learning_rate": 1.723684210526316e-05, | |
| "loss": 1.6884, | |
| "mean_token_accuracy": 0.6142423152923584, | |
| "num_tokens": 4673418.0, | |
| "step": 820 | |
| }, | |
| { | |
| "entropy": 1.512584674358368, | |
| "epoch": 0.4200404858299595, | |
| "grad_norm": 0.22097937762737274, | |
| "learning_rate": 1.720310391363023e-05, | |
| "loss": 1.5394, | |
| "mean_token_accuracy": 0.6410868644714356, | |
| "num_tokens": 4731386.0, | |
| "step": 830 | |
| }, | |
| { | |
| "entropy": 1.4811100244522095, | |
| "epoch": 0.4251012145748988, | |
| "grad_norm": 0.1975807249546051, | |
| "learning_rate": 1.71693657219973e-05, | |
| "loss": 1.474, | |
| "mean_token_accuracy": 0.6388140618801117, | |
| "num_tokens": 4784928.0, | |
| "step": 840 | |
| }, | |
| { | |
| "entropy": 1.6224814653396606, | |
| "epoch": 0.43016194331983804, | |
| "grad_norm": 0.21695128083229065, | |
| "learning_rate": 1.7135627530364374e-05, | |
| "loss": 1.6465, | |
| "mean_token_accuracy": 0.6171948432922363, | |
| "num_tokens": 4844351.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 1.4179201185703278, | |
| "epoch": 0.4352226720647773, | |
| "grad_norm": 0.2105616182088852, | |
| "learning_rate": 1.7101889338731447e-05, | |
| "loss": 1.4287, | |
| "mean_token_accuracy": 0.6521054327487945, | |
| "num_tokens": 4902506.0, | |
| "step": 860 | |
| }, | |
| { | |
| "entropy": 1.5097766757011413, | |
| "epoch": 0.4402834008097166, | |
| "grad_norm": 0.23443420231342316, | |
| "learning_rate": 1.7068151147098516e-05, | |
| "loss": 1.526, | |
| "mean_token_accuracy": 0.6371770858764648, | |
| "num_tokens": 4957377.0, | |
| "step": 870 | |
| }, | |
| { | |
| "entropy": 1.499946367740631, | |
| "epoch": 0.44534412955465585, | |
| "grad_norm": 0.1935402899980545, | |
| "learning_rate": 1.703441295546559e-05, | |
| "loss": 1.536, | |
| "mean_token_accuracy": 0.6418095469474793, | |
| "num_tokens": 5019057.0, | |
| "step": 880 | |
| }, | |
| { | |
| "entropy": 1.5840212941169738, | |
| "epoch": 0.45040485829959515, | |
| "grad_norm": 0.2871309518814087, | |
| "learning_rate": 1.7000674763832662e-05, | |
| "loss": 1.5944, | |
| "mean_token_accuracy": 0.6274131119251252, | |
| "num_tokens": 5072125.0, | |
| "step": 890 | |
| }, | |
| { | |
| "entropy": 1.6296968936920166, | |
| "epoch": 0.45546558704453444, | |
| "grad_norm": 0.19836841523647308, | |
| "learning_rate": 1.696693657219973e-05, | |
| "loss": 1.6328, | |
| "mean_token_accuracy": 0.621731948852539, | |
| "num_tokens": 5127397.0, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 1.5373274445533753, | |
| "epoch": 0.4605263157894737, | |
| "grad_norm": 0.24680444598197937, | |
| "learning_rate": 1.69331983805668e-05, | |
| "loss": 1.5417, | |
| "mean_token_accuracy": 0.628632801771164, | |
| "num_tokens": 5179785.0, | |
| "step": 910 | |
| }, | |
| { | |
| "entropy": 1.4075371384620667, | |
| "epoch": 0.46558704453441296, | |
| "grad_norm": 0.23700740933418274, | |
| "learning_rate": 1.6899460188933874e-05, | |
| "loss": 1.4108, | |
| "mean_token_accuracy": 0.6608946800231934, | |
| "num_tokens": 5235573.0, | |
| "step": 920 | |
| }, | |
| { | |
| "entropy": 1.5140914797782898, | |
| "epoch": 0.4706477732793522, | |
| "grad_norm": 0.23013481497764587, | |
| "learning_rate": 1.6865721997300947e-05, | |
| "loss": 1.5085, | |
| "mean_token_accuracy": 0.6322543203830719, | |
| "num_tokens": 5294146.0, | |
| "step": 930 | |
| }, | |
| { | |
| "entropy": 1.5315414309501647, | |
| "epoch": 0.4757085020242915, | |
| "grad_norm": 0.27098962664604187, | |
| "learning_rate": 1.6831983805668016e-05, | |
| "loss": 1.5617, | |
| "mean_token_accuracy": 0.6292850613594055, | |
| "num_tokens": 5350989.0, | |
| "step": 940 | |
| }, | |
| { | |
| "entropy": 1.479003095626831, | |
| "epoch": 0.4807692307692308, | |
| "grad_norm": 0.1984509378671646, | |
| "learning_rate": 1.679824561403509e-05, | |
| "loss": 1.4812, | |
| "mean_token_accuracy": 0.6419821918010712, | |
| "num_tokens": 5407325.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 1.5653569459915162, | |
| "epoch": 0.48582995951417, | |
| "grad_norm": 0.2867957353591919, | |
| "learning_rate": 1.6764507422402162e-05, | |
| "loss": 1.6079, | |
| "mean_token_accuracy": 0.6316430389881134, | |
| "num_tokens": 5460946.0, | |
| "step": 960 | |
| }, | |
| { | |
| "entropy": 1.476916539669037, | |
| "epoch": 0.4908906882591093, | |
| "grad_norm": 0.30787891149520874, | |
| "learning_rate": 1.673076923076923e-05, | |
| "loss": 1.48, | |
| "mean_token_accuracy": 0.6418763399124146, | |
| "num_tokens": 5521311.0, | |
| "step": 970 | |
| }, | |
| { | |
| "entropy": 1.521777379512787, | |
| "epoch": 0.4959514170040486, | |
| "grad_norm": 0.22446390986442566, | |
| "learning_rate": 1.6697031039136305e-05, | |
| "loss": 1.5293, | |
| "mean_token_accuracy": 0.6305320382118225, | |
| "num_tokens": 5586273.0, | |
| "step": 980 | |
| }, | |
| { | |
| "entropy": 1.5841980934143067, | |
| "epoch": 0.5010121457489879, | |
| "grad_norm": 0.24676790833473206, | |
| "learning_rate": 1.6663292847503377e-05, | |
| "loss": 1.6064, | |
| "mean_token_accuracy": 0.6247429788112641, | |
| "num_tokens": 5645607.0, | |
| "step": 990 | |
| }, | |
| { | |
| "entropy": 1.5454851269721985, | |
| "epoch": 0.5060728744939271, | |
| "grad_norm": 0.2086755633354187, | |
| "learning_rate": 1.6629554655870447e-05, | |
| "loss": 1.5715, | |
| "mean_token_accuracy": 0.6307513952255249, | |
| "num_tokens": 5703782.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "entropy": 1.479654276371002, | |
| "epoch": 0.5111336032388664, | |
| "grad_norm": 0.21369728446006775, | |
| "learning_rate": 1.6595816464237517e-05, | |
| "loss": 1.4857, | |
| "mean_token_accuracy": 0.6503060281276702, | |
| "num_tokens": 5756050.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "entropy": 1.5413155913352967, | |
| "epoch": 0.5161943319838057, | |
| "grad_norm": 0.29068905115127563, | |
| "learning_rate": 1.656207827260459e-05, | |
| "loss": 1.5625, | |
| "mean_token_accuracy": 0.6323202788829804, | |
| "num_tokens": 5813575.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "entropy": 1.5284120678901671, | |
| "epoch": 0.521255060728745, | |
| "grad_norm": 0.26866260170936584, | |
| "learning_rate": 1.6528340080971662e-05, | |
| "loss": 1.5296, | |
| "mean_token_accuracy": 0.6371418595314026, | |
| "num_tokens": 5868292.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "entropy": 1.5233107686042786, | |
| "epoch": 0.5263157894736842, | |
| "grad_norm": 0.2544384300708771, | |
| "learning_rate": 1.6494601889338732e-05, | |
| "loss": 1.5192, | |
| "mean_token_accuracy": 0.629064416885376, | |
| "num_tokens": 5926581.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "entropy": 1.4819204211235046, | |
| "epoch": 0.5313765182186235, | |
| "grad_norm": 0.2691729962825775, | |
| "learning_rate": 1.6460863697705805e-05, | |
| "loss": 1.489, | |
| "mean_token_accuracy": 0.6453047692775726, | |
| "num_tokens": 5983604.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "entropy": 1.343429481983185, | |
| "epoch": 0.5364372469635628, | |
| "grad_norm": 0.21679846942424774, | |
| "learning_rate": 1.6427125506072878e-05, | |
| "loss": 1.34, | |
| "mean_token_accuracy": 0.669063252210617, | |
| "num_tokens": 6040931.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "entropy": 1.620318102836609, | |
| "epoch": 0.541497975708502, | |
| "grad_norm": 0.2846720516681671, | |
| "learning_rate": 1.6393387314439947e-05, | |
| "loss": 1.6464, | |
| "mean_token_accuracy": 0.625487893819809, | |
| "num_tokens": 6094196.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "entropy": 1.6185827255249023, | |
| "epoch": 0.5465587044534413, | |
| "grad_norm": 0.24272854626178741, | |
| "learning_rate": 1.635964912280702e-05, | |
| "loss": 1.6422, | |
| "mean_token_accuracy": 0.6269473850727081, | |
| "num_tokens": 6150350.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "entropy": 1.5225468039512635, | |
| "epoch": 0.5516194331983806, | |
| "grad_norm": 0.2274954468011856, | |
| "learning_rate": 1.632591093117409e-05, | |
| "loss": 1.5128, | |
| "mean_token_accuracy": 0.6347517490386962, | |
| "num_tokens": 6203671.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "entropy": 1.4914517521858215, | |
| "epoch": 0.5566801619433198, | |
| "grad_norm": 0.20096349716186523, | |
| "learning_rate": 1.6292172739541163e-05, | |
| "loss": 1.5056, | |
| "mean_token_accuracy": 0.6353028774261474, | |
| "num_tokens": 6264849.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "entropy": 1.5485971808433532, | |
| "epoch": 0.5617408906882592, | |
| "grad_norm": 0.24010322988033295, | |
| "learning_rate": 1.6258434547908232e-05, | |
| "loss": 1.5398, | |
| "mean_token_accuracy": 0.6348303139209748, | |
| "num_tokens": 6321385.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "entropy": 1.5580425620079041, | |
| "epoch": 0.5668016194331984, | |
| "grad_norm": 0.21382348239421844, | |
| "learning_rate": 1.6224696356275305e-05, | |
| "loss": 1.5824, | |
| "mean_token_accuracy": 0.6257192850112915, | |
| "num_tokens": 6377498.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "entropy": 1.5573256254196166, | |
| "epoch": 0.5718623481781376, | |
| "grad_norm": 0.24488642811775208, | |
| "learning_rate": 1.6190958164642378e-05, | |
| "loss": 1.5628, | |
| "mean_token_accuracy": 0.6246356785297393, | |
| "num_tokens": 6432433.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "entropy": 1.4993282079696655, | |
| "epoch": 0.5769230769230769, | |
| "grad_norm": 0.2223263829946518, | |
| "learning_rate": 1.6157219973009447e-05, | |
| "loss": 1.5111, | |
| "mean_token_accuracy": 0.6382519125938415, | |
| "num_tokens": 6492547.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "entropy": 1.4915427923202516, | |
| "epoch": 0.5819838056680162, | |
| "grad_norm": 0.232344850897789, | |
| "learning_rate": 1.612348178137652e-05, | |
| "loss": 1.5079, | |
| "mean_token_accuracy": 0.6373468995094299, | |
| "num_tokens": 6545726.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "entropy": 1.561433982849121, | |
| "epoch": 0.5870445344129555, | |
| "grad_norm": 0.2586466073989868, | |
| "learning_rate": 1.6089743589743593e-05, | |
| "loss": 1.5638, | |
| "mean_token_accuracy": 0.6296638369560241, | |
| "num_tokens": 6606186.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "entropy": 1.4575641989707946, | |
| "epoch": 0.5921052631578947, | |
| "grad_norm": 0.23262882232666016, | |
| "learning_rate": 1.6056005398110663e-05, | |
| "loss": 1.4734, | |
| "mean_token_accuracy": 0.646199643611908, | |
| "num_tokens": 6666588.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "entropy": 1.5205667972564698, | |
| "epoch": 0.597165991902834, | |
| "grad_norm": 0.2673611044883728, | |
| "learning_rate": 1.6022267206477736e-05, | |
| "loss": 1.5302, | |
| "mean_token_accuracy": 0.6332932889461518, | |
| "num_tokens": 6728351.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "entropy": 1.5646514296531677, | |
| "epoch": 0.6022267206477733, | |
| "grad_norm": 0.24620375037193298, | |
| "learning_rate": 1.5988529014844805e-05, | |
| "loss": 1.5848, | |
| "mean_token_accuracy": 0.6307655155658722, | |
| "num_tokens": 6789626.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "entropy": 1.597201144695282, | |
| "epoch": 0.6072874493927125, | |
| "grad_norm": 0.28606894612312317, | |
| "learning_rate": 1.5954790823211878e-05, | |
| "loss": 1.5779, | |
| "mean_token_accuracy": 0.6305352866649627, | |
| "num_tokens": 6840612.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "entropy": 1.6090473532676697, | |
| "epoch": 0.6123481781376519, | |
| "grad_norm": 0.26432231068611145, | |
| "learning_rate": 1.5921052631578948e-05, | |
| "loss": 1.6361, | |
| "mean_token_accuracy": 0.6166266143321991, | |
| "num_tokens": 6898491.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "entropy": 1.5265617489814758, | |
| "epoch": 0.6174089068825911, | |
| "grad_norm": 0.24568380415439606, | |
| "learning_rate": 1.588731443994602e-05, | |
| "loss": 1.5239, | |
| "mean_token_accuracy": 0.6334243714809418, | |
| "num_tokens": 6953682.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "entropy": 1.5580573081970215, | |
| "epoch": 0.6224696356275303, | |
| "grad_norm": 0.2606264650821686, | |
| "learning_rate": 1.5853576248313093e-05, | |
| "loss": 1.5652, | |
| "mean_token_accuracy": 0.6310077726840972, | |
| "num_tokens": 7009187.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "entropy": 1.5315574645996093, | |
| "epoch": 0.6275303643724697, | |
| "grad_norm": 0.23248089849948883, | |
| "learning_rate": 1.5819838056680163e-05, | |
| "loss": 1.5515, | |
| "mean_token_accuracy": 0.635067343711853, | |
| "num_tokens": 7069568.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "entropy": 1.4524394154548645, | |
| "epoch": 0.6325910931174089, | |
| "grad_norm": 0.20559658110141754, | |
| "learning_rate": 1.5786099865047236e-05, | |
| "loss": 1.4655, | |
| "mean_token_accuracy": 0.644383716583252, | |
| "num_tokens": 7132908.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "entropy": 1.5834705471992492, | |
| "epoch": 0.6376518218623481, | |
| "grad_norm": 0.2312365472316742, | |
| "learning_rate": 1.5752361673414305e-05, | |
| "loss": 1.6107, | |
| "mean_token_accuracy": 0.6216763257980347, | |
| "num_tokens": 7193948.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "entropy": 1.5223916292190551, | |
| "epoch": 0.6427125506072875, | |
| "grad_norm": 0.302206426858902, | |
| "learning_rate": 1.5718623481781378e-05, | |
| "loss": 1.5347, | |
| "mean_token_accuracy": 0.6370865941047669, | |
| "num_tokens": 7249715.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "entropy": 1.68130704164505, | |
| "epoch": 0.6477732793522267, | |
| "grad_norm": 0.24234986305236816, | |
| "learning_rate": 1.5684885290148448e-05, | |
| "loss": 1.703, | |
| "mean_token_accuracy": 0.6063290297985077, | |
| "num_tokens": 7306114.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "entropy": 1.6868065476417542, | |
| "epoch": 0.652834008097166, | |
| "grad_norm": 0.2558751702308655, | |
| "learning_rate": 1.565114709851552e-05, | |
| "loss": 1.7002, | |
| "mean_token_accuracy": 0.6132429718971253, | |
| "num_tokens": 7363295.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "entropy": 1.4368155479431153, | |
| "epoch": 0.6578947368421053, | |
| "grad_norm": 0.3175618350505829, | |
| "learning_rate": 1.561740890688259e-05, | |
| "loss": 1.4368, | |
| "mean_token_accuracy": 0.6544794201850891, | |
| "num_tokens": 7415420.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "entropy": 1.5615759491920471, | |
| "epoch": 0.6629554655870445, | |
| "grad_norm": 0.2953908443450928, | |
| "learning_rate": 1.5583670715249663e-05, | |
| "loss": 1.5617, | |
| "mean_token_accuracy": 0.6310927093029022, | |
| "num_tokens": 7475642.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "entropy": 1.4271193981170653, | |
| "epoch": 0.6680161943319838, | |
| "grad_norm": 0.24695925414562225, | |
| "learning_rate": 1.5549932523616736e-05, | |
| "loss": 1.4189, | |
| "mean_token_accuracy": 0.6522926926612854, | |
| "num_tokens": 7538029.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "entropy": 1.5500613093376159, | |
| "epoch": 0.6730769230769231, | |
| "grad_norm": 0.2324494868516922, | |
| "learning_rate": 1.5516194331983806e-05, | |
| "loss": 1.5641, | |
| "mean_token_accuracy": 0.626498419046402, | |
| "num_tokens": 7597460.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "entropy": 1.476065456867218, | |
| "epoch": 0.6781376518218624, | |
| "grad_norm": 0.2418016493320465, | |
| "learning_rate": 1.548245614035088e-05, | |
| "loss": 1.4751, | |
| "mean_token_accuracy": 0.641443008184433, | |
| "num_tokens": 7652792.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "entropy": 1.5325765252113341, | |
| "epoch": 0.6831983805668016, | |
| "grad_norm": 0.23513104021549225, | |
| "learning_rate": 1.544871794871795e-05, | |
| "loss": 1.5499, | |
| "mean_token_accuracy": 0.6278112173080445, | |
| "num_tokens": 7706166.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "entropy": 1.5952306509017944, | |
| "epoch": 0.6882591093117408, | |
| "grad_norm": 0.22960874438285828, | |
| "learning_rate": 1.541497975708502e-05, | |
| "loss": 1.6124, | |
| "mean_token_accuracy": 0.623203706741333, | |
| "num_tokens": 7762524.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "entropy": 1.4605698585510254, | |
| "epoch": 0.6933198380566802, | |
| "grad_norm": 0.2283059060573578, | |
| "learning_rate": 1.5381241565452094e-05, | |
| "loss": 1.4702, | |
| "mean_token_accuracy": 0.6456966698169708, | |
| "num_tokens": 7816597.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "entropy": 1.3722566485404968, | |
| "epoch": 0.6983805668016194, | |
| "grad_norm": 0.24912376701831818, | |
| "learning_rate": 1.5347503373819163e-05, | |
| "loss": 1.3777, | |
| "mean_token_accuracy": 0.6624338209629059, | |
| "num_tokens": 7878907.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "entropy": 1.5658705353736877, | |
| "epoch": 0.7034412955465587, | |
| "grad_norm": 0.26213786005973816, | |
| "learning_rate": 1.5313765182186236e-05, | |
| "loss": 1.5614, | |
| "mean_token_accuracy": 0.6281410813331604, | |
| "num_tokens": 7931234.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "entropy": 1.4248001098632812, | |
| "epoch": 0.708502024291498, | |
| "grad_norm": 0.3189115822315216, | |
| "learning_rate": 1.5280026990553306e-05, | |
| "loss": 1.4343, | |
| "mean_token_accuracy": 0.6542839646339417, | |
| "num_tokens": 7983537.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "entropy": 1.499564802646637, | |
| "epoch": 0.7135627530364372, | |
| "grad_norm": 0.24217011034488678, | |
| "learning_rate": 1.5246288798920379e-05, | |
| "loss": 1.5238, | |
| "mean_token_accuracy": 0.6327670216560364, | |
| "num_tokens": 8039434.0, | |
| "step": 1410 | |
| }, | |
| { | |
| "entropy": 1.5084555625915528, | |
| "epoch": 0.7186234817813765, | |
| "grad_norm": 0.21525943279266357, | |
| "learning_rate": 1.521255060728745e-05, | |
| "loss": 1.5051, | |
| "mean_token_accuracy": 0.6452975988388061, | |
| "num_tokens": 8095407.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "entropy": 1.5463826656341553, | |
| "epoch": 0.7236842105263158, | |
| "grad_norm": 0.25616827607154846, | |
| "learning_rate": 1.5178812415654523e-05, | |
| "loss": 1.5526, | |
| "mean_token_accuracy": 0.6282021820545196, | |
| "num_tokens": 8150109.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "entropy": 1.6857656121253968, | |
| "epoch": 0.728744939271255, | |
| "grad_norm": 0.25321727991104126, | |
| "learning_rate": 1.5145074224021594e-05, | |
| "loss": 1.7184, | |
| "mean_token_accuracy": 0.6112756371498108, | |
| "num_tokens": 8214438.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "entropy": 1.5806215167045594, | |
| "epoch": 0.7338056680161943, | |
| "grad_norm": 0.21112073957920074, | |
| "learning_rate": 1.5111336032388665e-05, | |
| "loss": 1.5852, | |
| "mean_token_accuracy": 0.6202045798301696, | |
| "num_tokens": 8273743.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "entropy": 1.4979040026664734, | |
| "epoch": 0.7388663967611336, | |
| "grad_norm": 0.22126545011997223, | |
| "learning_rate": 1.5077597840755738e-05, | |
| "loss": 1.5201, | |
| "mean_token_accuracy": 0.637889975309372, | |
| "num_tokens": 8334507.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "entropy": 1.5463458061218263, | |
| "epoch": 0.7439271255060729, | |
| "grad_norm": 0.22952505946159363, | |
| "learning_rate": 1.5043859649122808e-05, | |
| "loss": 1.5498, | |
| "mean_token_accuracy": 0.6345071375370026, | |
| "num_tokens": 8390405.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "entropy": 1.423577868938446, | |
| "epoch": 0.7489878542510121, | |
| "grad_norm": 0.2474886029958725, | |
| "learning_rate": 1.5010121457489879e-05, | |
| "loss": 1.4306, | |
| "mean_token_accuracy": 0.655298399925232, | |
| "num_tokens": 8452056.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "entropy": 1.5899730324745178, | |
| "epoch": 0.7540485829959515, | |
| "grad_norm": 0.2736392021179199, | |
| "learning_rate": 1.497638326585695e-05, | |
| "loss": 1.581, | |
| "mean_token_accuracy": 0.6186748504638672, | |
| "num_tokens": 8511999.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "entropy": 1.5433414101600647, | |
| "epoch": 0.7591093117408907, | |
| "grad_norm": 0.2836778163909912, | |
| "learning_rate": 1.4942645074224023e-05, | |
| "loss": 1.5544, | |
| "mean_token_accuracy": 0.6286308348178864, | |
| "num_tokens": 8566021.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "entropy": 1.4887511134147644, | |
| "epoch": 0.7641700404858299, | |
| "grad_norm": 0.33601313829421997, | |
| "learning_rate": 1.4908906882591094e-05, | |
| "loss": 1.4994, | |
| "mean_token_accuracy": 0.6406654596328736, | |
| "num_tokens": 8622757.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "entropy": 1.5212846279144288, | |
| "epoch": 0.7692307692307693, | |
| "grad_norm": 0.2853647470474243, | |
| "learning_rate": 1.4875168690958165e-05, | |
| "loss": 1.5409, | |
| "mean_token_accuracy": 0.6337429225444794, | |
| "num_tokens": 8677777.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "entropy": 1.4735643148422242, | |
| "epoch": 0.7742914979757085, | |
| "grad_norm": 0.2369018942117691, | |
| "learning_rate": 1.4841430499325238e-05, | |
| "loss": 1.4812, | |
| "mean_token_accuracy": 0.6412514448165894, | |
| "num_tokens": 8735792.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "entropy": 1.5245864272117615, | |
| "epoch": 0.7793522267206477, | |
| "grad_norm": 0.2317512333393097, | |
| "learning_rate": 1.480769230769231e-05, | |
| "loss": 1.5354, | |
| "mean_token_accuracy": 0.6362193703651429, | |
| "num_tokens": 8795324.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "entropy": 1.487471914291382, | |
| "epoch": 0.7844129554655871, | |
| "grad_norm": 0.24812865257263184, | |
| "learning_rate": 1.477395411605938e-05, | |
| "loss": 1.487, | |
| "mean_token_accuracy": 0.6461592555046082, | |
| "num_tokens": 8848190.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "entropy": 1.446857714653015, | |
| "epoch": 0.7894736842105263, | |
| "grad_norm": 0.23715689778327942, | |
| "learning_rate": 1.474021592442645e-05, | |
| "loss": 1.4494, | |
| "mean_token_accuracy": 0.654287850856781, | |
| "num_tokens": 8900078.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "entropy": 1.6414281487464906, | |
| "epoch": 0.7945344129554656, | |
| "grad_norm": 0.26817786693573, | |
| "learning_rate": 1.4706477732793523e-05, | |
| "loss": 1.6536, | |
| "mean_token_accuracy": 0.6186295211315155, | |
| "num_tokens": 8955471.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "entropy": 1.5608402729034423, | |
| "epoch": 0.7995951417004049, | |
| "grad_norm": 0.2652844190597534, | |
| "learning_rate": 1.4672739541160594e-05, | |
| "loss": 1.5787, | |
| "mean_token_accuracy": 0.62896608710289, | |
| "num_tokens": 9013912.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "entropy": 1.5290770292282105, | |
| "epoch": 0.8046558704453441, | |
| "grad_norm": 0.25053921341896057, | |
| "learning_rate": 1.4639001349527666e-05, | |
| "loss": 1.543, | |
| "mean_token_accuracy": 0.6325620353221894, | |
| "num_tokens": 9073236.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "entropy": 1.473749542236328, | |
| "epoch": 0.8097165991902834, | |
| "grad_norm": 0.2638007402420044, | |
| "learning_rate": 1.4605263157894739e-05, | |
| "loss": 1.4962, | |
| "mean_token_accuracy": 0.64018235206604, | |
| "num_tokens": 9130345.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "entropy": 1.4807411432266235, | |
| "epoch": 0.8147773279352226, | |
| "grad_norm": 0.2131456434726715, | |
| "learning_rate": 1.457152496626181e-05, | |
| "loss": 1.4896, | |
| "mean_token_accuracy": 0.6396925866603851, | |
| "num_tokens": 9181695.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "entropy": 1.4747131943702698, | |
| "epoch": 0.819838056680162, | |
| "grad_norm": 0.25145605206489563, | |
| "learning_rate": 1.4537786774628881e-05, | |
| "loss": 1.4513, | |
| "mean_token_accuracy": 0.6473784625530243, | |
| "num_tokens": 9237367.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "entropy": 1.5602935075759887, | |
| "epoch": 0.8248987854251012, | |
| "grad_norm": 0.24879582226276398, | |
| "learning_rate": 1.4504048582995954e-05, | |
| "loss": 1.565, | |
| "mean_token_accuracy": 0.6289263606071472, | |
| "num_tokens": 9302101.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "entropy": 1.4359328031539917, | |
| "epoch": 0.8299595141700404, | |
| "grad_norm": 0.21965323388576508, | |
| "learning_rate": 1.4470310391363025e-05, | |
| "loss": 1.4408, | |
| "mean_token_accuracy": 0.6550322711467743, | |
| "num_tokens": 9361115.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "entropy": 1.5193968892097474, | |
| "epoch": 0.8350202429149798, | |
| "grad_norm": 0.27555471658706665, | |
| "learning_rate": 1.4436572199730096e-05, | |
| "loss": 1.5173, | |
| "mean_token_accuracy": 0.6335371434688568, | |
| "num_tokens": 9417109.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "entropy": 1.5528843998908997, | |
| "epoch": 0.840080971659919, | |
| "grad_norm": 0.2689385414123535, | |
| "learning_rate": 1.4402834008097166e-05, | |
| "loss": 1.5668, | |
| "mean_token_accuracy": 0.6325760573148728, | |
| "num_tokens": 9473473.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "entropy": 1.4268815875053407, | |
| "epoch": 0.8451417004048583, | |
| "grad_norm": 0.3029450476169586, | |
| "learning_rate": 1.4369095816464239e-05, | |
| "loss": 1.4197, | |
| "mean_token_accuracy": 0.6514874160289764, | |
| "num_tokens": 9530575.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "entropy": 1.4315476655960082, | |
| "epoch": 0.8502024291497976, | |
| "grad_norm": 0.24891141057014465, | |
| "learning_rate": 1.433535762483131e-05, | |
| "loss": 1.4228, | |
| "mean_token_accuracy": 0.656501293182373, | |
| "num_tokens": 9590931.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "entropy": 1.5360273122787476, | |
| "epoch": 0.8552631578947368, | |
| "grad_norm": 0.30486878752708435, | |
| "learning_rate": 1.4301619433198381e-05, | |
| "loss": 1.5474, | |
| "mean_token_accuracy": 0.6348777890205384, | |
| "num_tokens": 9644652.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "entropy": 1.6101372838020325, | |
| "epoch": 0.8603238866396761, | |
| "grad_norm": 0.23739294707775116, | |
| "learning_rate": 1.4267881241565454e-05, | |
| "loss": 1.6222, | |
| "mean_token_accuracy": 0.6213286280632019, | |
| "num_tokens": 9697296.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "entropy": 1.56304851770401, | |
| "epoch": 0.8653846153846154, | |
| "grad_norm": 0.2499363124370575, | |
| "learning_rate": 1.4234143049932525e-05, | |
| "loss": 1.5642, | |
| "mean_token_accuracy": 0.6282478511333466, | |
| "num_tokens": 9755265.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "entropy": 1.4945539951324462, | |
| "epoch": 0.8704453441295547, | |
| "grad_norm": 0.24991373717784882, | |
| "learning_rate": 1.4200404858299596e-05, | |
| "loss": 1.5336, | |
| "mean_token_accuracy": 0.6350914716720581, | |
| "num_tokens": 9815899.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "entropy": 1.5779843926429749, | |
| "epoch": 0.8755060728744939, | |
| "grad_norm": 0.24115176498889923, | |
| "learning_rate": 1.416666666666667e-05, | |
| "loss": 1.5933, | |
| "mean_token_accuracy": 0.6283825278282166, | |
| "num_tokens": 9872513.0, | |
| "step": 1730 | |
| }, | |
| { | |
| "entropy": 1.4204454302787781, | |
| "epoch": 0.8805668016194332, | |
| "grad_norm": 0.22373662889003754, | |
| "learning_rate": 1.413292847503374e-05, | |
| "loss": 1.4136, | |
| "mean_token_accuracy": 0.6557290494441986, | |
| "num_tokens": 9932083.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "entropy": 1.636140561103821, | |
| "epoch": 0.8856275303643725, | |
| "grad_norm": 0.29674816131591797, | |
| "learning_rate": 1.409919028340081e-05, | |
| "loss": 1.662, | |
| "mean_token_accuracy": 0.6216094970703125, | |
| "num_tokens": 9988494.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "entropy": 1.549510085582733, | |
| "epoch": 0.8906882591093117, | |
| "grad_norm": 0.24920591711997986, | |
| "learning_rate": 1.4065452091767881e-05, | |
| "loss": 1.5553, | |
| "mean_token_accuracy": 0.6351737916469574, | |
| "num_tokens": 10041605.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "entropy": 1.5425897359848022, | |
| "epoch": 0.895748987854251, | |
| "grad_norm": 0.2719487249851227, | |
| "learning_rate": 1.4031713900134953e-05, | |
| "loss": 1.5457, | |
| "mean_token_accuracy": 0.6341227173805237, | |
| "num_tokens": 10097471.0, | |
| "step": 1770 | |
| }, | |
| { | |
| "entropy": 1.5892379999160766, | |
| "epoch": 0.9008097165991903, | |
| "grad_norm": 0.26108458638191223, | |
| "learning_rate": 1.3997975708502025e-05, | |
| "loss": 1.5846, | |
| "mean_token_accuracy": 0.6257834196090698, | |
| "num_tokens": 10157839.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "entropy": 1.5164817094802856, | |
| "epoch": 0.9058704453441295, | |
| "grad_norm": 0.255862295627594, | |
| "learning_rate": 1.3964237516869097e-05, | |
| "loss": 1.5325, | |
| "mean_token_accuracy": 0.6302552342414856, | |
| "num_tokens": 10215568.0, | |
| "step": 1790 | |
| }, | |
| { | |
| "entropy": 1.5202425956726073, | |
| "epoch": 0.9109311740890689, | |
| "grad_norm": 0.2746359705924988, | |
| "learning_rate": 1.3930499325236168e-05, | |
| "loss": 1.5264, | |
| "mean_token_accuracy": 0.6395917236804962, | |
| "num_tokens": 10277752.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "entropy": 1.5994849681854248, | |
| "epoch": 0.9159919028340081, | |
| "grad_norm": 0.259244441986084, | |
| "learning_rate": 1.389676113360324e-05, | |
| "loss": 1.6126, | |
| "mean_token_accuracy": 0.6206628024578095, | |
| "num_tokens": 10332436.0, | |
| "step": 1810 | |
| }, | |
| { | |
| "entropy": 1.5928335905075073, | |
| "epoch": 0.9210526315789473, | |
| "grad_norm": 0.30553993582725525, | |
| "learning_rate": 1.3863022941970312e-05, | |
| "loss": 1.604, | |
| "mean_token_accuracy": 0.6238301634788513, | |
| "num_tokens": 10385660.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "entropy": 1.5503159523010255, | |
| "epoch": 0.9261133603238867, | |
| "grad_norm": 0.2695212662220001, | |
| "learning_rate": 1.3829284750337383e-05, | |
| "loss": 1.5727, | |
| "mean_token_accuracy": 0.6283754229545593, | |
| "num_tokens": 10440034.0, | |
| "step": 1830 | |
| }, | |
| { | |
| "entropy": 1.472425067424774, | |
| "epoch": 0.9311740890688259, | |
| "grad_norm": 0.26096370816230774, | |
| "learning_rate": 1.3795546558704453e-05, | |
| "loss": 1.4744, | |
| "mean_token_accuracy": 0.6468591213226318, | |
| "num_tokens": 10495586.0, | |
| "step": 1840 | |
| }, | |
| { | |
| "entropy": 1.4272591471672058, | |
| "epoch": 0.9362348178137652, | |
| "grad_norm": 0.2956947088241577, | |
| "learning_rate": 1.3761808367071526e-05, | |
| "loss": 1.4446, | |
| "mean_token_accuracy": 0.6488463521003723, | |
| "num_tokens": 10546414.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "entropy": 1.4026084661483764, | |
| "epoch": 0.9412955465587044, | |
| "grad_norm": 0.24682804942131042, | |
| "learning_rate": 1.3728070175438597e-05, | |
| "loss": 1.3906, | |
| "mean_token_accuracy": 0.6536332130432129, | |
| "num_tokens": 10603382.0, | |
| "step": 1860 | |
| }, | |
| { | |
| "entropy": 1.585541033744812, | |
| "epoch": 0.9463562753036437, | |
| "grad_norm": 0.28304097056388855, | |
| "learning_rate": 1.3694331983805668e-05, | |
| "loss": 1.5972, | |
| "mean_token_accuracy": 0.6255220711231232, | |
| "num_tokens": 10666030.0, | |
| "step": 1870 | |
| }, | |
| { | |
| "entropy": 1.580546224117279, | |
| "epoch": 0.951417004048583, | |
| "grad_norm": 0.2616841793060303, | |
| "learning_rate": 1.3660593792172741e-05, | |
| "loss": 1.6051, | |
| "mean_token_accuracy": 0.6219939827919007, | |
| "num_tokens": 10725741.0, | |
| "step": 1880 | |
| }, | |
| { | |
| "entropy": 1.6499082326889039, | |
| "epoch": 0.9564777327935222, | |
| "grad_norm": 0.2620835304260254, | |
| "learning_rate": 1.3626855600539812e-05, | |
| "loss": 1.6969, | |
| "mean_token_accuracy": 0.6166241288185119, | |
| "num_tokens": 10787880.0, | |
| "step": 1890 | |
| }, | |
| { | |
| "entropy": 1.3888215899467469, | |
| "epoch": 0.9615384615384616, | |
| "grad_norm": 0.2680383324623108, | |
| "learning_rate": 1.3593117408906883e-05, | |
| "loss": 1.3917, | |
| "mean_token_accuracy": 0.6527835667133332, | |
| "num_tokens": 10844894.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "entropy": 1.3836533963680266, | |
| "epoch": 0.9665991902834008, | |
| "grad_norm": 0.35761716961860657, | |
| "learning_rate": 1.3559379217273956e-05, | |
| "loss": 1.3895, | |
| "mean_token_accuracy": 0.6636650562286377, | |
| "num_tokens": 10900721.0, | |
| "step": 1910 | |
| }, | |
| { | |
| "entropy": 1.451544201374054, | |
| "epoch": 0.97165991902834, | |
| "grad_norm": 0.26495417952537537, | |
| "learning_rate": 1.3525641025641028e-05, | |
| "loss": 1.447, | |
| "mean_token_accuracy": 0.6403470158576965, | |
| "num_tokens": 10951838.0, | |
| "step": 1920 | |
| }, | |
| { | |
| "entropy": 1.5138379335403442, | |
| "epoch": 0.9767206477732794, | |
| "grad_norm": 0.23315957188606262, | |
| "learning_rate": 1.3491902834008099e-05, | |
| "loss": 1.5385, | |
| "mean_token_accuracy": 0.6303693652153015, | |
| "num_tokens": 11010569.0, | |
| "step": 1930 | |
| }, | |
| { | |
| "entropy": 1.5039880394935607, | |
| "epoch": 0.9817813765182186, | |
| "grad_norm": 0.26653018593788147, | |
| "learning_rate": 1.3458164642375168e-05, | |
| "loss": 1.515, | |
| "mean_token_accuracy": 0.6446199715137482, | |
| "num_tokens": 11068307.0, | |
| "step": 1940 | |
| }, | |
| { | |
| "entropy": 1.5039002180099488, | |
| "epoch": 0.9868421052631579, | |
| "grad_norm": 0.24144147336483002, | |
| "learning_rate": 1.3424426450742241e-05, | |
| "loss": 1.5012, | |
| "mean_token_accuracy": 0.6414350152015686, | |
| "num_tokens": 11131378.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "entropy": 1.5108654856681825, | |
| "epoch": 0.9919028340080972, | |
| "grad_norm": 0.33613070845603943, | |
| "learning_rate": 1.3390688259109312e-05, | |
| "loss": 1.5229, | |
| "mean_token_accuracy": 0.6330624580383301, | |
| "num_tokens": 11189667.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "entropy": 1.43316547870636, | |
| "epoch": 0.9969635627530364, | |
| "grad_norm": 0.27450039982795715, | |
| "learning_rate": 1.3356950067476384e-05, | |
| "loss": 1.4358, | |
| "mean_token_accuracy": 0.6528611719608307, | |
| "num_tokens": 11248100.0, | |
| "step": 1970 | |
| }, | |
| { | |
| "entropy": 1.5782551288604736, | |
| "epoch": 1.0020242914979758, | |
| "grad_norm": 0.2942919433116913, | |
| "learning_rate": 1.3323211875843457e-05, | |
| "loss": 1.5945, | |
| "mean_token_accuracy": 0.622716897726059, | |
| "num_tokens": 11301434.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "entropy": 1.5513013124465942, | |
| "epoch": 1.007085020242915, | |
| "grad_norm": 0.4627493619918823, | |
| "learning_rate": 1.3289473684210528e-05, | |
| "loss": 1.5645, | |
| "mean_token_accuracy": 0.6323555290699006, | |
| "num_tokens": 11357709.0, | |
| "step": 1990 | |
| }, | |
| { | |
| "entropy": 1.5218539357185363, | |
| "epoch": 1.0121457489878543, | |
| "grad_norm": 0.29789215326309204, | |
| "learning_rate": 1.3255735492577599e-05, | |
| "loss": 1.5296, | |
| "mean_token_accuracy": 0.6385591834783554, | |
| "num_tokens": 11409081.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "entropy": 1.5782111883163452, | |
| "epoch": 1.0172064777327936, | |
| "grad_norm": 0.3623863458633423, | |
| "learning_rate": 1.3221997300944672e-05, | |
| "loss": 1.5815, | |
| "mean_token_accuracy": 0.6234244406223297, | |
| "num_tokens": 11461701.0, | |
| "step": 2010 | |
| }, | |
| { | |
| "entropy": 1.478236198425293, | |
| "epoch": 1.0222672064777327, | |
| "grad_norm": 0.24126943945884705, | |
| "learning_rate": 1.3188259109311743e-05, | |
| "loss": 1.4773, | |
| "mean_token_accuracy": 0.6408190190792084, | |
| "num_tokens": 11522781.0, | |
| "step": 2020 | |
| }, | |
| { | |
| "entropy": 1.474450170993805, | |
| "epoch": 1.027327935222672, | |
| "grad_norm": 0.27630022168159485, | |
| "learning_rate": 1.3154520917678813e-05, | |
| "loss": 1.4777, | |
| "mean_token_accuracy": 0.6390757083892822, | |
| "num_tokens": 11577690.0, | |
| "step": 2030 | |
| }, | |
| { | |
| "entropy": 1.38731769323349, | |
| "epoch": 1.0323886639676114, | |
| "grad_norm": 0.2594892382621765, | |
| "learning_rate": 1.3120782726045884e-05, | |
| "loss": 1.4113, | |
| "mean_token_accuracy": 0.6555228769779206, | |
| "num_tokens": 11634378.0, | |
| "step": 2040 | |
| }, | |
| { | |
| "entropy": 1.4996397018432617, | |
| "epoch": 1.0374493927125505, | |
| "grad_norm": 0.29768475890159607, | |
| "learning_rate": 1.3087044534412957e-05, | |
| "loss": 1.5046, | |
| "mean_token_accuracy": 0.6385474681854248, | |
| "num_tokens": 11691197.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "entropy": 1.7156208992004394, | |
| "epoch": 1.04251012145749, | |
| "grad_norm": 0.30838677287101746, | |
| "learning_rate": 1.3053306342780028e-05, | |
| "loss": 1.7196, | |
| "mean_token_accuracy": 0.6081624507904053, | |
| "num_tokens": 11744812.0, | |
| "step": 2060 | |
| }, | |
| { | |
| "entropy": 1.4287778735160828, | |
| "epoch": 1.0475708502024292, | |
| "grad_norm": 0.30164098739624023, | |
| "learning_rate": 1.3019568151147099e-05, | |
| "loss": 1.4251, | |
| "mean_token_accuracy": 0.6514438152313232, | |
| "num_tokens": 11798182.0, | |
| "step": 2070 | |
| }, | |
| { | |
| "entropy": 1.6277110576629639, | |
| "epoch": 1.0526315789473684, | |
| "grad_norm": 0.27688923478126526, | |
| "learning_rate": 1.2985829959514172e-05, | |
| "loss": 1.637, | |
| "mean_token_accuracy": 0.6167466878890991, | |
| "num_tokens": 11853640.0, | |
| "step": 2080 | |
| }, | |
| { | |
| "entropy": 1.3780420899391175, | |
| "epoch": 1.0576923076923077, | |
| "grad_norm": 0.2407483607530594, | |
| "learning_rate": 1.2952091767881243e-05, | |
| "loss": 1.3775, | |
| "mean_token_accuracy": 0.6617866694927216, | |
| "num_tokens": 11909613.0, | |
| "step": 2090 | |
| }, | |
| { | |
| "entropy": 1.4581809163093566, | |
| "epoch": 1.062753036437247, | |
| "grad_norm": 0.3337167203426361, | |
| "learning_rate": 1.2918353576248314e-05, | |
| "loss": 1.4533, | |
| "mean_token_accuracy": 0.6537846267223358, | |
| "num_tokens": 11967740.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "entropy": 1.4676265239715576, | |
| "epoch": 1.0678137651821862, | |
| "grad_norm": 0.2601131796836853, | |
| "learning_rate": 1.2884615384615386e-05, | |
| "loss": 1.4607, | |
| "mean_token_accuracy": 0.6463825047016144, | |
| "num_tokens": 12020775.0, | |
| "step": 2110 | |
| }, | |
| { | |
| "entropy": 1.5144903063774109, | |
| "epoch": 1.0728744939271255, | |
| "grad_norm": 0.276044636964798, | |
| "learning_rate": 1.2850877192982459e-05, | |
| "loss": 1.5184, | |
| "mean_token_accuracy": 0.6339675188064575, | |
| "num_tokens": 12081273.0, | |
| "step": 2120 | |
| }, | |
| { | |
| "entropy": 1.5458029508590698, | |
| "epoch": 1.0779352226720649, | |
| "grad_norm": 0.3157075047492981, | |
| "learning_rate": 1.2817139001349528e-05, | |
| "loss": 1.5519, | |
| "mean_token_accuracy": 0.6369691550731659, | |
| "num_tokens": 12134672.0, | |
| "step": 2130 | |
| }, | |
| { | |
| "entropy": 1.4140147149562836, | |
| "epoch": 1.082995951417004, | |
| "grad_norm": 0.32847243547439575, | |
| "learning_rate": 1.27834008097166e-05, | |
| "loss": 1.4223, | |
| "mean_token_accuracy": 0.6571628749370575, | |
| "num_tokens": 12193683.0, | |
| "step": 2140 | |
| }, | |
| { | |
| "entropy": 1.5222583651542663, | |
| "epoch": 1.0880566801619433, | |
| "grad_norm": 0.2528051435947418, | |
| "learning_rate": 1.274966261808367e-05, | |
| "loss": 1.5229, | |
| "mean_token_accuracy": 0.6361405253410339, | |
| "num_tokens": 12249416.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "entropy": 1.5322677731513976, | |
| "epoch": 1.0931174089068827, | |
| "grad_norm": 0.25397226214408875, | |
| "learning_rate": 1.2715924426450743e-05, | |
| "loss": 1.5353, | |
| "mean_token_accuracy": 0.6307880222797394, | |
| "num_tokens": 12312488.0, | |
| "step": 2160 | |
| }, | |
| { | |
| "entropy": 1.4451451063156129, | |
| "epoch": 1.0981781376518218, | |
| "grad_norm": 0.3207351565361023, | |
| "learning_rate": 1.2682186234817815e-05, | |
| "loss": 1.4532, | |
| "mean_token_accuracy": 0.6479784369468689, | |
| "num_tokens": 12365397.0, | |
| "step": 2170 | |
| }, | |
| { | |
| "entropy": 1.6216472387313843, | |
| "epoch": 1.1032388663967612, | |
| "grad_norm": 0.22639265656471252, | |
| "learning_rate": 1.2648448043184886e-05, | |
| "loss": 1.6331, | |
| "mean_token_accuracy": 0.6182599663734436, | |
| "num_tokens": 12426280.0, | |
| "step": 2180 | |
| }, | |
| { | |
| "entropy": 1.417205023765564, | |
| "epoch": 1.1082995951417005, | |
| "grad_norm": 0.31163787841796875, | |
| "learning_rate": 1.2614709851551959e-05, | |
| "loss": 1.4197, | |
| "mean_token_accuracy": 0.6472279012203217, | |
| "num_tokens": 12481836.0, | |
| "step": 2190 | |
| }, | |
| { | |
| "entropy": 1.5859375596046448, | |
| "epoch": 1.1133603238866396, | |
| "grad_norm": 0.2581881582736969, | |
| "learning_rate": 1.258097165991903e-05, | |
| "loss": 1.5947, | |
| "mean_token_accuracy": 0.6293219923973083, | |
| "num_tokens": 12537386.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "entropy": 1.6218139290809632, | |
| "epoch": 1.118421052631579, | |
| "grad_norm": 0.27295926213264465, | |
| "learning_rate": 1.2547233468286101e-05, | |
| "loss": 1.6235, | |
| "mean_token_accuracy": 0.6221803069114685, | |
| "num_tokens": 12591068.0, | |
| "step": 2210 | |
| }, | |
| { | |
| "entropy": 1.5492971539497375, | |
| "epoch": 1.123481781376518, | |
| "grad_norm": 0.28580132126808167, | |
| "learning_rate": 1.251349527665317e-05, | |
| "loss": 1.5594, | |
| "mean_token_accuracy": 0.6253218352794647, | |
| "num_tokens": 12648353.0, | |
| "step": 2220 | |
| }, | |
| { | |
| "entropy": 1.649086058139801, | |
| "epoch": 1.1285425101214575, | |
| "grad_norm": 0.24511824548244476, | |
| "learning_rate": 1.2479757085020244e-05, | |
| "loss": 1.6621, | |
| "mean_token_accuracy": 0.6179795920848846, | |
| "num_tokens": 12700443.0, | |
| "step": 2230 | |
| }, | |
| { | |
| "entropy": 1.4533384203910829, | |
| "epoch": 1.1336032388663968, | |
| "grad_norm": 0.3033972382545471, | |
| "learning_rate": 1.2446018893387315e-05, | |
| "loss": 1.4451, | |
| "mean_token_accuracy": 0.6498919248580932, | |
| "num_tokens": 12748990.0, | |
| "step": 2240 | |
| }, | |
| { | |
| "entropy": 1.5436882257461548, | |
| "epoch": 1.1386639676113361, | |
| "grad_norm": 0.2811788022518158, | |
| "learning_rate": 1.2412280701754386e-05, | |
| "loss": 1.5508, | |
| "mean_token_accuracy": 0.6290224313735961, | |
| "num_tokens": 12807477.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "entropy": 1.448672115802765, | |
| "epoch": 1.1437246963562753, | |
| "grad_norm": 0.29944077134132385, | |
| "learning_rate": 1.2378542510121459e-05, | |
| "loss": 1.4598, | |
| "mean_token_accuracy": 0.6483164548873901, | |
| "num_tokens": 12869123.0, | |
| "step": 2260 | |
| }, | |
| { | |
| "entropy": 1.3786328792572022, | |
| "epoch": 1.1487854251012146, | |
| "grad_norm": 0.27392685413360596, | |
| "learning_rate": 1.234480431848853e-05, | |
| "loss": 1.3767, | |
| "mean_token_accuracy": 0.6597134828567505, | |
| "num_tokens": 12924572.0, | |
| "step": 2270 | |
| }, | |
| { | |
| "entropy": 1.5663957238197326, | |
| "epoch": 1.1538461538461537, | |
| "grad_norm": 0.3136812150478363, | |
| "learning_rate": 1.2311066126855601e-05, | |
| "loss": 1.5661, | |
| "mean_token_accuracy": 0.6283589959144592, | |
| "num_tokens": 12982715.0, | |
| "step": 2280 | |
| }, | |
| { | |
| "entropy": 1.4102508783340455, | |
| "epoch": 1.158906882591093, | |
| "grad_norm": 0.33586448431015015, | |
| "learning_rate": 1.2277327935222674e-05, | |
| "loss": 1.4242, | |
| "mean_token_accuracy": 0.6464997053146362, | |
| "num_tokens": 13035535.0, | |
| "step": 2290 | |
| }, | |
| { | |
| "entropy": 1.4415246963500976, | |
| "epoch": 1.1639676113360324, | |
| "grad_norm": 0.24208928644657135, | |
| "learning_rate": 1.2243589743589746e-05, | |
| "loss": 1.4572, | |
| "mean_token_accuracy": 0.6485124588012695, | |
| "num_tokens": 13098688.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "entropy": 1.490816557407379, | |
| "epoch": 1.1690283400809718, | |
| "grad_norm": 0.27268052101135254, | |
| "learning_rate": 1.2209851551956815e-05, | |
| "loss": 1.4841, | |
| "mean_token_accuracy": 0.6446694970130921, | |
| "num_tokens": 13155516.0, | |
| "step": 2310 | |
| }, | |
| { | |
| "entropy": 1.41130930185318, | |
| "epoch": 1.174089068825911, | |
| "grad_norm": 0.3298867642879486, | |
| "learning_rate": 1.2176113360323886e-05, | |
| "loss": 1.4114, | |
| "mean_token_accuracy": 0.6582064151763916, | |
| "num_tokens": 13208333.0, | |
| "step": 2320 | |
| }, | |
| { | |
| "entropy": 1.6078017115592957, | |
| "epoch": 1.1791497975708503, | |
| "grad_norm": 0.2950042188167572, | |
| "learning_rate": 1.214237516869096e-05, | |
| "loss": 1.6164, | |
| "mean_token_accuracy": 0.6207537829875946, | |
| "num_tokens": 13264796.0, | |
| "step": 2330 | |
| }, | |
| { | |
| "entropy": 1.500421929359436, | |
| "epoch": 1.1842105263157894, | |
| "grad_norm": 0.2659217417240143, | |
| "learning_rate": 1.210863697705803e-05, | |
| "loss": 1.5125, | |
| "mean_token_accuracy": 0.6368428528308868, | |
| "num_tokens": 13325761.0, | |
| "step": 2340 | |
| }, | |
| { | |
| "entropy": 1.511633825302124, | |
| "epoch": 1.1892712550607287, | |
| "grad_norm": 0.2882932722568512, | |
| "learning_rate": 1.2074898785425102e-05, | |
| "loss": 1.5265, | |
| "mean_token_accuracy": 0.6347347319126129, | |
| "num_tokens": 13381225.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "entropy": 1.4531208157539368, | |
| "epoch": 1.194331983805668, | |
| "grad_norm": 0.2595268487930298, | |
| "learning_rate": 1.2041160593792175e-05, | |
| "loss": 1.4615, | |
| "mean_token_accuracy": 0.6477943778038024, | |
| "num_tokens": 13443099.0, | |
| "step": 2360 | |
| }, | |
| { | |
| "entropy": 1.4483809113502502, | |
| "epoch": 1.1993927125506072, | |
| "grad_norm": 0.31083598732948303, | |
| "learning_rate": 1.2007422402159246e-05, | |
| "loss": 1.4345, | |
| "mean_token_accuracy": 0.6418466746807099, | |
| "num_tokens": 13492349.0, | |
| "step": 2370 | |
| }, | |
| { | |
| "entropy": 1.4612587809562683, | |
| "epoch": 1.2044534412955465, | |
| "grad_norm": 0.3023878037929535, | |
| "learning_rate": 1.1973684210526317e-05, | |
| "loss": 1.4644, | |
| "mean_token_accuracy": 0.6457450866699219, | |
| "num_tokens": 13553635.0, | |
| "step": 2380 | |
| }, | |
| { | |
| "entropy": 1.503815734386444, | |
| "epoch": 1.209514170040486, | |
| "grad_norm": 0.2668578326702118, | |
| "learning_rate": 1.193994601889339e-05, | |
| "loss": 1.5031, | |
| "mean_token_accuracy": 0.6367665946483612, | |
| "num_tokens": 13610860.0, | |
| "step": 2390 | |
| }, | |
| { | |
| "entropy": 1.5158751249313354, | |
| "epoch": 1.214574898785425, | |
| "grad_norm": 0.22731706500053406, | |
| "learning_rate": 1.1906207827260461e-05, | |
| "loss": 1.5166, | |
| "mean_token_accuracy": 0.6408901572227478, | |
| "num_tokens": 13671500.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "entropy": 1.4245959162712096, | |
| "epoch": 1.2196356275303644, | |
| "grad_norm": 0.23208104074001312, | |
| "learning_rate": 1.187246963562753e-05, | |
| "loss": 1.4395, | |
| "mean_token_accuracy": 0.650111585855484, | |
| "num_tokens": 13732700.0, | |
| "step": 2410 | |
| }, | |
| { | |
| "entropy": 1.5526673555374146, | |
| "epoch": 1.2246963562753037, | |
| "grad_norm": 0.3204510807991028, | |
| "learning_rate": 1.1838731443994602e-05, | |
| "loss": 1.5659, | |
| "mean_token_accuracy": 0.6272344350814819, | |
| "num_tokens": 13792638.0, | |
| "step": 2420 | |
| }, | |
| { | |
| "entropy": 1.4588525891304016, | |
| "epoch": 1.2297570850202428, | |
| "grad_norm": 0.2778925895690918, | |
| "learning_rate": 1.1804993252361675e-05, | |
| "loss": 1.4745, | |
| "mean_token_accuracy": 0.6453329682350158, | |
| "num_tokens": 13848701.0, | |
| "step": 2430 | |
| }, | |
| { | |
| "entropy": 1.3035455107688905, | |
| "epoch": 1.2348178137651822, | |
| "grad_norm": 0.26574888825416565, | |
| "learning_rate": 1.1771255060728746e-05, | |
| "loss": 1.3013, | |
| "mean_token_accuracy": 0.680269593000412, | |
| "num_tokens": 13903243.0, | |
| "step": 2440 | |
| }, | |
| { | |
| "entropy": 1.5677086472511292, | |
| "epoch": 1.2398785425101215, | |
| "grad_norm": 0.2806277573108673, | |
| "learning_rate": 1.1737516869095817e-05, | |
| "loss": 1.5653, | |
| "mean_token_accuracy": 0.6303077161312103, | |
| "num_tokens": 13962439.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "entropy": 1.4167581439018249, | |
| "epoch": 1.2449392712550607, | |
| "grad_norm": 0.2721521258354187, | |
| "learning_rate": 1.1703778677462888e-05, | |
| "loss": 1.4122, | |
| "mean_token_accuracy": 0.6505212604999542, | |
| "num_tokens": 14017529.0, | |
| "step": 2460 | |
| }, | |
| { | |
| "entropy": 1.5344619274139404, | |
| "epoch": 1.25, | |
| "grad_norm": 0.2629392445087433, | |
| "learning_rate": 1.1670040485829961e-05, | |
| "loss": 1.5489, | |
| "mean_token_accuracy": 0.6296425819396972, | |
| "num_tokens": 14074333.0, | |
| "step": 2470 | |
| }, | |
| { | |
| "entropy": 1.4288833916187287, | |
| "epoch": 1.2550607287449393, | |
| "grad_norm": 0.28045085072517395, | |
| "learning_rate": 1.1636302294197033e-05, | |
| "loss": 1.4332, | |
| "mean_token_accuracy": 0.6531016409397126, | |
| "num_tokens": 14131260.0, | |
| "step": 2480 | |
| }, | |
| { | |
| "entropy": 1.4341704964637756, | |
| "epoch": 1.2601214574898785, | |
| "grad_norm": 0.27869343757629395, | |
| "learning_rate": 1.1602564102564104e-05, | |
| "loss": 1.4245, | |
| "mean_token_accuracy": 0.6531503915786743, | |
| "num_tokens": 14187704.0, | |
| "step": 2490 | |
| }, | |
| { | |
| "entropy": 1.5492194533348083, | |
| "epoch": 1.2651821862348178, | |
| "grad_norm": 0.3610108494758606, | |
| "learning_rate": 1.1568825910931173e-05, | |
| "loss": 1.5493, | |
| "mean_token_accuracy": 0.6251341938972473, | |
| "num_tokens": 14244227.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "entropy": 1.4428314566612244, | |
| "epoch": 1.2702429149797572, | |
| "grad_norm": 0.2730664908885956, | |
| "learning_rate": 1.1535087719298246e-05, | |
| "loss": 1.4481, | |
| "mean_token_accuracy": 0.6439902603626251, | |
| "num_tokens": 14301363.0, | |
| "step": 2510 | |
| }, | |
| { | |
| "entropy": 1.6202573895454406, | |
| "epoch": 1.2753036437246963, | |
| "grad_norm": 0.2632329761981964, | |
| "learning_rate": 1.1501349527665317e-05, | |
| "loss": 1.6394, | |
| "mean_token_accuracy": 0.6166090041399002, | |
| "num_tokens": 14358360.0, | |
| "step": 2520 | |
| }, | |
| { | |
| "entropy": 1.4789348363876342, | |
| "epoch": 1.2803643724696356, | |
| "grad_norm": 0.31635069847106934, | |
| "learning_rate": 1.1467611336032389e-05, | |
| "loss": 1.4909, | |
| "mean_token_accuracy": 0.6398876368999481, | |
| "num_tokens": 14414169.0, | |
| "step": 2530 | |
| }, | |
| { | |
| "entropy": 1.5108978629112244, | |
| "epoch": 1.285425101214575, | |
| "grad_norm": 0.32884782552719116, | |
| "learning_rate": 1.1433873144399461e-05, | |
| "loss": 1.5177, | |
| "mean_token_accuracy": 0.6348686575889587, | |
| "num_tokens": 14475715.0, | |
| "step": 2540 | |
| }, | |
| { | |
| "entropy": 1.419902467727661, | |
| "epoch": 1.290485829959514, | |
| "grad_norm": 0.2587096095085144, | |
| "learning_rate": 1.1400134952766533e-05, | |
| "loss": 1.4162, | |
| "mean_token_accuracy": 0.6549311280250549, | |
| "num_tokens": 14534625.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "entropy": 1.4202989101409913, | |
| "epoch": 1.2955465587044535, | |
| "grad_norm": 0.3693634271621704, | |
| "learning_rate": 1.1366396761133604e-05, | |
| "loss": 1.4086, | |
| "mean_token_accuracy": 0.6512441515922547, | |
| "num_tokens": 14587225.0, | |
| "step": 2560 | |
| }, | |
| { | |
| "entropy": 1.646610152721405, | |
| "epoch": 1.3006072874493926, | |
| "grad_norm": 0.2674924433231354, | |
| "learning_rate": 1.1332658569500677e-05, | |
| "loss": 1.6561, | |
| "mean_token_accuracy": 0.6122437655925751, | |
| "num_tokens": 14645474.0, | |
| "step": 2570 | |
| }, | |
| { | |
| "entropy": 1.5521462559700012, | |
| "epoch": 1.305668016194332, | |
| "grad_norm": 0.2970985770225525, | |
| "learning_rate": 1.1298920377867748e-05, | |
| "loss": 1.5528, | |
| "mean_token_accuracy": 0.6343778431415558, | |
| "num_tokens": 14702700.0, | |
| "step": 2580 | |
| }, | |
| { | |
| "entropy": 1.556568205356598, | |
| "epoch": 1.3107287449392713, | |
| "grad_norm": 0.2645126283168793, | |
| "learning_rate": 1.1265182186234818e-05, | |
| "loss": 1.5629, | |
| "mean_token_accuracy": 0.6288919091224671, | |
| "num_tokens": 14757931.0, | |
| "step": 2590 | |
| }, | |
| { | |
| "entropy": 1.4472679018974304, | |
| "epoch": 1.3157894736842106, | |
| "grad_norm": 0.2335396409034729, | |
| "learning_rate": 1.1231443994601889e-05, | |
| "loss": 1.4551, | |
| "mean_token_accuracy": 0.6467409670352936, | |
| "num_tokens": 14814936.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "entropy": 1.5397544741630553, | |
| "epoch": 1.3208502024291497, | |
| "grad_norm": 0.2709454298019409, | |
| "learning_rate": 1.1197705802968962e-05, | |
| "loss": 1.5446, | |
| "mean_token_accuracy": 0.6299596786499023, | |
| "num_tokens": 14875733.0, | |
| "step": 2610 | |
| }, | |
| { | |
| "entropy": 1.4886646032333375, | |
| "epoch": 1.325910931174089, | |
| "grad_norm": 0.35333138704299927, | |
| "learning_rate": 1.1163967611336033e-05, | |
| "loss": 1.4863, | |
| "mean_token_accuracy": 0.63787921667099, | |
| "num_tokens": 14929765.0, | |
| "step": 2620 | |
| }, | |
| { | |
| "entropy": 1.453588593006134, | |
| "epoch": 1.3309716599190282, | |
| "grad_norm": 0.27809369564056396, | |
| "learning_rate": 1.1130229419703104e-05, | |
| "loss": 1.4697, | |
| "mean_token_accuracy": 0.6435807704925537, | |
| "num_tokens": 14990890.0, | |
| "step": 2630 | |
| }, | |
| { | |
| "entropy": 1.5616032361984253, | |
| "epoch": 1.3360323886639676, | |
| "grad_norm": 0.30011820793151855, | |
| "learning_rate": 1.1096491228070177e-05, | |
| "loss": 1.5712, | |
| "mean_token_accuracy": 0.6284253001213074, | |
| "num_tokens": 15051025.0, | |
| "step": 2640 | |
| }, | |
| { | |
| "entropy": 1.5883211970329285, | |
| "epoch": 1.341093117408907, | |
| "grad_norm": 0.2934761345386505, | |
| "learning_rate": 1.1062753036437248e-05, | |
| "loss": 1.5979, | |
| "mean_token_accuracy": 0.6245935201644898, | |
| "num_tokens": 15108003.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "entropy": 1.6321449398994445, | |
| "epoch": 1.3461538461538463, | |
| "grad_norm": 0.2740587890148163, | |
| "learning_rate": 1.102901484480432e-05, | |
| "loss": 1.6281, | |
| "mean_token_accuracy": 0.6189014375209808, | |
| "num_tokens": 15164177.0, | |
| "step": 2660 | |
| }, | |
| { | |
| "entropy": 1.5974322438240052, | |
| "epoch": 1.3512145748987854, | |
| "grad_norm": 0.26599040627479553, | |
| "learning_rate": 1.0995276653171392e-05, | |
| "loss": 1.5986, | |
| "mean_token_accuracy": 0.6264194548130035, | |
| "num_tokens": 15219699.0, | |
| "step": 2670 | |
| }, | |
| { | |
| "entropy": 1.6627001881599426, | |
| "epoch": 1.3562753036437247, | |
| "grad_norm": 0.35696741938591003, | |
| "learning_rate": 1.0961538461538464e-05, | |
| "loss": 1.6903, | |
| "mean_token_accuracy": 0.6077935576438904, | |
| "num_tokens": 15275379.0, | |
| "step": 2680 | |
| }, | |
| { | |
| "entropy": 1.4766412138938905, | |
| "epoch": 1.3613360323886639, | |
| "grad_norm": 0.32456570863723755, | |
| "learning_rate": 1.0927800269905533e-05, | |
| "loss": 1.473, | |
| "mean_token_accuracy": 0.6439019083976746, | |
| "num_tokens": 15334742.0, | |
| "step": 2690 | |
| }, | |
| { | |
| "entropy": 1.534727895259857, | |
| "epoch": 1.3663967611336032, | |
| "grad_norm": 0.30418887734413147, | |
| "learning_rate": 1.0894062078272604e-05, | |
| "loss": 1.5354, | |
| "mean_token_accuracy": 0.6321536242961884, | |
| "num_tokens": 15384467.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "entropy": 1.4464212298393249, | |
| "epoch": 1.3714574898785425, | |
| "grad_norm": 0.2574264407157898, | |
| "learning_rate": 1.0860323886639677e-05, | |
| "loss": 1.4397, | |
| "mean_token_accuracy": 0.6525548756122589, | |
| "num_tokens": 15446519.0, | |
| "step": 2710 | |
| }, | |
| { | |
| "entropy": 1.5958146333694458, | |
| "epoch": 1.376518218623482, | |
| "grad_norm": 0.28892847895622253, | |
| "learning_rate": 1.0826585695006748e-05, | |
| "loss": 1.5915, | |
| "mean_token_accuracy": 0.6221986651420593, | |
| "num_tokens": 15505401.0, | |
| "step": 2720 | |
| }, | |
| { | |
| "entropy": 1.6138377904891967, | |
| "epoch": 1.381578947368421, | |
| "grad_norm": 0.2827686667442322, | |
| "learning_rate": 1.079284750337382e-05, | |
| "loss": 1.6357, | |
| "mean_token_accuracy": 0.622738265991211, | |
| "num_tokens": 15563279.0, | |
| "step": 2730 | |
| }, | |
| { | |
| "entropy": 1.54440039396286, | |
| "epoch": 1.3866396761133604, | |
| "grad_norm": 0.2887682318687439, | |
| "learning_rate": 1.0759109311740893e-05, | |
| "loss": 1.5273, | |
| "mean_token_accuracy": 0.6313063859939575, | |
| "num_tokens": 15618960.0, | |
| "step": 2740 | |
| }, | |
| { | |
| "entropy": 1.4138375759124755, | |
| "epoch": 1.3917004048582995, | |
| "grad_norm": 0.36498573422431946, | |
| "learning_rate": 1.0725371120107964e-05, | |
| "loss": 1.4189, | |
| "mean_token_accuracy": 0.6591821730136871, | |
| "num_tokens": 15670366.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "entropy": 1.4741955041885375, | |
| "epoch": 1.3967611336032388, | |
| "grad_norm": 0.3496224284172058, | |
| "learning_rate": 1.0691632928475035e-05, | |
| "loss": 1.4764, | |
| "mean_token_accuracy": 0.6401580095291137, | |
| "num_tokens": 15724201.0, | |
| "step": 2760 | |
| }, | |
| { | |
| "entropy": 1.5501426219940186, | |
| "epoch": 1.4018218623481782, | |
| "grad_norm": 0.26639312505722046, | |
| "learning_rate": 1.0657894736842108e-05, | |
| "loss": 1.546, | |
| "mean_token_accuracy": 0.6274727523326874, | |
| "num_tokens": 15783636.0, | |
| "step": 2770 | |
| }, | |
| { | |
| "entropy": 1.557990849018097, | |
| "epoch": 1.4068825910931175, | |
| "grad_norm": 0.34502512216567993, | |
| "learning_rate": 1.0624156545209177e-05, | |
| "loss": 1.5781, | |
| "mean_token_accuracy": 0.6323262035846711, | |
| "num_tokens": 15841694.0, | |
| "step": 2780 | |
| }, | |
| { | |
| "entropy": 1.455728328227997, | |
| "epoch": 1.4119433198380567, | |
| "grad_norm": 0.2952381372451782, | |
| "learning_rate": 1.0590418353576249e-05, | |
| "loss": 1.4547, | |
| "mean_token_accuracy": 0.6422502875328064, | |
| "num_tokens": 15896343.0, | |
| "step": 2790 | |
| }, | |
| { | |
| "entropy": 1.6696131229400635, | |
| "epoch": 1.417004048582996, | |
| "grad_norm": 0.2534728944301605, | |
| "learning_rate": 1.055668016194332e-05, | |
| "loss": 1.6816, | |
| "mean_token_accuracy": 0.607040387392044, | |
| "num_tokens": 15952655.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "entropy": 1.3929704070091247, | |
| "epoch": 1.4220647773279351, | |
| "grad_norm": 0.2545351982116699, | |
| "learning_rate": 1.0522941970310391e-05, | |
| "loss": 1.3815, | |
| "mean_token_accuracy": 0.6631879568099975, | |
| "num_tokens": 16009017.0, | |
| "step": 2810 | |
| }, | |
| { | |
| "entropy": 1.4616627931594848, | |
| "epoch": 1.4271255060728745, | |
| "grad_norm": 0.29235726594924927, | |
| "learning_rate": 1.0489203778677464e-05, | |
| "loss": 1.469, | |
| "mean_token_accuracy": 0.6424239039421081, | |
| "num_tokens": 16064974.0, | |
| "step": 2820 | |
| }, | |
| { | |
| "entropy": 1.5947192907333374, | |
| "epoch": 1.4321862348178138, | |
| "grad_norm": 0.4684313237667084, | |
| "learning_rate": 1.0455465587044535e-05, | |
| "loss": 1.6334, | |
| "mean_token_accuracy": 0.620320850610733, | |
| "num_tokens": 16121438.0, | |
| "step": 2830 | |
| }, | |
| { | |
| "entropy": 1.5052786350250245, | |
| "epoch": 1.4372469635627532, | |
| "grad_norm": 0.2901478707790375, | |
| "learning_rate": 1.0421727395411606e-05, | |
| "loss": 1.5228, | |
| "mean_token_accuracy": 0.6432769238948822, | |
| "num_tokens": 16177348.0, | |
| "step": 2840 | |
| }, | |
| { | |
| "entropy": 1.5745530486106873, | |
| "epoch": 1.4423076923076923, | |
| "grad_norm": 0.4461107552051544, | |
| "learning_rate": 1.038798920377868e-05, | |
| "loss": 1.5797, | |
| "mean_token_accuracy": 0.628256207704544, | |
| "num_tokens": 16233533.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "entropy": 1.6525885105133056, | |
| "epoch": 1.4473684210526316, | |
| "grad_norm": 0.30729052424430847, | |
| "learning_rate": 1.035425101214575e-05, | |
| "loss": 1.659, | |
| "mean_token_accuracy": 0.6156529784202576, | |
| "num_tokens": 16288984.0, | |
| "step": 2860 | |
| }, | |
| { | |
| "entropy": 1.4813060998916625, | |
| "epoch": 1.4524291497975708, | |
| "grad_norm": 0.26118186116218567, | |
| "learning_rate": 1.0320512820512822e-05, | |
| "loss": 1.4694, | |
| "mean_token_accuracy": 0.6394685864448547, | |
| "num_tokens": 16347312.0, | |
| "step": 2870 | |
| }, | |
| { | |
| "entropy": 1.3725073099136353, | |
| "epoch": 1.45748987854251, | |
| "grad_norm": 0.24992327392101288, | |
| "learning_rate": 1.0286774628879891e-05, | |
| "loss": 1.3778, | |
| "mean_token_accuracy": 0.6600593090057373, | |
| "num_tokens": 16401182.0, | |
| "step": 2880 | |
| }, | |
| { | |
| "entropy": 1.5925581932067872, | |
| "epoch": 1.4625506072874495, | |
| "grad_norm": 0.3013634979724884, | |
| "learning_rate": 1.0253036437246964e-05, | |
| "loss": 1.5989, | |
| "mean_token_accuracy": 0.6274087786674499, | |
| "num_tokens": 16463180.0, | |
| "step": 2890 | |
| }, | |
| { | |
| "entropy": 1.395955240726471, | |
| "epoch": 1.4676113360323888, | |
| "grad_norm": 0.2821931540966034, | |
| "learning_rate": 1.0219298245614035e-05, | |
| "loss": 1.3955, | |
| "mean_token_accuracy": 0.6572477340698242, | |
| "num_tokens": 16524984.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "entropy": 1.493795931339264, | |
| "epoch": 1.472672064777328, | |
| "grad_norm": 0.27723386883735657, | |
| "learning_rate": 1.0185560053981107e-05, | |
| "loss": 1.4988, | |
| "mean_token_accuracy": 0.6318200826644897, | |
| "num_tokens": 16585454.0, | |
| "step": 2910 | |
| }, | |
| { | |
| "entropy": 1.608326256275177, | |
| "epoch": 1.4777327935222673, | |
| "grad_norm": 0.24880221486091614, | |
| "learning_rate": 1.015182186234818e-05, | |
| "loss": 1.6037, | |
| "mean_token_accuracy": 0.6237947404384613, | |
| "num_tokens": 16642878.0, | |
| "step": 2920 | |
| }, | |
| { | |
| "entropy": 1.4563136458396913, | |
| "epoch": 1.4827935222672064, | |
| "grad_norm": 0.2714000940322876, | |
| "learning_rate": 1.011808367071525e-05, | |
| "loss": 1.4609, | |
| "mean_token_accuracy": 0.6409155547618866, | |
| "num_tokens": 16697425.0, | |
| "step": 2930 | |
| }, | |
| { | |
| "entropy": 1.4760780036449432, | |
| "epoch": 1.4878542510121457, | |
| "grad_norm": 0.3031882047653198, | |
| "learning_rate": 1.0084345479082322e-05, | |
| "loss": 1.4802, | |
| "mean_token_accuracy": 0.6450917005538941, | |
| "num_tokens": 16760118.0, | |
| "step": 2940 | |
| }, | |
| { | |
| "entropy": 1.493908405303955, | |
| "epoch": 1.492914979757085, | |
| "grad_norm": 0.2621052861213684, | |
| "learning_rate": 1.0050607287449395e-05, | |
| "loss": 1.4918, | |
| "mean_token_accuracy": 0.6401423692703248, | |
| "num_tokens": 16813749.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "entropy": 1.6856267690658568, | |
| "epoch": 1.4979757085020242, | |
| "grad_norm": 0.26623499393463135, | |
| "learning_rate": 1.0016869095816466e-05, | |
| "loss": 1.6777, | |
| "mean_token_accuracy": 0.6135709464550019, | |
| "num_tokens": 16874289.0, | |
| "step": 2960 | |
| }, | |
| { | |
| "entropy": 1.4696342468261718, | |
| "epoch": 1.5030364372469636, | |
| "grad_norm": 0.2687808871269226, | |
| "learning_rate": 9.983130904183537e-06, | |
| "loss": 1.4727, | |
| "mean_token_accuracy": 0.6447311758995056, | |
| "num_tokens": 16930145.0, | |
| "step": 2970 | |
| }, | |
| { | |
| "entropy": 1.4744965791702271, | |
| "epoch": 1.5080971659919027, | |
| "grad_norm": 0.23845624923706055, | |
| "learning_rate": 9.949392712550608e-06, | |
| "loss": 1.4721, | |
| "mean_token_accuracy": 0.645156466960907, | |
| "num_tokens": 16984053.0, | |
| "step": 2980 | |
| }, | |
| { | |
| "entropy": 1.4356729149818421, | |
| "epoch": 1.513157894736842, | |
| "grad_norm": 0.3086620271205902, | |
| "learning_rate": 9.91565452091768e-06, | |
| "loss": 1.4271, | |
| "mean_token_accuracy": 0.6454346477985382, | |
| "num_tokens": 17040474.0, | |
| "step": 2990 | |
| }, | |
| { | |
| "entropy": 1.4318643450736999, | |
| "epoch": 1.5182186234817814, | |
| "grad_norm": 0.31296011805534363, | |
| "learning_rate": 9.881916329284751e-06, | |
| "loss": 1.4284, | |
| "mean_token_accuracy": 0.6570405125617981, | |
| "num_tokens": 17091443.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "entropy": 1.486535382270813, | |
| "epoch": 1.5232793522267207, | |
| "grad_norm": 0.24280501902103424, | |
| "learning_rate": 9.848178137651822e-06, | |
| "loss": 1.4782, | |
| "mean_token_accuracy": 0.6421392917633056, | |
| "num_tokens": 17145789.0, | |
| "step": 3010 | |
| }, | |
| { | |
| "entropy": 1.3536667227745056, | |
| "epoch": 1.52834008097166, | |
| "grad_norm": 0.3393391966819763, | |
| "learning_rate": 9.814439946018895e-06, | |
| "loss": 1.3665, | |
| "mean_token_accuracy": 0.659710270166397, | |
| "num_tokens": 17200045.0, | |
| "step": 3020 | |
| }, | |
| { | |
| "entropy": 1.477955400943756, | |
| "epoch": 1.5334008097165992, | |
| "grad_norm": 0.2695980668067932, | |
| "learning_rate": 9.780701754385966e-06, | |
| "loss": 1.4773, | |
| "mean_token_accuracy": 0.6442347228527069, | |
| "num_tokens": 17253382.0, | |
| "step": 3030 | |
| }, | |
| { | |
| "entropy": 1.4808340609073638, | |
| "epoch": 1.5384615384615383, | |
| "grad_norm": 0.32629549503326416, | |
| "learning_rate": 9.746963562753037e-06, | |
| "loss": 1.487, | |
| "mean_token_accuracy": 0.6431676924228669, | |
| "num_tokens": 17306138.0, | |
| "step": 3040 | |
| }, | |
| { | |
| "entropy": 1.4529295325279237, | |
| "epoch": 1.5435222672064777, | |
| "grad_norm": 0.2536776661872864, | |
| "learning_rate": 9.713225371120109e-06, | |
| "loss": 1.4591, | |
| "mean_token_accuracy": 0.6488350391387939, | |
| "num_tokens": 17368255.0, | |
| "step": 3050 | |
| }, | |
| { | |
| "entropy": 1.438970947265625, | |
| "epoch": 1.548582995951417, | |
| "grad_norm": 0.26340344548225403, | |
| "learning_rate": 9.67948717948718e-06, | |
| "loss": 1.4513, | |
| "mean_token_accuracy": 0.6449286341667175, | |
| "num_tokens": 17426575.0, | |
| "step": 3060 | |
| }, | |
| { | |
| "entropy": 1.709315264225006, | |
| "epoch": 1.5536437246963564, | |
| "grad_norm": 0.31817519664764404, | |
| "learning_rate": 9.645748987854253e-06, | |
| "loss": 1.7215, | |
| "mean_token_accuracy": 0.6018387496471405, | |
| "num_tokens": 17488131.0, | |
| "step": 3070 | |
| }, | |
| { | |
| "entropy": 1.5443035364151, | |
| "epoch": 1.5587044534412957, | |
| "grad_norm": 0.3266107141971588, | |
| "learning_rate": 9.612010796221324e-06, | |
| "loss": 1.5511, | |
| "mean_token_accuracy": 0.63644158244133, | |
| "num_tokens": 17546517.0, | |
| "step": 3080 | |
| }, | |
| { | |
| "entropy": 1.6439727783203124, | |
| "epoch": 1.5637651821862348, | |
| "grad_norm": 0.25957760214805603, | |
| "learning_rate": 9.578272604588395e-06, | |
| "loss": 1.6584, | |
| "mean_token_accuracy": 0.6159623801708222, | |
| "num_tokens": 17605712.0, | |
| "step": 3090 | |
| }, | |
| { | |
| "entropy": 1.5427301168441772, | |
| "epoch": 1.568825910931174, | |
| "grad_norm": 0.27618587017059326, | |
| "learning_rate": 9.544534412955466e-06, | |
| "loss": 1.5451, | |
| "mean_token_accuracy": 0.6260794997215271, | |
| "num_tokens": 17659721.0, | |
| "step": 3100 | |
| }, | |
| { | |
| "entropy": 1.383056926727295, | |
| "epoch": 1.5738866396761133, | |
| "grad_norm": 0.3027380406856537, | |
| "learning_rate": 9.510796221322538e-06, | |
| "loss": 1.3743, | |
| "mean_token_accuracy": 0.6610878467559814, | |
| "num_tokens": 17714100.0, | |
| "step": 3110 | |
| }, | |
| { | |
| "entropy": 1.503647792339325, | |
| "epoch": 1.5789473684210527, | |
| "grad_norm": 0.29517048597335815, | |
| "learning_rate": 9.47705802968961e-06, | |
| "loss": 1.5053, | |
| "mean_token_accuracy": 0.6365588068962097, | |
| "num_tokens": 17771308.0, | |
| "step": 3120 | |
| }, | |
| { | |
| "entropy": 1.548321044445038, | |
| "epoch": 1.584008097165992, | |
| "grad_norm": 0.255573570728302, | |
| "learning_rate": 9.44331983805668e-06, | |
| "loss": 1.5523, | |
| "mean_token_accuracy": 0.6278903543949127, | |
| "num_tokens": 17828285.0, | |
| "step": 3130 | |
| }, | |
| { | |
| "entropy": 1.5210648417472838, | |
| "epoch": 1.5890688259109311, | |
| "grad_norm": 0.3004836142063141, | |
| "learning_rate": 9.409581646423753e-06, | |
| "loss": 1.5331, | |
| "mean_token_accuracy": 0.6306875169277191, | |
| "num_tokens": 17888745.0, | |
| "step": 3140 | |
| }, | |
| { | |
| "entropy": 1.4695003390312196, | |
| "epoch": 1.5941295546558705, | |
| "grad_norm": 0.2813291549682617, | |
| "learning_rate": 9.375843454790824e-06, | |
| "loss": 1.4673, | |
| "mean_token_accuracy": 0.6395570158958435, | |
| "num_tokens": 17949494.0, | |
| "step": 3150 | |
| }, | |
| { | |
| "entropy": 1.4948333382606507, | |
| "epoch": 1.5991902834008096, | |
| "grad_norm": 0.3244977593421936, | |
| "learning_rate": 9.342105263157895e-06, | |
| "loss": 1.5044, | |
| "mean_token_accuracy": 0.6397220313549041, | |
| "num_tokens": 18006803.0, | |
| "step": 3160 | |
| }, | |
| { | |
| "entropy": 1.4767319202423095, | |
| "epoch": 1.604251012145749, | |
| "grad_norm": 0.2612328827381134, | |
| "learning_rate": 9.308367071524967e-06, | |
| "loss": 1.4795, | |
| "mean_token_accuracy": 0.6446912109851837, | |
| "num_tokens": 18062396.0, | |
| "step": 3170 | |
| }, | |
| { | |
| "entropy": 1.5266436815261841, | |
| "epoch": 1.6093117408906883, | |
| "grad_norm": 0.3239694833755493, | |
| "learning_rate": 9.274628879892038e-06, | |
| "loss": 1.5418, | |
| "mean_token_accuracy": 0.6299160838127136, | |
| "num_tokens": 18110883.0, | |
| "step": 3180 | |
| }, | |
| { | |
| "entropy": 1.4701735734939576, | |
| "epoch": 1.6143724696356276, | |
| "grad_norm": 0.2672860324382782, | |
| "learning_rate": 9.240890688259109e-06, | |
| "loss": 1.4503, | |
| "mean_token_accuracy": 0.6537446200847625, | |
| "num_tokens": 18159767.0, | |
| "step": 3190 | |
| }, | |
| { | |
| "entropy": 1.4009694814682008, | |
| "epoch": 1.6194331983805668, | |
| "grad_norm": 0.29456961154937744, | |
| "learning_rate": 9.207152496626182e-06, | |
| "loss": 1.4045, | |
| "mean_token_accuracy": 0.6546292185783387, | |
| "num_tokens": 18217624.0, | |
| "step": 3200 | |
| }, | |
| { | |
| "entropy": 1.6173853397369384, | |
| "epoch": 1.624493927125506, | |
| "grad_norm": 0.30044859647750854, | |
| "learning_rate": 9.173414304993253e-06, | |
| "loss": 1.6125, | |
| "mean_token_accuracy": 0.6170080423355102, | |
| "num_tokens": 18277255.0, | |
| "step": 3210 | |
| }, | |
| { | |
| "entropy": 1.5926665306091308, | |
| "epoch": 1.6295546558704452, | |
| "grad_norm": 0.29986920952796936, | |
| "learning_rate": 9.139676113360324e-06, | |
| "loss": 1.6003, | |
| "mean_token_accuracy": 0.6278518795967102, | |
| "num_tokens": 18335766.0, | |
| "step": 3220 | |
| }, | |
| { | |
| "entropy": 1.4131199598312378, | |
| "epoch": 1.6346153846153846, | |
| "grad_norm": 0.33528971672058105, | |
| "learning_rate": 9.105937921727396e-06, | |
| "loss": 1.4244, | |
| "mean_token_accuracy": 0.6553650915622711, | |
| "num_tokens": 18392231.0, | |
| "step": 3230 | |
| }, | |
| { | |
| "entropy": 1.5079811573028565, | |
| "epoch": 1.639676113360324, | |
| "grad_norm": 0.32541990280151367, | |
| "learning_rate": 9.072199730094467e-06, | |
| "loss": 1.5182, | |
| "mean_token_accuracy": 0.6279927968978882, | |
| "num_tokens": 18447207.0, | |
| "step": 3240 | |
| }, | |
| { | |
| "entropy": 1.5234640002250672, | |
| "epoch": 1.6447368421052633, | |
| "grad_norm": 0.2562153935432434, | |
| "learning_rate": 9.03846153846154e-06, | |
| "loss": 1.5149, | |
| "mean_token_accuracy": 0.6328702330589294, | |
| "num_tokens": 18505601.0, | |
| "step": 3250 | |
| }, | |
| { | |
| "entropy": 1.6198287844657897, | |
| "epoch": 1.6497975708502024, | |
| "grad_norm": 0.3361916244029999, | |
| "learning_rate": 9.004723346828611e-06, | |
| "loss": 1.6255, | |
| "mean_token_accuracy": 0.61873180270195, | |
| "num_tokens": 18558902.0, | |
| "step": 3260 | |
| }, | |
| { | |
| "entropy": 1.4600189566612243, | |
| "epoch": 1.6548582995951417, | |
| "grad_norm": 0.304756760597229, | |
| "learning_rate": 8.970985155195682e-06, | |
| "loss": 1.4471, | |
| "mean_token_accuracy": 0.6477857530117035, | |
| "num_tokens": 18619635.0, | |
| "step": 3270 | |
| }, | |
| { | |
| "entropy": 1.5648074269294738, | |
| "epoch": 1.6599190283400809, | |
| "grad_norm": 0.30415093898773193, | |
| "learning_rate": 8.937246963562753e-06, | |
| "loss": 1.5767, | |
| "mean_token_accuracy": 0.6242169559001922, | |
| "num_tokens": 18674088.0, | |
| "step": 3280 | |
| }, | |
| { | |
| "entropy": 1.5605645179748535, | |
| "epoch": 1.6649797570850202, | |
| "grad_norm": 0.26909834146499634, | |
| "learning_rate": 8.903508771929825e-06, | |
| "loss": 1.5605, | |
| "mean_token_accuracy": 0.635642808675766, | |
| "num_tokens": 18734453.0, | |
| "step": 3290 | |
| }, | |
| { | |
| "entropy": 1.5742518544197082, | |
| "epoch": 1.6700404858299596, | |
| "grad_norm": 0.2826893925666809, | |
| "learning_rate": 8.869770580296898e-06, | |
| "loss": 1.5643, | |
| "mean_token_accuracy": 0.6317550718784333, | |
| "num_tokens": 18793346.0, | |
| "step": 3300 | |
| }, | |
| { | |
| "entropy": 1.6354934215545653, | |
| "epoch": 1.675101214574899, | |
| "grad_norm": 0.2833310067653656, | |
| "learning_rate": 8.836032388663969e-06, | |
| "loss": 1.6417, | |
| "mean_token_accuracy": 0.6152911841869354, | |
| "num_tokens": 18845698.0, | |
| "step": 3310 | |
| }, | |
| { | |
| "entropy": 1.5844687461853026, | |
| "epoch": 1.680161943319838, | |
| "grad_norm": 0.3369496762752533, | |
| "learning_rate": 8.80229419703104e-06, | |
| "loss": 1.59, | |
| "mean_token_accuracy": 0.6237683236598969, | |
| "num_tokens": 18903498.0, | |
| "step": 3320 | |
| }, | |
| { | |
| "entropy": 1.570583975315094, | |
| "epoch": 1.6852226720647774, | |
| "grad_norm": 0.36443012952804565, | |
| "learning_rate": 8.768556005398111e-06, | |
| "loss": 1.5757, | |
| "mean_token_accuracy": 0.6224404633045196, | |
| "num_tokens": 18957652.0, | |
| "step": 3330 | |
| }, | |
| { | |
| "entropy": 1.6000779747962952, | |
| "epoch": 1.6902834008097165, | |
| "grad_norm": 0.32085222005844116, | |
| "learning_rate": 8.734817813765182e-06, | |
| "loss": 1.6067, | |
| "mean_token_accuracy": 0.6194514989852905, | |
| "num_tokens": 19018526.0, | |
| "step": 3340 | |
| }, | |
| { | |
| "entropy": 1.4028130412101745, | |
| "epoch": 1.6953441295546559, | |
| "grad_norm": 0.2869940996170044, | |
| "learning_rate": 8.701079622132255e-06, | |
| "loss": 1.41, | |
| "mean_token_accuracy": 0.6613210141658783, | |
| "num_tokens": 19073801.0, | |
| "step": 3350 | |
| }, | |
| { | |
| "entropy": 1.5723829984664917, | |
| "epoch": 1.7004048582995952, | |
| "grad_norm": 0.3251384496688843, | |
| "learning_rate": 8.667341430499327e-06, | |
| "loss": 1.5647, | |
| "mean_token_accuracy": 0.6301065504550933, | |
| "num_tokens": 19128923.0, | |
| "step": 3360 | |
| }, | |
| { | |
| "entropy": 1.5013223052024842, | |
| "epoch": 1.7054655870445345, | |
| "grad_norm": 0.30307453870773315, | |
| "learning_rate": 8.633603238866398e-06, | |
| "loss": 1.4962, | |
| "mean_token_accuracy": 0.6359362661838531, | |
| "num_tokens": 19186932.0, | |
| "step": 3370 | |
| }, | |
| { | |
| "entropy": 1.602057731151581, | |
| "epoch": 1.7105263157894737, | |
| "grad_norm": 0.369747132062912, | |
| "learning_rate": 8.599865047233469e-06, | |
| "loss": 1.5956, | |
| "mean_token_accuracy": 0.6288884073495865, | |
| "num_tokens": 19245687.0, | |
| "step": 3380 | |
| }, | |
| { | |
| "entropy": 1.5921403288841247, | |
| "epoch": 1.7155870445344128, | |
| "grad_norm": 0.2498423159122467, | |
| "learning_rate": 8.56612685560054e-06, | |
| "loss": 1.5971, | |
| "mean_token_accuracy": 0.621882963180542, | |
| "num_tokens": 19307247.0, | |
| "step": 3390 | |
| }, | |
| { | |
| "entropy": 1.5030475974082946, | |
| "epoch": 1.7206477732793521, | |
| "grad_norm": 0.3407726585865021, | |
| "learning_rate": 8.532388663967613e-06, | |
| "loss": 1.5109, | |
| "mean_token_accuracy": 0.6346028625965119, | |
| "num_tokens": 19367320.0, | |
| "step": 3400 | |
| }, | |
| { | |
| "entropy": 1.4825459122657776, | |
| "epoch": 1.7257085020242915, | |
| "grad_norm": 0.27978742122650146, | |
| "learning_rate": 8.498650472334684e-06, | |
| "loss": 1.4831, | |
| "mean_token_accuracy": 0.6394042372703552, | |
| "num_tokens": 19429919.0, | |
| "step": 3410 | |
| }, | |
| { | |
| "entropy": 1.4931324481964112, | |
| "epoch": 1.7307692307692308, | |
| "grad_norm": 0.288116455078125, | |
| "learning_rate": 8.464912280701755e-06, | |
| "loss": 1.4957, | |
| "mean_token_accuracy": 0.6380782008171082, | |
| "num_tokens": 19485577.0, | |
| "step": 3420 | |
| }, | |
| { | |
| "entropy": 1.4949661374092102, | |
| "epoch": 1.7358299595141702, | |
| "grad_norm": 0.31869447231292725, | |
| "learning_rate": 8.431174089068827e-06, | |
| "loss": 1.4926, | |
| "mean_token_accuracy": 0.6403613984584808, | |
| "num_tokens": 19541077.0, | |
| "step": 3430 | |
| }, | |
| { | |
| "entropy": 1.596668827533722, | |
| "epoch": 1.7408906882591093, | |
| "grad_norm": 0.28910359740257263, | |
| "learning_rate": 8.397435897435898e-06, | |
| "loss": 1.6038, | |
| "mean_token_accuracy": 0.6189565002918244, | |
| "num_tokens": 19605457.0, | |
| "step": 3440 | |
| }, | |
| { | |
| "entropy": 1.4728464007377624, | |
| "epoch": 1.7459514170040484, | |
| "grad_norm": 0.27498626708984375, | |
| "learning_rate": 8.36369770580297e-06, | |
| "loss": 1.485, | |
| "mean_token_accuracy": 0.6360372960567474, | |
| "num_tokens": 19667915.0, | |
| "step": 3450 | |
| }, | |
| { | |
| "entropy": 1.5345482110977173, | |
| "epoch": 1.7510121457489878, | |
| "grad_norm": 0.2618876099586487, | |
| "learning_rate": 8.32995951417004e-06, | |
| "loss": 1.5381, | |
| "mean_token_accuracy": 0.6326977252960205, | |
| "num_tokens": 19725030.0, | |
| "step": 3460 | |
| }, | |
| { | |
| "entropy": 1.393745517730713, | |
| "epoch": 1.7560728744939271, | |
| "grad_norm": 0.28456103801727295, | |
| "learning_rate": 8.296221322537113e-06, | |
| "loss": 1.3836, | |
| "mean_token_accuracy": 0.6631475329399109, | |
| "num_tokens": 19784982.0, | |
| "step": 3470 | |
| }, | |
| { | |
| "entropy": 1.3884447813034058, | |
| "epoch": 1.7611336032388665, | |
| "grad_norm": 0.27543261647224426, | |
| "learning_rate": 8.262483130904184e-06, | |
| "loss": 1.3847, | |
| "mean_token_accuracy": 0.6586002767086029, | |
| "num_tokens": 19848002.0, | |
| "step": 3480 | |
| }, | |
| { | |
| "entropy": 1.4454799056053163, | |
| "epoch": 1.7661943319838058, | |
| "grad_norm": 0.36814162135124207, | |
| "learning_rate": 8.228744939271256e-06, | |
| "loss": 1.455, | |
| "mean_token_accuracy": 0.6476415753364563, | |
| "num_tokens": 19906592.0, | |
| "step": 3490 | |
| }, | |
| { | |
| "entropy": 1.3266122221946717, | |
| "epoch": 1.771255060728745, | |
| "grad_norm": 0.2580831050872803, | |
| "learning_rate": 8.195006747638327e-06, | |
| "loss": 1.329, | |
| "mean_token_accuracy": 0.6699170589447021, | |
| "num_tokens": 19963138.0, | |
| "step": 3500 | |
| }, | |
| { | |
| "entropy": 1.670899212360382, | |
| "epoch": 1.776315789473684, | |
| "grad_norm": 0.29895538091659546, | |
| "learning_rate": 8.161268556005398e-06, | |
| "loss": 1.6755, | |
| "mean_token_accuracy": 0.613396269083023, | |
| "num_tokens": 20021345.0, | |
| "step": 3510 | |
| }, | |
| { | |
| "entropy": 1.354777181148529, | |
| "epoch": 1.7813765182186234, | |
| "grad_norm": 0.34177860617637634, | |
| "learning_rate": 8.12753036437247e-06, | |
| "loss": 1.3456, | |
| "mean_token_accuracy": 0.6686203420162201, | |
| "num_tokens": 20072875.0, | |
| "step": 3520 | |
| }, | |
| { | |
| "entropy": 1.5505508065223694, | |
| "epoch": 1.7864372469635628, | |
| "grad_norm": 0.2592535614967346, | |
| "learning_rate": 8.093792172739542e-06, | |
| "loss": 1.5535, | |
| "mean_token_accuracy": 0.6263529658317566, | |
| "num_tokens": 20132207.0, | |
| "step": 3530 | |
| }, | |
| { | |
| "entropy": 1.485759150981903, | |
| "epoch": 1.791497975708502, | |
| "grad_norm": 0.2742493450641632, | |
| "learning_rate": 8.060053981106613e-06, | |
| "loss": 1.4964, | |
| "mean_token_accuracy": 0.635893827676773, | |
| "num_tokens": 20195010.0, | |
| "step": 3540 | |
| }, | |
| { | |
| "entropy": 1.5584338903427124, | |
| "epoch": 1.7965587044534415, | |
| "grad_norm": 0.2946804463863373, | |
| "learning_rate": 8.026315789473685e-06, | |
| "loss": 1.5553, | |
| "mean_token_accuracy": 0.6236848413944245, | |
| "num_tokens": 20257540.0, | |
| "step": 3550 | |
| }, | |
| { | |
| "entropy": 1.5169085144996644, | |
| "epoch": 1.8016194331983806, | |
| "grad_norm": 0.26114436984062195, | |
| "learning_rate": 7.992577597840756e-06, | |
| "loss": 1.5138, | |
| "mean_token_accuracy": 0.6321025729179383, | |
| "num_tokens": 20318045.0, | |
| "step": 3560 | |
| }, | |
| { | |
| "entropy": 1.337432289123535, | |
| "epoch": 1.8066801619433197, | |
| "grad_norm": 0.29184892773628235, | |
| "learning_rate": 7.958839406207827e-06, | |
| "loss": 1.3471, | |
| "mean_token_accuracy": 0.6645301103591919, | |
| "num_tokens": 20373587.0, | |
| "step": 3570 | |
| }, | |
| { | |
| "entropy": 1.5995257258415223, | |
| "epoch": 1.811740890688259, | |
| "grad_norm": 0.3016499876976013, | |
| "learning_rate": 7.9251012145749e-06, | |
| "loss": 1.5929, | |
| "mean_token_accuracy": 0.6236974120140075, | |
| "num_tokens": 20431451.0, | |
| "step": 3580 | |
| }, | |
| { | |
| "entropy": 1.623330581188202, | |
| "epoch": 1.8168016194331984, | |
| "grad_norm": 0.35448580980300903, | |
| "learning_rate": 7.891363022941971e-06, | |
| "loss": 1.6129, | |
| "mean_token_accuracy": 0.6200532436370849, | |
| "num_tokens": 20487984.0, | |
| "step": 3590 | |
| }, | |
| { | |
| "entropy": 1.5125982403755187, | |
| "epoch": 1.8218623481781377, | |
| "grad_norm": 0.32799309492111206, | |
| "learning_rate": 7.857624831309042e-06, | |
| "loss": 1.5025, | |
| "mean_token_accuracy": 0.6384036839008331, | |
| "num_tokens": 20541725.0, | |
| "step": 3600 | |
| }, | |
| { | |
| "entropy": 1.53478661775589, | |
| "epoch": 1.8269230769230769, | |
| "grad_norm": 0.32730069756507874, | |
| "learning_rate": 7.823886639676114e-06, | |
| "loss": 1.5294, | |
| "mean_token_accuracy": 0.6311649143695831, | |
| "num_tokens": 20600145.0, | |
| "step": 3610 | |
| }, | |
| { | |
| "entropy": 1.5410036087036132, | |
| "epoch": 1.8319838056680162, | |
| "grad_norm": 0.3669460117816925, | |
| "learning_rate": 7.790148448043185e-06, | |
| "loss": 1.5537, | |
| "mean_token_accuracy": 0.6282461225986481, | |
| "num_tokens": 20655732.0, | |
| "step": 3620 | |
| }, | |
| { | |
| "entropy": 1.453836238384247, | |
| "epoch": 1.8370445344129553, | |
| "grad_norm": 0.31468528509140015, | |
| "learning_rate": 7.756410256410258e-06, | |
| "loss": 1.4568, | |
| "mean_token_accuracy": 0.6447117567062378, | |
| "num_tokens": 20712525.0, | |
| "step": 3630 | |
| }, | |
| { | |
| "entropy": 1.475819957256317, | |
| "epoch": 1.8421052631578947, | |
| "grad_norm": 0.29064053297042847, | |
| "learning_rate": 7.722672064777329e-06, | |
| "loss": 1.4821, | |
| "mean_token_accuracy": 0.6439218640327453, | |
| "num_tokens": 20768387.0, | |
| "step": 3640 | |
| }, | |
| { | |
| "entropy": 1.4041451275348664, | |
| "epoch": 1.847165991902834, | |
| "grad_norm": 0.2812243700027466, | |
| "learning_rate": 7.6889338731444e-06, | |
| "loss": 1.4044, | |
| "mean_token_accuracy": 0.6594688057899475, | |
| "num_tokens": 20826082.0, | |
| "step": 3650 | |
| }, | |
| { | |
| "entropy": 1.6357195615768432, | |
| "epoch": 1.8522267206477734, | |
| "grad_norm": 0.2777828276157379, | |
| "learning_rate": 7.655195681511471e-06, | |
| "loss": 1.6412, | |
| "mean_token_accuracy": 0.6139614999294281, | |
| "num_tokens": 20882894.0, | |
| "step": 3660 | |
| }, | |
| { | |
| "entropy": 1.535701298713684, | |
| "epoch": 1.8572874493927125, | |
| "grad_norm": 0.3234771490097046, | |
| "learning_rate": 7.6214574898785435e-06, | |
| "loss": 1.5333, | |
| "mean_token_accuracy": 0.6341882109642029, | |
| "num_tokens": 20938122.0, | |
| "step": 3670 | |
| }, | |
| { | |
| "entropy": 1.5881774067878722, | |
| "epoch": 1.8623481781376519, | |
| "grad_norm": 0.3148040175437927, | |
| "learning_rate": 7.587719298245615e-06, | |
| "loss": 1.6014, | |
| "mean_token_accuracy": 0.6275585472583771, | |
| "num_tokens": 20995824.0, | |
| "step": 3680 | |
| }, | |
| { | |
| "entropy": 1.3315507769584656, | |
| "epoch": 1.867408906882591, | |
| "grad_norm": 0.327178031206131, | |
| "learning_rate": 7.553981106612687e-06, | |
| "loss": 1.3346, | |
| "mean_token_accuracy": 0.668057644367218, | |
| "num_tokens": 21050607.0, | |
| "step": 3690 | |
| }, | |
| { | |
| "entropy": 1.5097809910774231, | |
| "epoch": 1.8724696356275303, | |
| "grad_norm": 0.29023247957229614, | |
| "learning_rate": 7.520242914979757e-06, | |
| "loss": 1.5045, | |
| "mean_token_accuracy": 0.6351129233837127, | |
| "num_tokens": 21106066.0, | |
| "step": 3700 | |
| }, | |
| { | |
| "entropy": 1.4823681235313415, | |
| "epoch": 1.8775303643724697, | |
| "grad_norm": 0.3215828537940979, | |
| "learning_rate": 7.486504723346829e-06, | |
| "loss": 1.4818, | |
| "mean_token_accuracy": 0.6453963398933411, | |
| "num_tokens": 21164981.0, | |
| "step": 3710 | |
| }, | |
| { | |
| "entropy": 1.4289534091949463, | |
| "epoch": 1.882591093117409, | |
| "grad_norm": 0.3170277178287506, | |
| "learning_rate": 7.452766531713901e-06, | |
| "loss": 1.446, | |
| "mean_token_accuracy": 0.6434959769248962, | |
| "num_tokens": 21223777.0, | |
| "step": 3720 | |
| }, | |
| { | |
| "entropy": 1.5202300190925597, | |
| "epoch": 1.8876518218623481, | |
| "grad_norm": 0.2913142740726471, | |
| "learning_rate": 7.4190283400809725e-06, | |
| "loss": 1.5349, | |
| "mean_token_accuracy": 0.6304753959178925, | |
| "num_tokens": 21279646.0, | |
| "step": 3730 | |
| }, | |
| { | |
| "entropy": 1.6774636268615724, | |
| "epoch": 1.8927125506072875, | |
| "grad_norm": 0.33726078271865845, | |
| "learning_rate": 7.385290148448044e-06, | |
| "loss": 1.6783, | |
| "mean_token_accuracy": 0.6076300263404846, | |
| "num_tokens": 21335265.0, | |
| "step": 3740 | |
| }, | |
| { | |
| "entropy": 1.5423774600028992, | |
| "epoch": 1.8977732793522266, | |
| "grad_norm": 0.27264466881752014, | |
| "learning_rate": 7.351551956815115e-06, | |
| "loss": 1.5533, | |
| "mean_token_accuracy": 0.6308148026466369, | |
| "num_tokens": 21396070.0, | |
| "step": 3750 | |
| }, | |
| { | |
| "entropy": 1.4624953866004944, | |
| "epoch": 1.902834008097166, | |
| "grad_norm": 0.35332223773002625, | |
| "learning_rate": 7.317813765182187e-06, | |
| "loss": 1.4655, | |
| "mean_token_accuracy": 0.641634488105774, | |
| "num_tokens": 21452996.0, | |
| "step": 3760 | |
| }, | |
| { | |
| "entropy": 1.4817042350769043, | |
| "epoch": 1.9078947368421053, | |
| "grad_norm": 0.3333725035190582, | |
| "learning_rate": 7.284075573549258e-06, | |
| "loss": 1.4903, | |
| "mean_token_accuracy": 0.6411226511001586, | |
| "num_tokens": 21508906.0, | |
| "step": 3770 | |
| }, | |
| { | |
| "entropy": 1.5518399238586427, | |
| "epoch": 1.9129554655870447, | |
| "grad_norm": 0.2960481643676758, | |
| "learning_rate": 7.25033738191633e-06, | |
| "loss": 1.5445, | |
| "mean_token_accuracy": 0.6266183733940125, | |
| "num_tokens": 21568967.0, | |
| "step": 3780 | |
| }, | |
| { | |
| "entropy": 1.5748514771461486, | |
| "epoch": 1.9180161943319838, | |
| "grad_norm": 0.31355923414230347, | |
| "learning_rate": 7.216599190283401e-06, | |
| "loss": 1.5716, | |
| "mean_token_accuracy": 0.6256311893463135, | |
| "num_tokens": 21631403.0, | |
| "step": 3790 | |
| }, | |
| { | |
| "entropy": 1.450837540626526, | |
| "epoch": 1.9230769230769231, | |
| "grad_norm": 0.27845069766044617, | |
| "learning_rate": 7.182860998650473e-06, | |
| "loss": 1.4611, | |
| "mean_token_accuracy": 0.650393956899643, | |
| "num_tokens": 21688727.0, | |
| "step": 3800 | |
| }, | |
| { | |
| "entropy": 1.611345076560974, | |
| "epoch": 1.9281376518218623, | |
| "grad_norm": 0.2685949206352234, | |
| "learning_rate": 7.149122807017545e-06, | |
| "loss": 1.6126, | |
| "mean_token_accuracy": 0.6262206137180328, | |
| "num_tokens": 21742484.0, | |
| "step": 3810 | |
| }, | |
| { | |
| "entropy": 1.3427100419998168, | |
| "epoch": 1.9331983805668016, | |
| "grad_norm": 0.41044095158576965, | |
| "learning_rate": 7.115384615384616e-06, | |
| "loss": 1.3418, | |
| "mean_token_accuracy": 0.663384473323822, | |
| "num_tokens": 21799804.0, | |
| "step": 3820 | |
| }, | |
| { | |
| "entropy": 1.4495494306087493, | |
| "epoch": 1.938259109311741, | |
| "grad_norm": 0.5138364434242249, | |
| "learning_rate": 7.081646423751688e-06, | |
| "loss": 1.4481, | |
| "mean_token_accuracy": 0.6450947999954224, | |
| "num_tokens": 21858581.0, | |
| "step": 3830 | |
| }, | |
| { | |
| "entropy": 1.3911212921142577, | |
| "epoch": 1.9433198380566803, | |
| "grad_norm": 0.29537278413772583, | |
| "learning_rate": 7.047908232118758e-06, | |
| "loss": 1.3992, | |
| "mean_token_accuracy": 0.6531029522418976, | |
| "num_tokens": 21915585.0, | |
| "step": 3840 | |
| }, | |
| { | |
| "entropy": 1.4535645723342896, | |
| "epoch": 1.9483805668016194, | |
| "grad_norm": 0.25756731629371643, | |
| "learning_rate": 7.0141700404858304e-06, | |
| "loss": 1.4401, | |
| "mean_token_accuracy": 0.6463619887828826, | |
| "num_tokens": 21976079.0, | |
| "step": 3850 | |
| }, | |
| { | |
| "entropy": 1.4952040553092956, | |
| "epoch": 1.9534412955465585, | |
| "grad_norm": 0.3046974539756775, | |
| "learning_rate": 6.9804318488529025e-06, | |
| "loss": 1.5097, | |
| "mean_token_accuracy": 0.6341541647911072, | |
| "num_tokens": 22035025.0, | |
| "step": 3860 | |
| }, | |
| { | |
| "entropy": 1.5177082777023316, | |
| "epoch": 1.958502024291498, | |
| "grad_norm": 0.3251610994338989, | |
| "learning_rate": 6.946693657219974e-06, | |
| "loss": 1.5163, | |
| "mean_token_accuracy": 0.6359520852565765, | |
| "num_tokens": 22092788.0, | |
| "step": 3870 | |
| }, | |
| { | |
| "entropy": 1.4667699456214904, | |
| "epoch": 1.9635627530364372, | |
| "grad_norm": 0.3152090311050415, | |
| "learning_rate": 6.912955465587045e-06, | |
| "loss": 1.4715, | |
| "mean_token_accuracy": 0.6418612182140351, | |
| "num_tokens": 22153745.0, | |
| "step": 3880 | |
| }, | |
| { | |
| "entropy": 1.6101324200630187, | |
| "epoch": 1.9686234817813766, | |
| "grad_norm": 0.340584933757782, | |
| "learning_rate": 6.879217273954116e-06, | |
| "loss": 1.6212, | |
| "mean_token_accuracy": 0.6180540084838867, | |
| "num_tokens": 22211817.0, | |
| "step": 3890 | |
| }, | |
| { | |
| "entropy": 1.459115242958069, | |
| "epoch": 1.973684210526316, | |
| "grad_norm": 0.2879182696342468, | |
| "learning_rate": 6.845479082321188e-06, | |
| "loss": 1.4419, | |
| "mean_token_accuracy": 0.6466407418251038, | |
| "num_tokens": 22265817.0, | |
| "step": 3900 | |
| }, | |
| { | |
| "entropy": 1.4101441740989684, | |
| "epoch": 1.978744939271255, | |
| "grad_norm": 0.3250649571418762, | |
| "learning_rate": 6.81174089068826e-06, | |
| "loss": 1.4063, | |
| "mean_token_accuracy": 0.6551910638809204, | |
| "num_tokens": 22324629.0, | |
| "step": 3910 | |
| }, | |
| { | |
| "entropy": 1.6089503526687623, | |
| "epoch": 1.9838056680161942, | |
| "grad_norm": 0.3786233961582184, | |
| "learning_rate": 6.7780026990553315e-06, | |
| "loss": 1.6147, | |
| "mean_token_accuracy": 0.6272029399871826, | |
| "num_tokens": 22381691.0, | |
| "step": 3920 | |
| }, | |
| { | |
| "entropy": 1.3815577149391174, | |
| "epoch": 1.9888663967611335, | |
| "grad_norm": 0.304582417011261, | |
| "learning_rate": 6.744264507422402e-06, | |
| "loss": 1.3759, | |
| "mean_token_accuracy": 0.657072639465332, | |
| "num_tokens": 22432987.0, | |
| "step": 3930 | |
| }, | |
| { | |
| "entropy": 1.6114310383796693, | |
| "epoch": 1.9939271255060729, | |
| "grad_norm": 0.3556569218635559, | |
| "learning_rate": 6.710526315789474e-06, | |
| "loss": 1.6089, | |
| "mean_token_accuracy": 0.6203605115413666, | |
| "num_tokens": 22491567.0, | |
| "step": 3940 | |
| }, | |
| { | |
| "entropy": 1.5013386726379394, | |
| "epoch": 1.9989878542510122, | |
| "grad_norm": 0.3433378040790558, | |
| "learning_rate": 6.676788124156546e-06, | |
| "loss": 1.497, | |
| "mean_token_accuracy": 0.6365504443645478, | |
| "num_tokens": 22548351.0, | |
| "step": 3950 | |
| }, | |
| { | |
| "entropy": 1.4863505601882934, | |
| "epoch": 2.0040485829959516, | |
| "grad_norm": 0.348243772983551, | |
| "learning_rate": 6.643049932523617e-06, | |
| "loss": 1.4864, | |
| "mean_token_accuracy": 0.6374901950359344, | |
| "num_tokens": 22596557.0, | |
| "step": 3960 | |
| }, | |
| { | |
| "entropy": 1.5316878080368042, | |
| "epoch": 2.0091093117408905, | |
| "grad_norm": 0.32034119963645935, | |
| "learning_rate": 6.609311740890689e-06, | |
| "loss": 1.538, | |
| "mean_token_accuracy": 0.6406886577606201, | |
| "num_tokens": 22656578.0, | |
| "step": 3970 | |
| }, | |
| { | |
| "entropy": 1.422401201725006, | |
| "epoch": 2.01417004048583, | |
| "grad_norm": 0.2935118079185486, | |
| "learning_rate": 6.57557354925776e-06, | |
| "loss": 1.4232, | |
| "mean_token_accuracy": 0.6517488479614257, | |
| "num_tokens": 22715169.0, | |
| "step": 3980 | |
| }, | |
| { | |
| "entropy": 1.4487539887428285, | |
| "epoch": 2.019230769230769, | |
| "grad_norm": 0.311564177274704, | |
| "learning_rate": 6.541835357624832e-06, | |
| "loss": 1.4388, | |
| "mean_token_accuracy": 0.6472173929214478, | |
| "num_tokens": 22772089.0, | |
| "step": 3990 | |
| }, | |
| { | |
| "entropy": 1.5003145456314086, | |
| "epoch": 2.0242914979757085, | |
| "grad_norm": 0.2912486493587494, | |
| "learning_rate": 6.508097165991904e-06, | |
| "loss": 1.5015, | |
| "mean_token_accuracy": 0.6321758210659028, | |
| "num_tokens": 22834505.0, | |
| "step": 4000 | |
| }, | |
| { | |
| "entropy": 1.4098521590232849, | |
| "epoch": 2.029352226720648, | |
| "grad_norm": 0.29250964522361755, | |
| "learning_rate": 6.474358974358975e-06, | |
| "loss": 1.4107, | |
| "mean_token_accuracy": 0.6528907954692841, | |
| "num_tokens": 22889105.0, | |
| "step": 4010 | |
| }, | |
| { | |
| "entropy": 1.4532611846923829, | |
| "epoch": 2.034412955465587, | |
| "grad_norm": 0.34667733311653137, | |
| "learning_rate": 6.440620782726047e-06, | |
| "loss": 1.4581, | |
| "mean_token_accuracy": 0.6446337521076202, | |
| "num_tokens": 22942406.0, | |
| "step": 4020 | |
| }, | |
| { | |
| "entropy": 1.5700780391693114, | |
| "epoch": 2.039473684210526, | |
| "grad_norm": 0.3028770685195923, | |
| "learning_rate": 6.406882591093117e-06, | |
| "loss": 1.5643, | |
| "mean_token_accuracy": 0.6249816060066223, | |
| "num_tokens": 22996028.0, | |
| "step": 4030 | |
| }, | |
| { | |
| "entropy": 1.6611987948417664, | |
| "epoch": 2.0445344129554655, | |
| "grad_norm": 0.30681440234184265, | |
| "learning_rate": 6.3731443994601894e-06, | |
| "loss": 1.6827, | |
| "mean_token_accuracy": 0.6147861301898956, | |
| "num_tokens": 23051645.0, | |
| "step": 4040 | |
| }, | |
| { | |
| "entropy": 1.4732018947601317, | |
| "epoch": 2.049595141700405, | |
| "grad_norm": 0.26491233706474304, | |
| "learning_rate": 6.3394062078272615e-06, | |
| "loss": 1.466, | |
| "mean_token_accuracy": 0.6404920816421509, | |
| "num_tokens": 23105066.0, | |
| "step": 4050 | |
| }, | |
| { | |
| "entropy": 1.5172441840171813, | |
| "epoch": 2.054655870445344, | |
| "grad_norm": 0.3094307780265808, | |
| "learning_rate": 6.305668016194333e-06, | |
| "loss": 1.5004, | |
| "mean_token_accuracy": 0.6372400879859924, | |
| "num_tokens": 23157352.0, | |
| "step": 4060 | |
| }, | |
| { | |
| "entropy": 1.422630524635315, | |
| "epoch": 2.0597165991902835, | |
| "grad_norm": 0.29695579409599304, | |
| "learning_rate": 6.271929824561404e-06, | |
| "loss": 1.428, | |
| "mean_token_accuracy": 0.6465956628322601, | |
| "num_tokens": 23212465.0, | |
| "step": 4070 | |
| }, | |
| { | |
| "entropy": 1.4499358654022216, | |
| "epoch": 2.064777327935223, | |
| "grad_norm": 0.3413025438785553, | |
| "learning_rate": 6.238191632928475e-06, | |
| "loss": 1.4555, | |
| "mean_token_accuracy": 0.6432287812232971, | |
| "num_tokens": 23268400.0, | |
| "step": 4080 | |
| }, | |
| { | |
| "entropy": 1.433293628692627, | |
| "epoch": 2.0698380566801617, | |
| "grad_norm": 0.27788856625556946, | |
| "learning_rate": 6.204453441295547e-06, | |
| "loss": 1.4404, | |
| "mean_token_accuracy": 0.6448906004428864, | |
| "num_tokens": 23330858.0, | |
| "step": 4090 | |
| }, | |
| { | |
| "entropy": 1.527322268486023, | |
| "epoch": 2.074898785425101, | |
| "grad_norm": 0.28372228145599365, | |
| "learning_rate": 6.170715249662618e-06, | |
| "loss": 1.5369, | |
| "mean_token_accuracy": 0.6296894669532775, | |
| "num_tokens": 23388049.0, | |
| "step": 4100 | |
| }, | |
| { | |
| "entropy": 1.654162836074829, | |
| "epoch": 2.0799595141700404, | |
| "grad_norm": 0.3283277451992035, | |
| "learning_rate": 6.1369770580296905e-06, | |
| "loss": 1.6652, | |
| "mean_token_accuracy": 0.6081342697143555, | |
| "num_tokens": 23450327.0, | |
| "step": 4110 | |
| }, | |
| { | |
| "entropy": 1.5552624464035034, | |
| "epoch": 2.08502024291498, | |
| "grad_norm": 0.3101661205291748, | |
| "learning_rate": 6.103238866396761e-06, | |
| "loss": 1.5571, | |
| "mean_token_accuracy": 0.6288932502269745, | |
| "num_tokens": 23507582.0, | |
| "step": 4120 | |
| }, | |
| { | |
| "entropy": 1.5187462210655212, | |
| "epoch": 2.090080971659919, | |
| "grad_norm": 0.26190704107284546, | |
| "learning_rate": 6.069500674763833e-06, | |
| "loss": 1.5231, | |
| "mean_token_accuracy": 0.6347708106040955, | |
| "num_tokens": 23570085.0, | |
| "step": 4130 | |
| }, | |
| { | |
| "entropy": 1.4180486440658568, | |
| "epoch": 2.0951417004048585, | |
| "grad_norm": 0.24935229122638702, | |
| "learning_rate": 6.035762483130905e-06, | |
| "loss": 1.4134, | |
| "mean_token_accuracy": 0.6535919070243835, | |
| "num_tokens": 23629729.0, | |
| "step": 4140 | |
| }, | |
| { | |
| "entropy": 1.5712830781936646, | |
| "epoch": 2.1002024291497974, | |
| "grad_norm": 0.28485989570617676, | |
| "learning_rate": 6.002024291497976e-06, | |
| "loss": 1.5661, | |
| "mean_token_accuracy": 0.6283676266670227, | |
| "num_tokens": 23686822.0, | |
| "step": 4150 | |
| }, | |
| { | |
| "entropy": 1.487233829498291, | |
| "epoch": 2.1052631578947367, | |
| "grad_norm": 0.3802538812160492, | |
| "learning_rate": 5.968286099865048e-06, | |
| "loss": 1.5071, | |
| "mean_token_accuracy": 0.636066097021103, | |
| "num_tokens": 23743196.0, | |
| "step": 4160 | |
| }, | |
| { | |
| "entropy": 1.485396420955658, | |
| "epoch": 2.110323886639676, | |
| "grad_norm": 0.37386566400527954, | |
| "learning_rate": 5.934547908232119e-06, | |
| "loss": 1.4772, | |
| "mean_token_accuracy": 0.6422532796859741, | |
| "num_tokens": 23798229.0, | |
| "step": 4170 | |
| }, | |
| { | |
| "entropy": 1.535237228870392, | |
| "epoch": 2.1153846153846154, | |
| "grad_norm": 0.26898157596588135, | |
| "learning_rate": 5.900809716599191e-06, | |
| "loss": 1.5333, | |
| "mean_token_accuracy": 0.6358494937419892, | |
| "num_tokens": 23852408.0, | |
| "step": 4180 | |
| }, | |
| { | |
| "entropy": 1.5727092146873474, | |
| "epoch": 2.1204453441295548, | |
| "grad_norm": 0.3571448028087616, | |
| "learning_rate": 5.867071524966263e-06, | |
| "loss": 1.5678, | |
| "mean_token_accuracy": 0.6239661037921905, | |
| "num_tokens": 23902266.0, | |
| "step": 4190 | |
| }, | |
| { | |
| "entropy": 1.5237385392189027, | |
| "epoch": 2.125506072874494, | |
| "grad_norm": 0.28321143984794617, | |
| "learning_rate": 5.833333333333334e-06, | |
| "loss": 1.5365, | |
| "mean_token_accuracy": 0.6352564930915833, | |
| "num_tokens": 23959815.0, | |
| "step": 4200 | |
| }, | |
| { | |
| "entropy": 1.5299026012420653, | |
| "epoch": 2.130566801619433, | |
| "grad_norm": 0.3400108218193054, | |
| "learning_rate": 5.799595141700405e-06, | |
| "loss": 1.519, | |
| "mean_token_accuracy": 0.6339640021324158, | |
| "num_tokens": 24012133.0, | |
| "step": 4210 | |
| }, | |
| { | |
| "entropy": 1.657011294364929, | |
| "epoch": 2.1356275303643724, | |
| "grad_norm": 0.3595241606235504, | |
| "learning_rate": 5.765856950067476e-06, | |
| "loss": 1.668, | |
| "mean_token_accuracy": 0.6125568807125091, | |
| "num_tokens": 24063677.0, | |
| "step": 4220 | |
| }, | |
| { | |
| "entropy": 1.5003764629364014, | |
| "epoch": 2.1406882591093117, | |
| "grad_norm": 0.32139450311660767, | |
| "learning_rate": 5.7321187584345484e-06, | |
| "loss": 1.4876, | |
| "mean_token_accuracy": 0.6435904741287232, | |
| "num_tokens": 24120380.0, | |
| "step": 4230 | |
| }, | |
| { | |
| "entropy": 1.6574489951133728, | |
| "epoch": 2.145748987854251, | |
| "grad_norm": 0.30065852403640747, | |
| "learning_rate": 5.6983805668016205e-06, | |
| "loss": 1.6782, | |
| "mean_token_accuracy": 0.6093615233898163, | |
| "num_tokens": 24181603.0, | |
| "step": 4240 | |
| }, | |
| { | |
| "entropy": 1.4604612827301025, | |
| "epoch": 2.1508097165991904, | |
| "grad_norm": 0.28791046142578125, | |
| "learning_rate": 5.664642375168692e-06, | |
| "loss": 1.4376, | |
| "mean_token_accuracy": 0.6457455456256866, | |
| "num_tokens": 24239096.0, | |
| "step": 4250 | |
| }, | |
| { | |
| "entropy": 1.4780054807662963, | |
| "epoch": 2.1558704453441297, | |
| "grad_norm": 0.2827425003051758, | |
| "learning_rate": 5.630904183535763e-06, | |
| "loss": 1.4805, | |
| "mean_token_accuracy": 0.6447736561298371, | |
| "num_tokens": 24295397.0, | |
| "step": 4260 | |
| }, | |
| { | |
| "entropy": 1.4344088315963746, | |
| "epoch": 2.1609311740890687, | |
| "grad_norm": 0.3887704908847809, | |
| "learning_rate": 5.597165991902834e-06, | |
| "loss": 1.4266, | |
| "mean_token_accuracy": 0.6494575679302216, | |
| "num_tokens": 24345669.0, | |
| "step": 4270 | |
| }, | |
| { | |
| "entropy": 1.5128828644752503, | |
| "epoch": 2.165991902834008, | |
| "grad_norm": 0.34420716762542725, | |
| "learning_rate": 5.563427800269906e-06, | |
| "loss": 1.5186, | |
| "mean_token_accuracy": 0.6373259782791137, | |
| "num_tokens": 24403704.0, | |
| "step": 4280 | |
| }, | |
| { | |
| "entropy": 1.3984260201454162, | |
| "epoch": 2.1710526315789473, | |
| "grad_norm": 0.33548930287361145, | |
| "learning_rate": 5.5296896086369774e-06, | |
| "loss": 1.381, | |
| "mean_token_accuracy": 0.6609737515449524, | |
| "num_tokens": 24457935.0, | |
| "step": 4290 | |
| }, | |
| { | |
| "entropy": 1.4911738991737367, | |
| "epoch": 2.1761133603238867, | |
| "grad_norm": 0.2852116823196411, | |
| "learning_rate": 5.4959514170040495e-06, | |
| "loss": 1.4799, | |
| "mean_token_accuracy": 0.6415831744670868, | |
| "num_tokens": 24511977.0, | |
| "step": 4300 | |
| }, | |
| { | |
| "entropy": 1.4702451825141907, | |
| "epoch": 2.181174089068826, | |
| "grad_norm": 0.28457802534103394, | |
| "learning_rate": 5.46221322537112e-06, | |
| "loss": 1.4768, | |
| "mean_token_accuracy": 0.6372047007083893, | |
| "num_tokens": 24569954.0, | |
| "step": 4310 | |
| }, | |
| { | |
| "entropy": 1.4613691449165345, | |
| "epoch": 2.1862348178137654, | |
| "grad_norm": 0.31222304701805115, | |
| "learning_rate": 5.428475033738192e-06, | |
| "loss": 1.4692, | |
| "mean_token_accuracy": 0.6442633271217346, | |
| "num_tokens": 24625268.0, | |
| "step": 4320 | |
| }, | |
| { | |
| "entropy": 1.466537070274353, | |
| "epoch": 2.1912955465587043, | |
| "grad_norm": 0.2962714433670044, | |
| "learning_rate": 5.394736842105264e-06, | |
| "loss": 1.4664, | |
| "mean_token_accuracy": 0.6492825329303742, | |
| "num_tokens": 24688289.0, | |
| "step": 4330 | |
| }, | |
| { | |
| "entropy": 1.5810052037239075, | |
| "epoch": 2.1963562753036436, | |
| "grad_norm": 0.30552032589912415, | |
| "learning_rate": 5.360998650472335e-06, | |
| "loss": 1.5811, | |
| "mean_token_accuracy": 0.6259881913661957, | |
| "num_tokens": 24746697.0, | |
| "step": 4340 | |
| }, | |
| { | |
| "entropy": 1.4260846734046937, | |
| "epoch": 2.201417004048583, | |
| "grad_norm": 0.2985803484916687, | |
| "learning_rate": 5.327260458839406e-06, | |
| "loss": 1.4137, | |
| "mean_token_accuracy": 0.6532795548439025, | |
| "num_tokens": 24810772.0, | |
| "step": 4350 | |
| }, | |
| { | |
| "entropy": 1.5106618881225586, | |
| "epoch": 2.2064777327935223, | |
| "grad_norm": 0.33830076456069946, | |
| "learning_rate": 5.293522267206478e-06, | |
| "loss": 1.522, | |
| "mean_token_accuracy": 0.6390328884124756, | |
| "num_tokens": 24870122.0, | |
| "step": 4360 | |
| }, | |
| { | |
| "entropy": 1.527205801010132, | |
| "epoch": 2.2115384615384617, | |
| "grad_norm": 0.444986492395401, | |
| "learning_rate": 5.25978407557355e-06, | |
| "loss": 1.5237, | |
| "mean_token_accuracy": 0.6333723068237305, | |
| "num_tokens": 24929676.0, | |
| "step": 4370 | |
| }, | |
| { | |
| "entropy": 1.571653914451599, | |
| "epoch": 2.216599190283401, | |
| "grad_norm": 0.27972137928009033, | |
| "learning_rate": 5.226045883940622e-06, | |
| "loss": 1.5782, | |
| "mean_token_accuracy": 0.62519211769104, | |
| "num_tokens": 24984648.0, | |
| "step": 4380 | |
| }, | |
| { | |
| "entropy": 1.579957866668701, | |
| "epoch": 2.22165991902834, | |
| "grad_norm": 0.35601162910461426, | |
| "learning_rate": 5.192307692307693e-06, | |
| "loss": 1.5916, | |
| "mean_token_accuracy": 0.6265009582042694, | |
| "num_tokens": 25039282.0, | |
| "step": 4390 | |
| }, | |
| { | |
| "entropy": 1.590737247467041, | |
| "epoch": 2.2267206477732793, | |
| "grad_norm": 0.3328033685684204, | |
| "learning_rate": 5.158569500674764e-06, | |
| "loss": 1.5942, | |
| "mean_token_accuracy": 0.6266931772232056, | |
| "num_tokens": 25084698.0, | |
| "step": 4400 | |
| }, | |
| { | |
| "entropy": 1.4461635231971741, | |
| "epoch": 2.2317813765182186, | |
| "grad_norm": 0.3073853850364685, | |
| "learning_rate": 5.124831309041835e-06, | |
| "loss": 1.4532, | |
| "mean_token_accuracy": 0.6430659115314483, | |
| "num_tokens": 25145917.0, | |
| "step": 4410 | |
| }, | |
| { | |
| "entropy": 1.6023080706596375, | |
| "epoch": 2.236842105263158, | |
| "grad_norm": 0.38999930024147034, | |
| "learning_rate": 5.0910931174089075e-06, | |
| "loss": 1.6065, | |
| "mean_token_accuracy": 0.6303758680820465, | |
| "num_tokens": 25200499.0, | |
| "step": 4420 | |
| }, | |
| { | |
| "entropy": 1.403742289543152, | |
| "epoch": 2.2419028340080973, | |
| "grad_norm": 0.3020265996456146, | |
| "learning_rate": 5.057354925775979e-06, | |
| "loss": 1.3936, | |
| "mean_token_accuracy": 0.6550646901130677, | |
| "num_tokens": 25253626.0, | |
| "step": 4430 | |
| }, | |
| { | |
| "entropy": 1.5970208644866943, | |
| "epoch": 2.246963562753036, | |
| "grad_norm": 0.34803110361099243, | |
| "learning_rate": 5.023616734143051e-06, | |
| "loss": 1.6128, | |
| "mean_token_accuracy": 0.6253244817256928, | |
| "num_tokens": 25315718.0, | |
| "step": 4440 | |
| }, | |
| { | |
| "entropy": 1.4895619392395019, | |
| "epoch": 2.2520242914979756, | |
| "grad_norm": 0.295636385679245, | |
| "learning_rate": 4.989878542510122e-06, | |
| "loss": 1.4976, | |
| "mean_token_accuracy": 0.6415492594242096, | |
| "num_tokens": 25378490.0, | |
| "step": 4450 | |
| }, | |
| { | |
| "entropy": 1.500291097164154, | |
| "epoch": 2.257085020242915, | |
| "grad_norm": 0.29003915190696716, | |
| "learning_rate": 4.956140350877193e-06, | |
| "loss": 1.4741, | |
| "mean_token_accuracy": 0.6455156445503235, | |
| "num_tokens": 25435125.0, | |
| "step": 4460 | |
| }, | |
| { | |
| "entropy": 1.5137645125389099, | |
| "epoch": 2.2621457489878543, | |
| "grad_norm": 0.345222145318985, | |
| "learning_rate": 4.922402159244265e-06, | |
| "loss": 1.5106, | |
| "mean_token_accuracy": 0.6373549580574036, | |
| "num_tokens": 25492838.0, | |
| "step": 4470 | |
| }, | |
| { | |
| "entropy": 1.4126244068145752, | |
| "epoch": 2.2672064777327936, | |
| "grad_norm": 0.43444496393203735, | |
| "learning_rate": 4.8886639676113364e-06, | |
| "loss": 1.402, | |
| "mean_token_accuracy": 0.6513433575630188, | |
| "num_tokens": 25552113.0, | |
| "step": 4480 | |
| }, | |
| { | |
| "entropy": 1.5574785828590394, | |
| "epoch": 2.272267206477733, | |
| "grad_norm": 0.28663352131843567, | |
| "learning_rate": 4.854925775978408e-06, | |
| "loss": 1.5719, | |
| "mean_token_accuracy": 0.6330413460731507, | |
| "num_tokens": 25604938.0, | |
| "step": 4490 | |
| }, | |
| { | |
| "entropy": 1.5517175793647766, | |
| "epoch": 2.2773279352226723, | |
| "grad_norm": 0.3585723042488098, | |
| "learning_rate": 4.82118758434548e-06, | |
| "loss": 1.5492, | |
| "mean_token_accuracy": 0.6311025798320771, | |
| "num_tokens": 25663827.0, | |
| "step": 4500 | |
| }, | |
| { | |
| "entropy": 1.7192303657531738, | |
| "epoch": 2.282388663967611, | |
| "grad_norm": 0.3171631395816803, | |
| "learning_rate": 4.787449392712551e-06, | |
| "loss": 1.7084, | |
| "mean_token_accuracy": 0.5979065060615539, | |
| "num_tokens": 25718627.0, | |
| "step": 4510 | |
| }, | |
| { | |
| "entropy": 1.4433665156364441, | |
| "epoch": 2.2874493927125505, | |
| "grad_norm": 0.31859585642814636, | |
| "learning_rate": 4.753711201079623e-06, | |
| "loss": 1.431, | |
| "mean_token_accuracy": 0.6453494548797607, | |
| "num_tokens": 25779859.0, | |
| "step": 4520 | |
| }, | |
| { | |
| "entropy": 1.493071937561035, | |
| "epoch": 2.29251012145749, | |
| "grad_norm": 0.3323538303375244, | |
| "learning_rate": 4.719973009446694e-06, | |
| "loss": 1.5016, | |
| "mean_token_accuracy": 0.6344216048717499, | |
| "num_tokens": 25835705.0, | |
| "step": 4530 | |
| }, | |
| { | |
| "entropy": 1.5348315596580506, | |
| "epoch": 2.2975708502024292, | |
| "grad_norm": 0.29418283700942993, | |
| "learning_rate": 4.686234817813765e-06, | |
| "loss": 1.5299, | |
| "mean_token_accuracy": 0.6337445557117463, | |
| "num_tokens": 25896484.0, | |
| "step": 4540 | |
| }, | |
| { | |
| "entropy": 1.4027626633644104, | |
| "epoch": 2.3026315789473686, | |
| "grad_norm": 0.3454079031944275, | |
| "learning_rate": 4.652496626180837e-06, | |
| "loss": 1.3954, | |
| "mean_token_accuracy": 0.6570545434951782, | |
| "num_tokens": 25946989.0, | |
| "step": 4550 | |
| }, | |
| { | |
| "entropy": 1.4810479283332825, | |
| "epoch": 2.3076923076923075, | |
| "grad_norm": 0.30555200576782227, | |
| "learning_rate": 4.618758434547909e-06, | |
| "loss": 1.4935, | |
| "mean_token_accuracy": 0.6418456912040711, | |
| "num_tokens": 26005212.0, | |
| "step": 4560 | |
| }, | |
| { | |
| "entropy": 1.5501378655433655, | |
| "epoch": 2.312753036437247, | |
| "grad_norm": 0.2936731278896332, | |
| "learning_rate": 4.585020242914981e-06, | |
| "loss": 1.5493, | |
| "mean_token_accuracy": 0.6311659216880798, | |
| "num_tokens": 26061206.0, | |
| "step": 4570 | |
| }, | |
| { | |
| "entropy": 1.5965832471847534, | |
| "epoch": 2.317813765182186, | |
| "grad_norm": 0.3174577057361603, | |
| "learning_rate": 4.551282051282052e-06, | |
| "loss": 1.5986, | |
| "mean_token_accuracy": 0.6272948026657105, | |
| "num_tokens": 26117314.0, | |
| "step": 4580 | |
| }, | |
| { | |
| "entropy": 1.497817873954773, | |
| "epoch": 2.3228744939271255, | |
| "grad_norm": 0.3074813485145569, | |
| "learning_rate": 4.517543859649123e-06, | |
| "loss": 1.5177, | |
| "mean_token_accuracy": 0.639699399471283, | |
| "num_tokens": 26177625.0, | |
| "step": 4590 | |
| }, | |
| { | |
| "entropy": 1.398792815208435, | |
| "epoch": 2.327935222672065, | |
| "grad_norm": 0.3233450949192047, | |
| "learning_rate": 4.483805668016194e-06, | |
| "loss": 1.3972, | |
| "mean_token_accuracy": 0.6578422546386719, | |
| "num_tokens": 26229108.0, | |
| "step": 4600 | |
| }, | |
| { | |
| "entropy": 1.3582614064216614, | |
| "epoch": 2.332995951417004, | |
| "grad_norm": 0.3194423019886017, | |
| "learning_rate": 4.4500674763832665e-06, | |
| "loss": 1.3473, | |
| "mean_token_accuracy": 0.6627348363399506, | |
| "num_tokens": 26281682.0, | |
| "step": 4610 | |
| }, | |
| { | |
| "entropy": 1.4663148880004884, | |
| "epoch": 2.3380566801619436, | |
| "grad_norm": 0.317622572183609, | |
| "learning_rate": 4.416329284750338e-06, | |
| "loss": 1.4749, | |
| "mean_token_accuracy": 0.6402939558029175, | |
| "num_tokens": 26343090.0, | |
| "step": 4620 | |
| }, | |
| { | |
| "entropy": 1.4386041164398193, | |
| "epoch": 2.3431174089068825, | |
| "grad_norm": 0.37403181195259094, | |
| "learning_rate": 4.382591093117409e-06, | |
| "loss": 1.4399, | |
| "mean_token_accuracy": 0.6470987558364868, | |
| "num_tokens": 26398372.0, | |
| "step": 4630 | |
| }, | |
| { | |
| "entropy": 1.591576099395752, | |
| "epoch": 2.348178137651822, | |
| "grad_norm": 0.27833235263824463, | |
| "learning_rate": 4.348852901484481e-06, | |
| "loss": 1.6015, | |
| "mean_token_accuracy": 0.6296046376228333, | |
| "num_tokens": 26458865.0, | |
| "step": 4640 | |
| }, | |
| { | |
| "entropy": 1.4324705123901367, | |
| "epoch": 2.353238866396761, | |
| "grad_norm": 0.3234311044216156, | |
| "learning_rate": 4.315114709851552e-06, | |
| "loss": 1.4182, | |
| "mean_token_accuracy": 0.6525469720363617, | |
| "num_tokens": 26514094.0, | |
| "step": 4650 | |
| }, | |
| { | |
| "entropy": 1.5859549045562744, | |
| "epoch": 2.3582995951417005, | |
| "grad_norm": 0.31048783659935, | |
| "learning_rate": 4.281376518218624e-06, | |
| "loss": 1.6055, | |
| "mean_token_accuracy": 0.6206431567668915, | |
| "num_tokens": 26573568.0, | |
| "step": 4660 | |
| }, | |
| { | |
| "entropy": 1.4157851219177247, | |
| "epoch": 2.36336032388664, | |
| "grad_norm": 0.27004745602607727, | |
| "learning_rate": 4.2476383265856954e-06, | |
| "loss": 1.4191, | |
| "mean_token_accuracy": 0.6526973366737365, | |
| "num_tokens": 26628281.0, | |
| "step": 4670 | |
| }, | |
| { | |
| "entropy": 1.4219112515449523, | |
| "epoch": 2.3684210526315788, | |
| "grad_norm": 0.3162846863269806, | |
| "learning_rate": 4.213900134952767e-06, | |
| "loss": 1.4237, | |
| "mean_token_accuracy": 0.6481447339057922, | |
| "num_tokens": 26683329.0, | |
| "step": 4680 | |
| }, | |
| { | |
| "entropy": 1.474673593044281, | |
| "epoch": 2.373481781376518, | |
| "grad_norm": 0.2558523714542389, | |
| "learning_rate": 4.180161943319838e-06, | |
| "loss": 1.4789, | |
| "mean_token_accuracy": 0.644309651851654, | |
| "num_tokens": 26741726.0, | |
| "step": 4690 | |
| }, | |
| { | |
| "entropy": 1.545168387889862, | |
| "epoch": 2.3785425101214575, | |
| "grad_norm": 0.3100733160972595, | |
| "learning_rate": 4.14642375168691e-06, | |
| "loss": 1.5585, | |
| "mean_token_accuracy": 0.6251280426979064, | |
| "num_tokens": 26801987.0, | |
| "step": 4700 | |
| }, | |
| { | |
| "entropy": 1.4952475309371949, | |
| "epoch": 2.383603238866397, | |
| "grad_norm": 0.2840896546840668, | |
| "learning_rate": 4.112685560053982e-06, | |
| "loss": 1.4928, | |
| "mean_token_accuracy": 0.6407946467399597, | |
| "num_tokens": 26862449.0, | |
| "step": 4710 | |
| }, | |
| { | |
| "entropy": 1.3853577494621276, | |
| "epoch": 2.388663967611336, | |
| "grad_norm": 0.315100759267807, | |
| "learning_rate": 4.078947368421053e-06, | |
| "loss": 1.3891, | |
| "mean_token_accuracy": 0.6517343044281005, | |
| "num_tokens": 26923528.0, | |
| "step": 4720 | |
| }, | |
| { | |
| "entropy": 1.5327417492866515, | |
| "epoch": 2.3937246963562755, | |
| "grad_norm": 0.3072359561920166, | |
| "learning_rate": 4.0452091767881244e-06, | |
| "loss": 1.5438, | |
| "mean_token_accuracy": 0.638210940361023, | |
| "num_tokens": 26976129.0, | |
| "step": 4730 | |
| }, | |
| { | |
| "entropy": 1.6007991313934327, | |
| "epoch": 2.3987854251012144, | |
| "grad_norm": 0.28095099329948425, | |
| "learning_rate": 4.011470985155196e-06, | |
| "loss": 1.6025, | |
| "mean_token_accuracy": 0.6204523742198944, | |
| "num_tokens": 27030769.0, | |
| "step": 4740 | |
| }, | |
| { | |
| "entropy": 1.5538129091262818, | |
| "epoch": 2.4038461538461537, | |
| "grad_norm": 0.3622888922691345, | |
| "learning_rate": 3.977732793522268e-06, | |
| "loss": 1.5497, | |
| "mean_token_accuracy": 0.6246297895908356, | |
| "num_tokens": 27085119.0, | |
| "step": 4750 | |
| }, | |
| { | |
| "entropy": 1.4716430306434631, | |
| "epoch": 2.408906882591093, | |
| "grad_norm": 0.2776808738708496, | |
| "learning_rate": 3.943994601889339e-06, | |
| "loss": 1.4715, | |
| "mean_token_accuracy": 0.6430730044841766, | |
| "num_tokens": 27146308.0, | |
| "step": 4760 | |
| }, | |
| { | |
| "entropy": 1.4779613852500915, | |
| "epoch": 2.4139676113360324, | |
| "grad_norm": 0.30735519528388977, | |
| "learning_rate": 3.910256410256411e-06, | |
| "loss": 1.481, | |
| "mean_token_accuracy": 0.6421349704265594, | |
| "num_tokens": 27204236.0, | |
| "step": 4770 | |
| }, | |
| { | |
| "entropy": 1.6263086080551148, | |
| "epoch": 2.419028340080972, | |
| "grad_norm": 0.3509717881679535, | |
| "learning_rate": 3.876518218623482e-06, | |
| "loss": 1.6306, | |
| "mean_token_accuracy": 0.6189518332481384, | |
| "num_tokens": 27253795.0, | |
| "step": 4780 | |
| }, | |
| { | |
| "entropy": 1.5051485419273376, | |
| "epoch": 2.4240890688259107, | |
| "grad_norm": 0.36502060294151306, | |
| "learning_rate": 3.842780026990553e-06, | |
| "loss": 1.5045, | |
| "mean_token_accuracy": 0.6390359103679657, | |
| "num_tokens": 27311173.0, | |
| "step": 4790 | |
| }, | |
| { | |
| "entropy": 1.5122657060623168, | |
| "epoch": 2.42914979757085, | |
| "grad_norm": 0.35788798332214355, | |
| "learning_rate": 3.8090418353576255e-06, | |
| "loss": 1.4811, | |
| "mean_token_accuracy": 0.6367557644844055, | |
| "num_tokens": 27366839.0, | |
| "step": 4800 | |
| }, | |
| { | |
| "entropy": 1.5352485537528993, | |
| "epoch": 2.4342105263157894, | |
| "grad_norm": 0.2877010107040405, | |
| "learning_rate": 3.7753036437246967e-06, | |
| "loss": 1.5402, | |
| "mean_token_accuracy": 0.6323030471801758, | |
| "num_tokens": 27423988.0, | |
| "step": 4810 | |
| }, | |
| { | |
| "entropy": 1.329223895072937, | |
| "epoch": 2.4392712550607287, | |
| "grad_norm": 0.27826353907585144, | |
| "learning_rate": 3.7415654520917683e-06, | |
| "loss": 1.3322, | |
| "mean_token_accuracy": 0.6661195576190948, | |
| "num_tokens": 27482284.0, | |
| "step": 4820 | |
| }, | |
| { | |
| "entropy": 1.460306990146637, | |
| "epoch": 2.444331983805668, | |
| "grad_norm": 0.2664757966995239, | |
| "learning_rate": 3.7078272604588395e-06, | |
| "loss": 1.4645, | |
| "mean_token_accuracy": 0.6439946055412292, | |
| "num_tokens": 27542235.0, | |
| "step": 4830 | |
| }, | |
| { | |
| "entropy": 1.420573878288269, | |
| "epoch": 2.4493927125506074, | |
| "grad_norm": 0.3187576234340668, | |
| "learning_rate": 3.674089068825911e-06, | |
| "loss": 1.4271, | |
| "mean_token_accuracy": 0.6494402289390564, | |
| "num_tokens": 27606521.0, | |
| "step": 4840 | |
| }, | |
| { | |
| "entropy": 1.5605995893478393, | |
| "epoch": 2.4544534412955468, | |
| "grad_norm": 0.3589235842227936, | |
| "learning_rate": 3.640350877192983e-06, | |
| "loss": 1.5464, | |
| "mean_token_accuracy": 0.636710187792778, | |
| "num_tokens": 27655589.0, | |
| "step": 4850 | |
| }, | |
| { | |
| "entropy": 1.5500049710273742, | |
| "epoch": 2.4595141700404857, | |
| "grad_norm": 0.42818954586982727, | |
| "learning_rate": 3.606612685560054e-06, | |
| "loss": 1.5422, | |
| "mean_token_accuracy": 0.6335929155349731, | |
| "num_tokens": 27707321.0, | |
| "step": 4860 | |
| }, | |
| { | |
| "entropy": 1.5264463543891906, | |
| "epoch": 2.464574898785425, | |
| "grad_norm": 0.30446869134902954, | |
| "learning_rate": 3.572874493927126e-06, | |
| "loss": 1.5354, | |
| "mean_token_accuracy": 0.6377040445804596, | |
| "num_tokens": 27766680.0, | |
| "step": 4870 | |
| }, | |
| { | |
| "entropy": 1.5602357268333436, | |
| "epoch": 2.4696356275303644, | |
| "grad_norm": 0.31952470541000366, | |
| "learning_rate": 3.5391363022941973e-06, | |
| "loss": 1.563, | |
| "mean_token_accuracy": 0.6299617826938629, | |
| "num_tokens": 27825128.0, | |
| "step": 4880 | |
| }, | |
| { | |
| "entropy": 1.4979919075965882, | |
| "epoch": 2.4746963562753037, | |
| "grad_norm": 0.3032040596008301, | |
| "learning_rate": 3.505398110661269e-06, | |
| "loss": 1.5194, | |
| "mean_token_accuracy": 0.6328540325164795, | |
| "num_tokens": 27886765.0, | |
| "step": 4890 | |
| }, | |
| { | |
| "entropy": 1.571874487400055, | |
| "epoch": 2.479757085020243, | |
| "grad_norm": 0.3398491144180298, | |
| "learning_rate": 3.47165991902834e-06, | |
| "loss": 1.568, | |
| "mean_token_accuracy": 0.6220065712928772, | |
| "num_tokens": 27942690.0, | |
| "step": 4900 | |
| }, | |
| { | |
| "entropy": 1.4888028264045716, | |
| "epoch": 2.484817813765182, | |
| "grad_norm": 0.2785778343677521, | |
| "learning_rate": 3.437921727395412e-06, | |
| "loss": 1.482, | |
| "mean_token_accuracy": 0.6426316261291504, | |
| "num_tokens": 28001045.0, | |
| "step": 4910 | |
| }, | |
| { | |
| "entropy": 1.4304234504699707, | |
| "epoch": 2.4898785425101213, | |
| "grad_norm": 0.36416903138160706, | |
| "learning_rate": 3.4041835357624834e-06, | |
| "loss": 1.4412, | |
| "mean_token_accuracy": 0.6480507373809814, | |
| "num_tokens": 28060050.0, | |
| "step": 4920 | |
| }, | |
| { | |
| "entropy": 1.4549246668815612, | |
| "epoch": 2.4949392712550607, | |
| "grad_norm": 0.3209365904331207, | |
| "learning_rate": 3.3704453441295546e-06, | |
| "loss": 1.4444, | |
| "mean_token_accuracy": 0.6485071182250977, | |
| "num_tokens": 28119937.0, | |
| "step": 4930 | |
| }, | |
| { | |
| "entropy": 1.6035995841026307, | |
| "epoch": 2.5, | |
| "grad_norm": 0.3263776898384094, | |
| "learning_rate": 3.3367071524966267e-06, | |
| "loss": 1.596, | |
| "mean_token_accuracy": 0.6212283372879028, | |
| "num_tokens": 28176098.0, | |
| "step": 4940 | |
| }, | |
| { | |
| "entropy": 1.3420706629753112, | |
| "epoch": 2.5050607287449393, | |
| "grad_norm": 0.29616400599479675, | |
| "learning_rate": 3.302968960863698e-06, | |
| "loss": 1.3361, | |
| "mean_token_accuracy": 0.6657415688037872, | |
| "num_tokens": 28232898.0, | |
| "step": 4950 | |
| }, | |
| { | |
| "entropy": 1.5731253027915955, | |
| "epoch": 2.5101214574898787, | |
| "grad_norm": 0.2652728259563446, | |
| "learning_rate": 3.2692307692307696e-06, | |
| "loss": 1.569, | |
| "mean_token_accuracy": 0.6270411610603333, | |
| "num_tokens": 28289187.0, | |
| "step": 4960 | |
| }, | |
| { | |
| "entropy": 1.4383020401000977, | |
| "epoch": 2.515182186234818, | |
| "grad_norm": 0.3313502371311188, | |
| "learning_rate": 3.2354925775978408e-06, | |
| "loss": 1.4301, | |
| "mean_token_accuracy": 0.6567471146583557, | |
| "num_tokens": 28345870.0, | |
| "step": 4970 | |
| }, | |
| { | |
| "entropy": 1.4449619054794312, | |
| "epoch": 2.520242914979757, | |
| "grad_norm": 0.299467533826828, | |
| "learning_rate": 3.2017543859649124e-06, | |
| "loss": 1.4596, | |
| "mean_token_accuracy": 0.6480660021305085, | |
| "num_tokens": 28401335.0, | |
| "step": 4980 | |
| }, | |
| { | |
| "entropy": 1.407576084136963, | |
| "epoch": 2.5253036437246963, | |
| "grad_norm": 0.33703747391700745, | |
| "learning_rate": 3.168016194331984e-06, | |
| "loss": 1.4026, | |
| "mean_token_accuracy": 0.6588316440582276, | |
| "num_tokens": 28451027.0, | |
| "step": 4990 | |
| }, | |
| { | |
| "entropy": 1.6358988881111145, | |
| "epoch": 2.5303643724696356, | |
| "grad_norm": 0.3531615138053894, | |
| "learning_rate": 3.1342780026990553e-06, | |
| "loss": 1.6387, | |
| "mean_token_accuracy": 0.6192252457141876, | |
| "num_tokens": 28508717.0, | |
| "step": 5000 | |
| }, | |
| { | |
| "entropy": 1.530623769760132, | |
| "epoch": 2.535425101214575, | |
| "grad_norm": 0.2998420000076294, | |
| "learning_rate": 3.1005398110661273e-06, | |
| "loss": 1.5209, | |
| "mean_token_accuracy": 0.6354014992713928, | |
| "num_tokens": 28566256.0, | |
| "step": 5010 | |
| }, | |
| { | |
| "entropy": 1.5933383703231812, | |
| "epoch": 2.5404858299595143, | |
| "grad_norm": 0.3689696192741394, | |
| "learning_rate": 3.0668016194331985e-06, | |
| "loss": 1.5881, | |
| "mean_token_accuracy": 0.6318571925163269, | |
| "num_tokens": 28618249.0, | |
| "step": 5020 | |
| }, | |
| { | |
| "entropy": 1.4564833164215087, | |
| "epoch": 2.5455465587044532, | |
| "grad_norm": 0.30524808168411255, | |
| "learning_rate": 3.03306342780027e-06, | |
| "loss": 1.4375, | |
| "mean_token_accuracy": 0.6440569698810578, | |
| "num_tokens": 28674342.0, | |
| "step": 5030 | |
| }, | |
| { | |
| "entropy": 1.510752511024475, | |
| "epoch": 2.5506072874493926, | |
| "grad_norm": 0.3323598802089691, | |
| "learning_rate": 2.999325236167342e-06, | |
| "loss": 1.5278, | |
| "mean_token_accuracy": 0.6354637145996094, | |
| "num_tokens": 28731622.0, | |
| "step": 5040 | |
| }, | |
| { | |
| "entropy": 1.4739052295684814, | |
| "epoch": 2.555668016194332, | |
| "grad_norm": 0.31869643926620483, | |
| "learning_rate": 2.965587044534413e-06, | |
| "loss": 1.4649, | |
| "mean_token_accuracy": 0.6425871312618255, | |
| "num_tokens": 28791133.0, | |
| "step": 5050 | |
| }, | |
| { | |
| "entropy": 1.5100542187690735, | |
| "epoch": 2.5607287449392713, | |
| "grad_norm": 0.3328213095664978, | |
| "learning_rate": 2.931848852901485e-06, | |
| "loss": 1.5045, | |
| "mean_token_accuracy": 0.6392671585083007, | |
| "num_tokens": 28847713.0, | |
| "step": 5060 | |
| }, | |
| { | |
| "entropy": 1.4085248589515686, | |
| "epoch": 2.5657894736842106, | |
| "grad_norm": 0.281522661447525, | |
| "learning_rate": 2.8981106612685563e-06, | |
| "loss": 1.3982, | |
| "mean_token_accuracy": 0.6513190269470215, | |
| "num_tokens": 28910189.0, | |
| "step": 5070 | |
| }, | |
| { | |
| "entropy": 1.397442674636841, | |
| "epoch": 2.57085020242915, | |
| "grad_norm": 0.3210408091545105, | |
| "learning_rate": 2.864372469635628e-06, | |
| "loss": 1.3977, | |
| "mean_token_accuracy": 0.6574838936328888, | |
| "num_tokens": 28966241.0, | |
| "step": 5080 | |
| }, | |
| { | |
| "entropy": 1.5165488362312316, | |
| "epoch": 2.5759109311740893, | |
| "grad_norm": 0.31288620829582214, | |
| "learning_rate": 2.830634278002699e-06, | |
| "loss": 1.5124, | |
| "mean_token_accuracy": 0.6387628674507141, | |
| "num_tokens": 29026210.0, | |
| "step": 5090 | |
| }, | |
| { | |
| "entropy": 1.5974119186401368, | |
| "epoch": 2.580971659919028, | |
| "grad_norm": 0.3497001826763153, | |
| "learning_rate": 2.796896086369771e-06, | |
| "loss": 1.61, | |
| "mean_token_accuracy": 0.6236252367496491, | |
| "num_tokens": 29083556.0, | |
| "step": 5100 | |
| }, | |
| { | |
| "entropy": 1.5403811931610107, | |
| "epoch": 2.5860323886639676, | |
| "grad_norm": 0.31958791613578796, | |
| "learning_rate": 2.7631578947368424e-06, | |
| "loss": 1.5418, | |
| "mean_token_accuracy": 0.634338253736496, | |
| "num_tokens": 29142090.0, | |
| "step": 5110 | |
| }, | |
| { | |
| "entropy": 1.4701064825057983, | |
| "epoch": 2.591093117408907, | |
| "grad_norm": 0.28594285249710083, | |
| "learning_rate": 2.7294197031039137e-06, | |
| "loss": 1.4693, | |
| "mean_token_accuracy": 0.6509437322616577, | |
| "num_tokens": 29198039.0, | |
| "step": 5120 | |
| }, | |
| { | |
| "entropy": 1.508654534816742, | |
| "epoch": 2.5961538461538463, | |
| "grad_norm": 0.28295132517814636, | |
| "learning_rate": 2.6956815114709857e-06, | |
| "loss": 1.5107, | |
| "mean_token_accuracy": 0.6393173456192016, | |
| "num_tokens": 29258240.0, | |
| "step": 5130 | |
| }, | |
| { | |
| "entropy": 1.573255705833435, | |
| "epoch": 2.601214574898785, | |
| "grad_norm": 0.2459454983472824, | |
| "learning_rate": 2.661943319838057e-06, | |
| "loss": 1.5903, | |
| "mean_token_accuracy": 0.6283860564231872, | |
| "num_tokens": 29318879.0, | |
| "step": 5140 | |
| }, | |
| { | |
| "entropy": 1.5287572503089906, | |
| "epoch": 2.6062753036437245, | |
| "grad_norm": 0.31771403551101685, | |
| "learning_rate": 2.6282051282051286e-06, | |
| "loss": 1.5452, | |
| "mean_token_accuracy": 0.6344579041004181, | |
| "num_tokens": 29379919.0, | |
| "step": 5150 | |
| }, | |
| { | |
| "entropy": 1.3615296483039856, | |
| "epoch": 2.611336032388664, | |
| "grad_norm": 0.28625616431236267, | |
| "learning_rate": 2.5944669365721998e-06, | |
| "loss": 1.349, | |
| "mean_token_accuracy": 0.6637236177921295, | |
| "num_tokens": 29438959.0, | |
| "step": 5160 | |
| }, | |
| { | |
| "entropy": 1.4767539501190186, | |
| "epoch": 2.616396761133603, | |
| "grad_norm": 0.2911388874053955, | |
| "learning_rate": 2.5607287449392714e-06, | |
| "loss": 1.4775, | |
| "mean_token_accuracy": 0.6405583918094635, | |
| "num_tokens": 29495248.0, | |
| "step": 5170 | |
| }, | |
| { | |
| "entropy": 1.4118461966514588, | |
| "epoch": 2.6214574898785425, | |
| "grad_norm": 0.3035772442817688, | |
| "learning_rate": 2.526990553306343e-06, | |
| "loss": 1.4266, | |
| "mean_token_accuracy": 0.6568454921245575, | |
| "num_tokens": 29549374.0, | |
| "step": 5180 | |
| }, | |
| { | |
| "entropy": 1.3858314156532288, | |
| "epoch": 2.626518218623482, | |
| "grad_norm": 0.28831735253334045, | |
| "learning_rate": 2.4932523616734143e-06, | |
| "loss": 1.3659, | |
| "mean_token_accuracy": 0.6626292169094086, | |
| "num_tokens": 29608335.0, | |
| "step": 5190 | |
| }, | |
| { | |
| "entropy": 1.5293712258338927, | |
| "epoch": 2.6315789473684212, | |
| "grad_norm": 0.33819642663002014, | |
| "learning_rate": 2.459514170040486e-06, | |
| "loss": 1.5299, | |
| "mean_token_accuracy": 0.629097181558609, | |
| "num_tokens": 29666401.0, | |
| "step": 5200 | |
| }, | |
| { | |
| "entropy": 1.5522411942481995, | |
| "epoch": 2.6366396761133606, | |
| "grad_norm": 0.37447431683540344, | |
| "learning_rate": 2.4257759784075576e-06, | |
| "loss": 1.5546, | |
| "mean_token_accuracy": 0.6252642631530761, | |
| "num_tokens": 29722977.0, | |
| "step": 5210 | |
| }, | |
| { | |
| "entropy": 1.5046650171279907, | |
| "epoch": 2.6417004048582995, | |
| "grad_norm": 0.32877567410469055, | |
| "learning_rate": 2.392037786774629e-06, | |
| "loss": 1.4941, | |
| "mean_token_accuracy": 0.6403312921524048, | |
| "num_tokens": 29777693.0, | |
| "step": 5220 | |
| }, | |
| { | |
| "entropy": 1.4904412388801576, | |
| "epoch": 2.646761133603239, | |
| "grad_norm": 0.30846232175827026, | |
| "learning_rate": 2.358299595141701e-06, | |
| "loss": 1.4874, | |
| "mean_token_accuracy": 0.6401443660259247, | |
| "num_tokens": 29841451.0, | |
| "step": 5230 | |
| }, | |
| { | |
| "entropy": 1.4474842250347137, | |
| "epoch": 2.651821862348178, | |
| "grad_norm": 0.3371650278568268, | |
| "learning_rate": 2.324561403508772e-06, | |
| "loss": 1.4514, | |
| "mean_token_accuracy": 0.6537328362464905, | |
| "num_tokens": 29900142.0, | |
| "step": 5240 | |
| }, | |
| { | |
| "entropy": 1.5490441560745238, | |
| "epoch": 2.6568825910931175, | |
| "grad_norm": 0.28833135962486267, | |
| "learning_rate": 2.2908232118758437e-06, | |
| "loss": 1.5525, | |
| "mean_token_accuracy": 0.6344904005527496, | |
| "num_tokens": 29965665.0, | |
| "step": 5250 | |
| }, | |
| { | |
| "entropy": 1.4371688961982727, | |
| "epoch": 2.6619433198380564, | |
| "grad_norm": 0.27346664667129517, | |
| "learning_rate": 2.257085020242915e-06, | |
| "loss": 1.4386, | |
| "mean_token_accuracy": 0.6554741203784943, | |
| "num_tokens": 30020063.0, | |
| "step": 5260 | |
| }, | |
| { | |
| "entropy": 1.57616069316864, | |
| "epoch": 2.667004048582996, | |
| "grad_norm": 0.31261205673217773, | |
| "learning_rate": 2.2233468286099865e-06, | |
| "loss": 1.5878, | |
| "mean_token_accuracy": 0.6287827432155609, | |
| "num_tokens": 30079648.0, | |
| "step": 5270 | |
| }, | |
| { | |
| "entropy": 1.6309450030326844, | |
| "epoch": 2.672064777327935, | |
| "grad_norm": 0.36513420939445496, | |
| "learning_rate": 2.189608636977058e-06, | |
| "loss": 1.6362, | |
| "mean_token_accuracy": 0.6139590203762054, | |
| "num_tokens": 30139557.0, | |
| "step": 5280 | |
| }, | |
| { | |
| "entropy": 1.6020007967948913, | |
| "epoch": 2.6771255060728745, | |
| "grad_norm": 0.3361331522464752, | |
| "learning_rate": 2.15587044534413e-06, | |
| "loss": 1.5899, | |
| "mean_token_accuracy": 0.623996788263321, | |
| "num_tokens": 30194644.0, | |
| "step": 5290 | |
| }, | |
| { | |
| "entropy": 1.4187337517738343, | |
| "epoch": 2.682186234817814, | |
| "grad_norm": 0.3711530864238739, | |
| "learning_rate": 2.1221322537112015e-06, | |
| "loss": 1.4225, | |
| "mean_token_accuracy": 0.6517966687679291, | |
| "num_tokens": 30249182.0, | |
| "step": 5300 | |
| }, | |
| { | |
| "entropy": 1.4419126749038695, | |
| "epoch": 2.687246963562753, | |
| "grad_norm": 0.34213292598724365, | |
| "learning_rate": 2.0883940620782727e-06, | |
| "loss": 1.4502, | |
| "mean_token_accuracy": 0.6504493892192841, | |
| "num_tokens": 30307151.0, | |
| "step": 5310 | |
| }, | |
| { | |
| "entropy": 1.593650794029236, | |
| "epoch": 2.6923076923076925, | |
| "grad_norm": 0.2626771032810211, | |
| "learning_rate": 2.0546558704453443e-06, | |
| "loss": 1.5977, | |
| "mean_token_accuracy": 0.6253896594047547, | |
| "num_tokens": 30363799.0, | |
| "step": 5320 | |
| }, | |
| { | |
| "entropy": 1.505863094329834, | |
| "epoch": 2.6973684210526314, | |
| "grad_norm": 0.31610244512557983, | |
| "learning_rate": 2.020917678812416e-06, | |
| "loss": 1.507, | |
| "mean_token_accuracy": 0.6344715654850006, | |
| "num_tokens": 30420544.0, | |
| "step": 5330 | |
| }, | |
| { | |
| "entropy": 1.5584728479385377, | |
| "epoch": 2.7024291497975708, | |
| "grad_norm": 0.3088075518608093, | |
| "learning_rate": 1.987179487179487e-06, | |
| "loss": 1.5504, | |
| "mean_token_accuracy": 0.6318127393722535, | |
| "num_tokens": 30479027.0, | |
| "step": 5340 | |
| }, | |
| { | |
| "entropy": 1.464753222465515, | |
| "epoch": 2.70748987854251, | |
| "grad_norm": 0.4019823372364044, | |
| "learning_rate": 1.953441295546559e-06, | |
| "loss": 1.4567, | |
| "mean_token_accuracy": 0.6482177615165711, | |
| "num_tokens": 30534454.0, | |
| "step": 5350 | |
| }, | |
| { | |
| "entropy": 1.5660398364067079, | |
| "epoch": 2.7125506072874495, | |
| "grad_norm": 0.2922350764274597, | |
| "learning_rate": 1.9197031039136304e-06, | |
| "loss": 1.5742, | |
| "mean_token_accuracy": 0.6296724855899811, | |
| "num_tokens": 30591311.0, | |
| "step": 5360 | |
| }, | |
| { | |
| "entropy": 1.513328456878662, | |
| "epoch": 2.717611336032389, | |
| "grad_norm": 0.34194323420524597, | |
| "learning_rate": 1.8859649122807019e-06, | |
| "loss": 1.5109, | |
| "mean_token_accuracy": 0.6368795096874237, | |
| "num_tokens": 30648991.0, | |
| "step": 5370 | |
| }, | |
| { | |
| "entropy": 1.517847204208374, | |
| "epoch": 2.7226720647773277, | |
| "grad_norm": 0.35915765166282654, | |
| "learning_rate": 1.8522267206477735e-06, | |
| "loss": 1.5111, | |
| "mean_token_accuracy": 0.6370461285114288, | |
| "num_tokens": 30702765.0, | |
| "step": 5380 | |
| }, | |
| { | |
| "entropy": 1.4219279885292053, | |
| "epoch": 2.727732793522267, | |
| "grad_norm": 0.31105926632881165, | |
| "learning_rate": 1.818488529014845e-06, | |
| "loss": 1.416, | |
| "mean_token_accuracy": 0.6535437107086182, | |
| "num_tokens": 30759049.0, | |
| "step": 5390 | |
| }, | |
| { | |
| "entropy": 1.4244774222373962, | |
| "epoch": 2.7327935222672064, | |
| "grad_norm": 0.3058363199234009, | |
| "learning_rate": 1.7847503373819164e-06, | |
| "loss": 1.4116, | |
| "mean_token_accuracy": 0.6499510526657104, | |
| "num_tokens": 30815038.0, | |
| "step": 5400 | |
| }, | |
| { | |
| "entropy": 1.6432706594467164, | |
| "epoch": 2.7378542510121457, | |
| "grad_norm": 0.33452996611595154, | |
| "learning_rate": 1.7510121457489878e-06, | |
| "loss": 1.6396, | |
| "mean_token_accuracy": 0.6203866958618164, | |
| "num_tokens": 30871076.0, | |
| "step": 5410 | |
| }, | |
| { | |
| "entropy": 1.6394325613975524, | |
| "epoch": 2.742914979757085, | |
| "grad_norm": 0.283194363117218, | |
| "learning_rate": 1.7172739541160596e-06, | |
| "loss": 1.6447, | |
| "mean_token_accuracy": 0.6192583978176117, | |
| "num_tokens": 30931499.0, | |
| "step": 5420 | |
| }, | |
| { | |
| "entropy": 1.613875186443329, | |
| "epoch": 2.7479757085020244, | |
| "grad_norm": 0.3175935745239258, | |
| "learning_rate": 1.683535762483131e-06, | |
| "loss": 1.616, | |
| "mean_token_accuracy": 0.6225574970245361, | |
| "num_tokens": 30993640.0, | |
| "step": 5430 | |
| }, | |
| { | |
| "entropy": 1.6437927842140199, | |
| "epoch": 2.753036437246964, | |
| "grad_norm": 0.2761462926864624, | |
| "learning_rate": 1.6497975708502027e-06, | |
| "loss": 1.6461, | |
| "mean_token_accuracy": 0.6168906092643738, | |
| "num_tokens": 31046563.0, | |
| "step": 5440 | |
| }, | |
| { | |
| "entropy": 1.3887561798095702, | |
| "epoch": 2.7580971659919027, | |
| "grad_norm": 0.3212042450904846, | |
| "learning_rate": 1.6160593792172741e-06, | |
| "loss": 1.3872, | |
| "mean_token_accuracy": 0.66387038230896, | |
| "num_tokens": 31100699.0, | |
| "step": 5450 | |
| }, | |
| { | |
| "entropy": 1.5592396020889283, | |
| "epoch": 2.763157894736842, | |
| "grad_norm": 0.28648391366004944, | |
| "learning_rate": 1.5823211875843455e-06, | |
| "loss": 1.5583, | |
| "mean_token_accuracy": 0.6273209810256958, | |
| "num_tokens": 31164910.0, | |
| "step": 5460 | |
| }, | |
| { | |
| "entropy": 1.546663898229599, | |
| "epoch": 2.7682186234817814, | |
| "grad_norm": 0.3598899841308594, | |
| "learning_rate": 1.548582995951417e-06, | |
| "loss": 1.5324, | |
| "mean_token_accuracy": 0.6319786071777344, | |
| "num_tokens": 31220029.0, | |
| "step": 5470 | |
| }, | |
| { | |
| "entropy": 1.5003357887268067, | |
| "epoch": 2.7732793522267207, | |
| "grad_norm": 0.2860889732837677, | |
| "learning_rate": 1.5148448043184886e-06, | |
| "loss": 1.4952, | |
| "mean_token_accuracy": 0.6411886811256409, | |
| "num_tokens": 31279401.0, | |
| "step": 5480 | |
| }, | |
| { | |
| "entropy": 1.418429398536682, | |
| "epoch": 2.77834008097166, | |
| "grad_norm": 0.2821556627750397, | |
| "learning_rate": 1.4811066126855602e-06, | |
| "loss": 1.421, | |
| "mean_token_accuracy": 0.6593441128730774, | |
| "num_tokens": 31334950.0, | |
| "step": 5490 | |
| }, | |
| { | |
| "entropy": 1.5357149362564086, | |
| "epoch": 2.783400809716599, | |
| "grad_norm": 0.3190230131149292, | |
| "learning_rate": 1.4473684210526317e-06, | |
| "loss": 1.5381, | |
| "mean_token_accuracy": 0.6347779989242553, | |
| "num_tokens": 31392814.0, | |
| "step": 5500 | |
| }, | |
| { | |
| "entropy": 1.4718513011932373, | |
| "epoch": 2.7884615384615383, | |
| "grad_norm": 0.2940792441368103, | |
| "learning_rate": 1.4136302294197033e-06, | |
| "loss": 1.4801, | |
| "mean_token_accuracy": 0.6389244079589844, | |
| "num_tokens": 31449900.0, | |
| "step": 5510 | |
| }, | |
| { | |
| "entropy": 1.3709456086158753, | |
| "epoch": 2.7935222672064777, | |
| "grad_norm": 0.30266401171684265, | |
| "learning_rate": 1.3798920377867747e-06, | |
| "loss": 1.3599, | |
| "mean_token_accuracy": 0.6677371621131897, | |
| "num_tokens": 31503636.0, | |
| "step": 5520 | |
| }, | |
| { | |
| "entropy": 1.4986552715301513, | |
| "epoch": 2.798582995951417, | |
| "grad_norm": 0.35532623529434204, | |
| "learning_rate": 1.3461538461538462e-06, | |
| "loss": 1.5069, | |
| "mean_token_accuracy": 0.6403627216815948, | |
| "num_tokens": 31563188.0, | |
| "step": 5530 | |
| }, | |
| { | |
| "entropy": 1.53000727891922, | |
| "epoch": 2.8036437246963564, | |
| "grad_norm": 0.3287500739097595, | |
| "learning_rate": 1.3124156545209176e-06, | |
| "loss": 1.5289, | |
| "mean_token_accuracy": 0.6303663849830627, | |
| "num_tokens": 31622655.0, | |
| "step": 5540 | |
| }, | |
| { | |
| "entropy": 1.399080502986908, | |
| "epoch": 2.8087044534412957, | |
| "grad_norm": 0.2796313762664795, | |
| "learning_rate": 1.2786774628879894e-06, | |
| "loss": 1.3962, | |
| "mean_token_accuracy": 0.6611886739730835, | |
| "num_tokens": 31678734.0, | |
| "step": 5550 | |
| }, | |
| { | |
| "entropy": 1.4993074774742126, | |
| "epoch": 2.813765182186235, | |
| "grad_norm": 0.2762647867202759, | |
| "learning_rate": 1.2449392712550609e-06, | |
| "loss": 1.5019, | |
| "mean_token_accuracy": 0.6430062472820282, | |
| "num_tokens": 31738238.0, | |
| "step": 5560 | |
| }, | |
| { | |
| "entropy": 1.5500144839286805, | |
| "epoch": 2.818825910931174, | |
| "grad_norm": 0.4136376678943634, | |
| "learning_rate": 1.2112010796221325e-06, | |
| "loss": 1.5483, | |
| "mean_token_accuracy": 0.6305223643779755, | |
| "num_tokens": 31794084.0, | |
| "step": 5570 | |
| }, | |
| { | |
| "entropy": 1.378357458114624, | |
| "epoch": 2.8238866396761133, | |
| "grad_norm": 0.2796184718608856, | |
| "learning_rate": 1.177462887989204e-06, | |
| "loss": 1.3879, | |
| "mean_token_accuracy": 0.6623157143592835, | |
| "num_tokens": 31850160.0, | |
| "step": 5580 | |
| }, | |
| { | |
| "entropy": 1.5122020602226258, | |
| "epoch": 2.8289473684210527, | |
| "grad_norm": 0.3030454218387604, | |
| "learning_rate": 1.1437246963562754e-06, | |
| "loss": 1.5336, | |
| "mean_token_accuracy": 0.6398816347122193, | |
| "num_tokens": 31908958.0, | |
| "step": 5590 | |
| }, | |
| { | |
| "entropy": 1.4396753072738648, | |
| "epoch": 2.834008097165992, | |
| "grad_norm": 0.329406201839447, | |
| "learning_rate": 1.109986504723347e-06, | |
| "loss": 1.4524, | |
| "mean_token_accuracy": 0.6439715623855591, | |
| "num_tokens": 31963221.0, | |
| "step": 5600 | |
| }, | |
| { | |
| "entropy": 1.4529581308364867, | |
| "epoch": 2.839068825910931, | |
| "grad_norm": 0.30805808305740356, | |
| "learning_rate": 1.0762483130904184e-06, | |
| "loss": 1.4535, | |
| "mean_token_accuracy": 0.6503434360027314, | |
| "num_tokens": 32024028.0, | |
| "step": 5610 | |
| }, | |
| { | |
| "entropy": 1.4559079647064208, | |
| "epoch": 2.8441295546558703, | |
| "grad_norm": 0.2905729115009308, | |
| "learning_rate": 1.0425101214574899e-06, | |
| "loss": 1.4595, | |
| "mean_token_accuracy": 0.6404688119888305, | |
| "num_tokens": 32079570.0, | |
| "step": 5620 | |
| }, | |
| { | |
| "entropy": 1.566865622997284, | |
| "epoch": 2.8491902834008096, | |
| "grad_norm": 0.3712847828865051, | |
| "learning_rate": 1.0087719298245615e-06, | |
| "loss": 1.5897, | |
| "mean_token_accuracy": 0.6281434834003449, | |
| "num_tokens": 32140749.0, | |
| "step": 5630 | |
| }, | |
| { | |
| "entropy": 1.3378282070159913, | |
| "epoch": 2.854251012145749, | |
| "grad_norm": 0.34094497561454773, | |
| "learning_rate": 9.750337381916331e-07, | |
| "loss": 1.3271, | |
| "mean_token_accuracy": 0.6688917458057404, | |
| "num_tokens": 32197723.0, | |
| "step": 5640 | |
| }, | |
| { | |
| "entropy": 1.5701539039611816, | |
| "epoch": 2.8593117408906883, | |
| "grad_norm": 0.3105640113353729, | |
| "learning_rate": 9.412955465587046e-07, | |
| "loss": 1.5691, | |
| "mean_token_accuracy": 0.631563925743103, | |
| "num_tokens": 32253364.0, | |
| "step": 5650 | |
| }, | |
| { | |
| "entropy": 1.3988579750061034, | |
| "epoch": 2.8643724696356276, | |
| "grad_norm": 0.33697089552879333, | |
| "learning_rate": 9.07557354925776e-07, | |
| "loss": 1.3774, | |
| "mean_token_accuracy": 0.6580399334430694, | |
| "num_tokens": 32307759.0, | |
| "step": 5660 | |
| }, | |
| { | |
| "entropy": 1.5067651271820068, | |
| "epoch": 2.869433198380567, | |
| "grad_norm": 0.4209248721599579, | |
| "learning_rate": 8.738191632928476e-07, | |
| "loss": 1.5117, | |
| "mean_token_accuracy": 0.6395208477973938, | |
| "num_tokens": 32361901.0, | |
| "step": 5670 | |
| }, | |
| { | |
| "entropy": 1.4897794604301453, | |
| "epoch": 2.8744939271255063, | |
| "grad_norm": 0.26533105969429016, | |
| "learning_rate": 8.400809716599192e-07, | |
| "loss": 1.492, | |
| "mean_token_accuracy": 0.6359480619430542, | |
| "num_tokens": 32421074.0, | |
| "step": 5680 | |
| }, | |
| { | |
| "entropy": 1.5117918133735657, | |
| "epoch": 2.8795546558704452, | |
| "grad_norm": 0.2814977169036865, | |
| "learning_rate": 8.063427800269906e-07, | |
| "loss": 1.5099, | |
| "mean_token_accuracy": 0.6378594696521759, | |
| "num_tokens": 32475249.0, | |
| "step": 5690 | |
| }, | |
| { | |
| "entropy": 1.5758733749389648, | |
| "epoch": 2.8846153846153846, | |
| "grad_norm": 0.3215586543083191, | |
| "learning_rate": 7.726045883940621e-07, | |
| "loss": 1.5742, | |
| "mean_token_accuracy": 0.6247093558311463, | |
| "num_tokens": 32533856.0, | |
| "step": 5700 | |
| }, | |
| { | |
| "entropy": 1.5476130247116089, | |
| "epoch": 2.889676113360324, | |
| "grad_norm": 0.3249874413013458, | |
| "learning_rate": 7.388663967611337e-07, | |
| "loss": 1.5596, | |
| "mean_token_accuracy": 0.630809611082077, | |
| "num_tokens": 32590748.0, | |
| "step": 5710 | |
| }, | |
| { | |
| "entropy": 1.6522730588912964, | |
| "epoch": 2.8947368421052633, | |
| "grad_norm": 0.30724644660949707, | |
| "learning_rate": 7.051282051282052e-07, | |
| "loss": 1.6494, | |
| "mean_token_accuracy": 0.6230604112148285, | |
| "num_tokens": 32646249.0, | |
| "step": 5720 | |
| }, | |
| { | |
| "entropy": 1.544532060623169, | |
| "epoch": 2.899797570850202, | |
| "grad_norm": 0.2921552062034607, | |
| "learning_rate": 6.713900134952767e-07, | |
| "loss": 1.5418, | |
| "mean_token_accuracy": 0.6278964817523957, | |
| "num_tokens": 32705968.0, | |
| "step": 5730 | |
| }, | |
| { | |
| "entropy": 1.6792848348617553, | |
| "epoch": 2.9048582995951415, | |
| "grad_norm": 0.35362908244132996, | |
| "learning_rate": 6.376518218623482e-07, | |
| "loss": 1.6863, | |
| "mean_token_accuracy": 0.606015944480896, | |
| "num_tokens": 32764283.0, | |
| "step": 5740 | |
| }, | |
| { | |
| "entropy": 1.3941245913505553, | |
| "epoch": 2.909919028340081, | |
| "grad_norm": 0.3051432967185974, | |
| "learning_rate": 6.039136302294198e-07, | |
| "loss": 1.3916, | |
| "mean_token_accuracy": 0.655018413066864, | |
| "num_tokens": 32821604.0, | |
| "step": 5750 | |
| }, | |
| { | |
| "entropy": 1.3279268383979796, | |
| "epoch": 2.91497975708502, | |
| "grad_norm": 0.28279706835746765, | |
| "learning_rate": 5.701754385964912e-07, | |
| "loss": 1.3252, | |
| "mean_token_accuracy": 0.6721781909465789, | |
| "num_tokens": 32882742.0, | |
| "step": 5760 | |
| }, | |
| { | |
| "entropy": 1.40997371673584, | |
| "epoch": 2.9200404858299596, | |
| "grad_norm": 0.27882078289985657, | |
| "learning_rate": 5.364372469635628e-07, | |
| "loss": 1.4014, | |
| "mean_token_accuracy": 0.653525573015213, | |
| "num_tokens": 32940852.0, | |
| "step": 5770 | |
| }, | |
| { | |
| "entropy": 1.4527880668640136, | |
| "epoch": 2.925101214574899, | |
| "grad_norm": 0.34039291739463806, | |
| "learning_rate": 5.026990553306344e-07, | |
| "loss": 1.4668, | |
| "mean_token_accuracy": 0.6496657609939576, | |
| "num_tokens": 32993007.0, | |
| "step": 5780 | |
| }, | |
| { | |
| "entropy": 1.4252225756645203, | |
| "epoch": 2.9301619433198383, | |
| "grad_norm": 0.33022540807724, | |
| "learning_rate": 4.6896086369770585e-07, | |
| "loss": 1.4163, | |
| "mean_token_accuracy": 0.6603101253509521, | |
| "num_tokens": 33052311.0, | |
| "step": 5790 | |
| }, | |
| { | |
| "entropy": 1.3463350296020509, | |
| "epoch": 2.9352226720647776, | |
| "grad_norm": 0.3052782416343689, | |
| "learning_rate": 4.352226720647774e-07, | |
| "loss": 1.3402, | |
| "mean_token_accuracy": 0.6646123170852661, | |
| "num_tokens": 33107681.0, | |
| "step": 5800 | |
| }, | |
| { | |
| "entropy": 1.4629117488861083, | |
| "epoch": 2.9402834008097165, | |
| "grad_norm": 0.3405231535434723, | |
| "learning_rate": 4.0148448043184886e-07, | |
| "loss": 1.4697, | |
| "mean_token_accuracy": 0.6421536803245544, | |
| "num_tokens": 33160562.0, | |
| "step": 5810 | |
| }, | |
| { | |
| "entropy": 1.5115989089012145, | |
| "epoch": 2.945344129554656, | |
| "grad_norm": 0.25086501240730286, | |
| "learning_rate": 3.677462887989204e-07, | |
| "loss": 1.5254, | |
| "mean_token_accuracy": 0.636814546585083, | |
| "num_tokens": 33219166.0, | |
| "step": 5820 | |
| }, | |
| { | |
| "entropy": 1.5050144791603088, | |
| "epoch": 2.950404858299595, | |
| "grad_norm": 0.30874550342559814, | |
| "learning_rate": 3.34008097165992e-07, | |
| "loss": 1.5059, | |
| "mean_token_accuracy": 0.6406992137432098, | |
| "num_tokens": 33275918.0, | |
| "step": 5830 | |
| }, | |
| { | |
| "entropy": 1.4055772423744202, | |
| "epoch": 2.9554655870445345, | |
| "grad_norm": 0.37710893154144287, | |
| "learning_rate": 3.0026990553306346e-07, | |
| "loss": 1.4007, | |
| "mean_token_accuracy": 0.6559918403625489, | |
| "num_tokens": 33330271.0, | |
| "step": 5840 | |
| }, | |
| { | |
| "entropy": 1.514758288860321, | |
| "epoch": 2.9605263157894735, | |
| "grad_norm": 0.2986261248588562, | |
| "learning_rate": 2.66531713900135e-07, | |
| "loss": 1.5106, | |
| "mean_token_accuracy": 0.6324501454830169, | |
| "num_tokens": 33390790.0, | |
| "step": 5850 | |
| }, | |
| { | |
| "entropy": 1.4632804989814758, | |
| "epoch": 2.965587044534413, | |
| "grad_norm": 0.297039657831192, | |
| "learning_rate": 2.327935222672065e-07, | |
| "loss": 1.4562, | |
| "mean_token_accuracy": 0.652844125032425, | |
| "num_tokens": 33451092.0, | |
| "step": 5860 | |
| }, | |
| { | |
| "entropy": 1.619661772251129, | |
| "epoch": 2.970647773279352, | |
| "grad_norm": 0.32788631319999695, | |
| "learning_rate": 1.9905533063427803e-07, | |
| "loss": 1.6222, | |
| "mean_token_accuracy": 0.6278849899768829, | |
| "num_tokens": 33509293.0, | |
| "step": 5870 | |
| }, | |
| { | |
| "entropy": 1.6123327970504762, | |
| "epoch": 2.9757085020242915, | |
| "grad_norm": 0.30364230275154114, | |
| "learning_rate": 1.6531713900134953e-07, | |
| "loss": 1.62, | |
| "mean_token_accuracy": 0.6268645524978638, | |
| "num_tokens": 33567748.0, | |
| "step": 5880 | |
| }, | |
| { | |
| "entropy": 1.465350294113159, | |
| "epoch": 2.980769230769231, | |
| "grad_norm": 0.271182119846344, | |
| "learning_rate": 1.3157894736842107e-07, | |
| "loss": 1.4767, | |
| "mean_token_accuracy": 0.6428022742271423, | |
| "num_tokens": 33626678.0, | |
| "step": 5890 | |
| }, | |
| { | |
| "entropy": 1.5106886863708495, | |
| "epoch": 2.98582995951417, | |
| "grad_norm": 0.30039140582084656, | |
| "learning_rate": 9.784075573549259e-08, | |
| "loss": 1.501, | |
| "mean_token_accuracy": 0.6413563072681427, | |
| "num_tokens": 33681713.0, | |
| "step": 5900 | |
| }, | |
| { | |
| "entropy": 1.6268660426139832, | |
| "epoch": 2.9908906882591095, | |
| "grad_norm": 0.30086028575897217, | |
| "learning_rate": 6.41025641025641e-08, | |
| "loss": 1.6387, | |
| "mean_token_accuracy": 0.6218379974365235, | |
| "num_tokens": 33738372.0, | |
| "step": 5910 | |
| }, | |
| { | |
| "entropy": 1.5718028783798217, | |
| "epoch": 2.9959514170040484, | |
| "grad_norm": 0.3744632601737976, | |
| "learning_rate": 3.036437246963563e-08, | |
| "loss": 1.5594, | |
| "mean_token_accuracy": 0.6297330737113953, | |
| "num_tokens": 33794850.0, | |
| "step": 5920 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 5928, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.2845340765506765e+17, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
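
The fields above (`log_history`, `logging_steps`, `max_steps`, `total_flos`, `TrainerControl`) match the layout of a Hugging Face `trainer_state.json` checkpoint file. Below is a minimal sketch of how one might inspect such a log, assuming the JSON is saved verbatim (without the table wrapping) as `trainer_state.json`; the field names come from the file itself, while the file path, smoothing window, and printed summary are illustrative choices, not part of the original run.

```python
# Illustrative sketch only: summarize loss and accuracy from a trainer_state.json
# shaped like the log above. Paths and the smoothing window are assumptions.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

records = state["log_history"]  # one dict per logged step (logging_steps = 10 here)

steps = [r["step"] for r in records if "loss" in r]
losses = [r["loss"] for r in records if "loss" in r]
accs = [r["mean_token_accuracy"] for r in records if "mean_token_accuracy" in r]

def moving_average(xs, window=20):
    """Trailing moving average to smooth the noisy per-step loss."""
    out = []
    for i in range(len(xs)):
        lo = max(0, i - window + 1)
        out.append(sum(xs[lo:i + 1]) / (i + 1 - lo))
    return out

smoothed = moving_average(losses)
print(f"logged points: {len(steps)} (last step {steps[-1]} of {state['max_steps']})")
print(f"final raw loss: {losses[-1]:.4f}, smoothed: {smoothed[-1]:.4f}")
print(f"final mean_token_accuracy: {accs[-1]:.4f}")
```

Run against the log above, this would report the last logged step (5920 of 5928) together with the final raw and smoothed loss and token accuracy.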