| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.643880926130099, | |
| "eval_steps": 100, | |
| "global_step": 600, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.208277304470539, | |
| "epoch": 0.044101433296582136, | |
| "grad_norm": 0.7034044861793518, | |
| "learning_rate": 3.6e-05, | |
| "loss": 1.8075, | |
| "mean_token_accuracy": 0.7044319450855255, | |
| "num_tokens": 162027.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 1.2619145691394806, | |
| "epoch": 0.08820286659316427, | |
| "grad_norm": 0.56658935546875, | |
| "learning_rate": 7.6e-05, | |
| "loss": 1.7395, | |
| "mean_token_accuracy": 0.7111444145441055, | |
| "num_tokens": 322033.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 1.3051191717386246, | |
| "epoch": 0.13230429988974643, | |
| "grad_norm": 0.9701246023178101, | |
| "learning_rate": 0.000116, | |
| "loss": 1.4759, | |
| "mean_token_accuracy": 0.7372552484273911, | |
| "num_tokens": 493134.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 1.2549610823392867, | |
| "epoch": 0.17640573318632854, | |
| "grad_norm": 0.7130260467529297, | |
| "learning_rate": 0.00015600000000000002, | |
| "loss": 1.1858, | |
| "mean_token_accuracy": 0.7737368658185005, | |
| "num_tokens": 657636.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 1.0649129822850227, | |
| "epoch": 0.2205071664829107, | |
| "grad_norm": 0.31629839539527893, | |
| "learning_rate": 0.000196, | |
| "loss": 1.0158, | |
| "mean_token_accuracy": 0.7959408611059189, | |
| "num_tokens": 809381.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 0.9025423139333725, | |
| "epoch": 0.26460859977949286, | |
| "grad_norm": 0.2634122669696808, | |
| "learning_rate": 0.0001971473851030111, | |
| "loss": 0.8932, | |
| "mean_token_accuracy": 0.8144056230783463, | |
| "num_tokens": 970344.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 0.8994027122855186, | |
| "epoch": 0.308710033076075, | |
| "grad_norm": 0.29325830936431885, | |
| "learning_rate": 0.00019397781299524563, | |
| "loss": 0.8828, | |
| "mean_token_accuracy": 0.8157614529132843, | |
| "num_tokens": 1129434.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 0.8638058304786682, | |
| "epoch": 0.3528114663726571, | |
| "grad_norm": 0.22787100076675415, | |
| "learning_rate": 0.0001908082408874802, | |
| "loss": 0.8518, | |
| "mean_token_accuracy": 0.8206798136234283, | |
| "num_tokens": 1290818.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 0.8639321938157082, | |
| "epoch": 0.39691289966923926, | |
| "grad_norm": 0.27325868606567383, | |
| "learning_rate": 0.00018763866877971475, | |
| "loss": 0.8633, | |
| "mean_token_accuracy": 0.81620042771101, | |
| "num_tokens": 1452401.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 0.8432708650827407, | |
| "epoch": 0.4410143329658214, | |
| "grad_norm": 0.29134032130241394, | |
| "learning_rate": 0.0001844690966719493, | |
| "loss": 0.8225, | |
| "mean_token_accuracy": 0.8236450552940369, | |
| "num_tokens": 1616085.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.4410143329658214, | |
| "eval_entropy": 0.7245843861952866, | |
| "eval_loss": 0.6365505456924438, | |
| "eval_mean_token_accuracy": 0.8594882246291283, | |
| "eval_num_tokens": 1616085.0, | |
| "eval_runtime": 8.1048, | |
| "eval_samples_per_second": 99.447, | |
| "eval_steps_per_second": 12.462, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 0.7841475278139114, | |
| "epoch": 0.48511576626240355, | |
| "grad_norm": 0.25806924700737, | |
| "learning_rate": 0.00018129952456418384, | |
| "loss": 0.7675, | |
| "mean_token_accuracy": 0.8291461855173111, | |
| "num_tokens": 1780700.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 0.8083247914910316, | |
| "epoch": 0.5292171995589857, | |
| "grad_norm": 0.2405901849269867, | |
| "learning_rate": 0.00017812995245641838, | |
| "loss": 0.7929, | |
| "mean_token_accuracy": 0.8249855980277061, | |
| "num_tokens": 1944758.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 0.8017621666193009, | |
| "epoch": 0.5733186328555678, | |
| "grad_norm": 0.2787521183490753, | |
| "learning_rate": 0.00017496038034865293, | |
| "loss": 0.8007, | |
| "mean_token_accuracy": 0.8232445836067199, | |
| "num_tokens": 2108373.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 0.783287800848484, | |
| "epoch": 0.61742006615215, | |
| "grad_norm": 0.24215690791606903, | |
| "learning_rate": 0.0001717908082408875, | |
| "loss": 0.7806, | |
| "mean_token_accuracy": 0.82769885212183, | |
| "num_tokens": 2279985.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 0.7877024173736572, | |
| "epoch": 0.6615214994487321, | |
| "grad_norm": 0.32073867321014404, | |
| "learning_rate": 0.00016862123613312205, | |
| "loss": 0.7749, | |
| "mean_token_accuracy": 0.8289618909358978, | |
| "num_tokens": 2437290.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 0.7758993163704873, | |
| "epoch": 0.7056229327453142, | |
| "grad_norm": 0.24493683874607086, | |
| "learning_rate": 0.0001654516640253566, | |
| "loss": 0.7613, | |
| "mean_token_accuracy": 0.8307989597320556, | |
| "num_tokens": 2599040.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 0.7710816964507103, | |
| "epoch": 0.7497243660418964, | |
| "grad_norm": 0.26176023483276367, | |
| "learning_rate": 0.00016228209191759114, | |
| "loss": 0.7701, | |
| "mean_token_accuracy": 0.8283671870827675, | |
| "num_tokens": 2763931.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 0.7687478274106979, | |
| "epoch": 0.7938257993384785, | |
| "grad_norm": 0.33372247219085693, | |
| "learning_rate": 0.00015911251980982568, | |
| "loss": 0.7599, | |
| "mean_token_accuracy": 0.8326913744211197, | |
| "num_tokens": 2921138.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 0.7683898612856865, | |
| "epoch": 0.8379272326350606, | |
| "grad_norm": 0.30479806661605835, | |
| "learning_rate": 0.00015594294770206023, | |
| "loss": 0.7652, | |
| "mean_token_accuracy": 0.8292697682976723, | |
| "num_tokens": 3078075.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 0.7426786199212074, | |
| "epoch": 0.8820286659316428, | |
| "grad_norm": 0.30385708808898926, | |
| "learning_rate": 0.0001527733755942948, | |
| "loss": 0.7484, | |
| "mean_token_accuracy": 0.834077812731266, | |
| "num_tokens": 3250532.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.8820286659316428, | |
| "eval_entropy": 0.6650676030923824, | |
| "eval_loss": 0.5983571410179138, | |
| "eval_mean_token_accuracy": 0.8642630500368552, | |
| "eval_num_tokens": 3250532.0, | |
| "eval_runtime": 8.088, | |
| "eval_samples_per_second": 99.654, | |
| "eval_steps_per_second": 12.488, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 0.7351636976003647, | |
| "epoch": 0.9261300992282249, | |
| "grad_norm": 0.3023395836353302, | |
| "learning_rate": 0.00014960380348652932, | |
| "loss": 0.7276, | |
| "mean_token_accuracy": 0.8379455998539924, | |
| "num_tokens": 3412136.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 0.7627917662262916, | |
| "epoch": 0.9702315325248071, | |
| "grad_norm": 0.27449166774749756, | |
| "learning_rate": 0.00014643423137876386, | |
| "loss": 0.7657, | |
| "mean_token_accuracy": 0.832044218480587, | |
| "num_tokens": 3578511.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 0.7629164243355776, | |
| "epoch": 1.0132304299889747, | |
| "grad_norm": 0.3300953209400177, | |
| "learning_rate": 0.0001432646592709984, | |
| "loss": 0.7491, | |
| "mean_token_accuracy": 0.8328371659303323, | |
| "num_tokens": 3722301.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 0.746186052262783, | |
| "epoch": 1.0573318632855568, | |
| "grad_norm": 0.3027808666229248, | |
| "learning_rate": 0.00014009508716323295, | |
| "loss": 0.7462, | |
| "mean_token_accuracy": 0.8322931200265884, | |
| "num_tokens": 3896679.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 0.7389517679810524, | |
| "epoch": 1.101433296582139, | |
| "grad_norm": 0.2887279689311981, | |
| "learning_rate": 0.00013692551505546752, | |
| "loss": 0.7406, | |
| "mean_token_accuracy": 0.8339706152677536, | |
| "num_tokens": 4060607.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 0.7400932610034943, | |
| "epoch": 1.145534729878721, | |
| "grad_norm": 0.29185837507247925, | |
| "learning_rate": 0.00013375594294770207, | |
| "loss": 0.7335, | |
| "mean_token_accuracy": 0.8348478749394417, | |
| "num_tokens": 4235108.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 0.7405846387147903, | |
| "epoch": 1.1896361631753032, | |
| "grad_norm": 0.3650504946708679, | |
| "learning_rate": 0.0001305863708399366, | |
| "loss": 0.7242, | |
| "mean_token_accuracy": 0.8392871618270874, | |
| "num_tokens": 4399960.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 0.7342920154333115, | |
| "epoch": 1.2337375964718853, | |
| "grad_norm": 0.32232147455215454, | |
| "learning_rate": 0.00012741679873217116, | |
| "loss": 0.7329, | |
| "mean_token_accuracy": 0.8371502041816712, | |
| "num_tokens": 4562192.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 0.7346035555005074, | |
| "epoch": 1.2778390297684674, | |
| "grad_norm": 0.35367703437805176, | |
| "learning_rate": 0.0001242472266244057, | |
| "loss": 0.7251, | |
| "mean_token_accuracy": 0.8377319499850273, | |
| "num_tokens": 4721881.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 0.7108615353703499, | |
| "epoch": 1.3219404630650495, | |
| "grad_norm": 0.2766059637069702, | |
| "learning_rate": 0.00012107765451664026, | |
| "loss": 0.7134, | |
| "mean_token_accuracy": 0.8399718284606934, | |
| "num_tokens": 4885379.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.3219404630650495, | |
| "eval_entropy": 0.6444886132042007, | |
| "eval_loss": 0.5822195410728455, | |
| "eval_mean_token_accuracy": 0.866522473864036, | |
| "eval_num_tokens": 4885379.0, | |
| "eval_runtime": 8.0528, | |
| "eval_samples_per_second": 100.089, | |
| "eval_steps_per_second": 12.542, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 0.7376143395900726, | |
| "epoch": 1.3660418963616316, | |
| "grad_norm": 0.2916136384010315, | |
| "learning_rate": 0.0001179080824088748, | |
| "loss": 0.7378, | |
| "mean_token_accuracy": 0.8362751066684723, | |
| "num_tokens": 5049776.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 0.7360161304473877, | |
| "epoch": 1.4101433296582138, | |
| "grad_norm": 0.3246314525604248, | |
| "learning_rate": 0.00011473851030110936, | |
| "loss": 0.7383, | |
| "mean_token_accuracy": 0.8336721956729889, | |
| "num_tokens": 5209191.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 0.7085310086607933, | |
| "epoch": 1.454244762954796, | |
| "grad_norm": 0.2847578823566437, | |
| "learning_rate": 0.00011156893819334391, | |
| "loss": 0.7041, | |
| "mean_token_accuracy": 0.8415971100330353, | |
| "num_tokens": 5375904.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 0.7118788257241249, | |
| "epoch": 1.4983461962513782, | |
| "grad_norm": 0.3348420262336731, | |
| "learning_rate": 0.00010839936608557845, | |
| "loss": 0.7121, | |
| "mean_token_accuracy": 0.839908429980278, | |
| "num_tokens": 5530092.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 0.7398809120059013, | |
| "epoch": 1.5424476295479603, | |
| "grad_norm": 0.31565436720848083, | |
| "learning_rate": 0.00010522979397781301, | |
| "loss": 0.738, | |
| "mean_token_accuracy": 0.8339577659964561, | |
| "num_tokens": 5680997.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 0.7200135216116905, | |
| "epoch": 1.5865490628445424, | |
| "grad_norm": 0.3452684283256531, | |
| "learning_rate": 0.00010206022187004756, | |
| "loss": 0.726, | |
| "mean_token_accuracy": 0.8387370139360428, | |
| "num_tokens": 5847675.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 0.7074938386678695, | |
| "epoch": 1.6306504961411246, | |
| "grad_norm": 0.307235985994339, | |
| "learning_rate": 9.889064976228209e-05, | |
| "loss": 0.689, | |
| "mean_token_accuracy": 0.8433170482516289, | |
| "num_tokens": 6014120.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 0.713779816031456, | |
| "epoch": 1.6747519294377067, | |
| "grad_norm": 0.33486098051071167, | |
| "learning_rate": 9.572107765451665e-05, | |
| "loss": 0.7166, | |
| "mean_token_accuracy": 0.8390020251274108, | |
| "num_tokens": 6175759.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 0.7257090628147125, | |
| "epoch": 1.718853362734289, | |
| "grad_norm": 0.29937514662742615, | |
| "learning_rate": 9.255150554675119e-05, | |
| "loss": 0.7296, | |
| "mean_token_accuracy": 0.8377068281173706, | |
| "num_tokens": 6337886.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 0.7071069806814194, | |
| "epoch": 1.7629547960308711, | |
| "grad_norm": 0.29501640796661377, | |
| "learning_rate": 8.938193343898574e-05, | |
| "loss": 0.7023, | |
| "mean_token_accuracy": 0.8423839300870896, | |
| "num_tokens": 6499459.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.7629547960308711, | |
| "eval_entropy": 0.6217773565561464, | |
| "eval_loss": 0.5744128227233887, | |
| "eval_mean_token_accuracy": 0.8680370243469088, | |
| "eval_num_tokens": 6499459.0, | |
| "eval_runtime": 8.0487, | |
| "eval_samples_per_second": 100.14, | |
| "eval_steps_per_second": 12.549, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 0.7010652974247933, | |
| "epoch": 1.8070562293274532, | |
| "grad_norm": 0.3465122580528259, | |
| "learning_rate": 8.62123613312203e-05, | |
| "loss": 0.7056, | |
| "mean_token_accuracy": 0.8412188425660133, | |
| "num_tokens": 6661490.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 0.7035703644156456, | |
| "epoch": 1.8511576626240354, | |
| "grad_norm": 0.2653435170650482, | |
| "learning_rate": 8.304278922345484e-05, | |
| "loss": 0.7047, | |
| "mean_token_accuracy": 0.8421282634139061, | |
| "num_tokens": 6806303.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 0.7134277895092964, | |
| "epoch": 1.8952590959206175, | |
| "grad_norm": 0.3735100328922272, | |
| "learning_rate": 7.987321711568939e-05, | |
| "loss": 0.7222, | |
| "mean_token_accuracy": 0.8374130159616471, | |
| "num_tokens": 6977619.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 0.6837946087121963, | |
| "epoch": 1.9393605292171996, | |
| "grad_norm": 0.3172140121459961, | |
| "learning_rate": 7.670364500792393e-05, | |
| "loss": 0.6872, | |
| "mean_token_accuracy": 0.8447714239358902, | |
| "num_tokens": 7134964.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 0.7037998244166375, | |
| "epoch": 1.9834619625137817, | |
| "grad_norm": 0.3320825397968292, | |
| "learning_rate": 7.353407290015848e-05, | |
| "loss": 0.7065, | |
| "mean_token_accuracy": 0.8428374692797661, | |
| "num_tokens": 7302144.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 0.6984619360703689, | |
| "epoch": 2.0264608599779494, | |
| "grad_norm": 0.285248339176178, | |
| "learning_rate": 7.036450079239303e-05, | |
| "loss": 0.6904, | |
| "mean_token_accuracy": 0.8467852473258972, | |
| "num_tokens": 7443873.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 0.7012363314628601, | |
| "epoch": 2.0705622932745316, | |
| "grad_norm": 0.32722488045692444, | |
| "learning_rate": 6.719492868462758e-05, | |
| "loss": 0.6951, | |
| "mean_token_accuracy": 0.8441829264163971, | |
| "num_tokens": 7602458.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 0.7040385738015175, | |
| "epoch": 2.1146637265711137, | |
| "grad_norm": 0.3185954689979553, | |
| "learning_rate": 6.402535657686212e-05, | |
| "loss": 0.7082, | |
| "mean_token_accuracy": 0.8414013043045998, | |
| "num_tokens": 7763939.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 0.6956586122512818, | |
| "epoch": 2.158765159867696, | |
| "grad_norm": 0.34575557708740234, | |
| "learning_rate": 6.0855784469096676e-05, | |
| "loss": 0.695, | |
| "mean_token_accuracy": 0.8428655683994293, | |
| "num_tokens": 7927670.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 0.702117745578289, | |
| "epoch": 2.202866593164278, | |
| "grad_norm": 0.3255711793899536, | |
| "learning_rate": 5.768621236133123e-05, | |
| "loss": 0.7111, | |
| "mean_token_accuracy": 0.8387768477201462, | |
| "num_tokens": 8090934.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.202866593164278, | |
| "eval_entropy": 0.6130540751584685, | |
| "eval_loss": 0.5708180665969849, | |
| "eval_mean_token_accuracy": 0.8685296850629373, | |
| "eval_num_tokens": 8090934.0, | |
| "eval_runtime": 8.0458, | |
| "eval_samples_per_second": 100.176, | |
| "eval_steps_per_second": 12.553, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 0.6894026726484299, | |
| "epoch": 2.24696802646086, | |
| "grad_norm": 0.3195938467979431, | |
| "learning_rate": 5.451664025356578e-05, | |
| "loss": 0.6884, | |
| "mean_token_accuracy": 0.8427316024899483, | |
| "num_tokens": 8260273.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 0.6812330767512321, | |
| "epoch": 2.291069459757442, | |
| "grad_norm": 0.4105236232280731, | |
| "learning_rate": 5.134706814580032e-05, | |
| "loss": 0.6849, | |
| "mean_token_accuracy": 0.8467972829937935, | |
| "num_tokens": 8418421.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 0.6863151758909225, | |
| "epoch": 2.3351708930540243, | |
| "grad_norm": 0.3161937892436981, | |
| "learning_rate": 4.817749603803487e-05, | |
| "loss": 0.6813, | |
| "mean_token_accuracy": 0.8450520291924477, | |
| "num_tokens": 8584244.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 0.6781483091413975, | |
| "epoch": 2.3792723263506064, | |
| "grad_norm": 0.37656792998313904, | |
| "learning_rate": 4.5007923930269414e-05, | |
| "loss": 0.6754, | |
| "mean_token_accuracy": 0.8465494722127914, | |
| "num_tokens": 8748714.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 0.6889190331101418, | |
| "epoch": 2.4233737596471885, | |
| "grad_norm": 0.3911200761795044, | |
| "learning_rate": 4.1838351822503966e-05, | |
| "loss": 0.7013, | |
| "mean_token_accuracy": 0.8418177396059037, | |
| "num_tokens": 8908894.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 0.6950134262442589, | |
| "epoch": 2.4674751929437706, | |
| "grad_norm": 0.32076194882392883, | |
| "learning_rate": 3.866877971473851e-05, | |
| "loss": 0.7057, | |
| "mean_token_accuracy": 0.8429578125476838, | |
| "num_tokens": 9068647.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 0.6590564094483853, | |
| "epoch": 2.5115766262403527, | |
| "grad_norm": 0.33828845620155334, | |
| "learning_rate": 3.549920760697306e-05, | |
| "loss": 0.6653, | |
| "mean_token_accuracy": 0.8498437628149986, | |
| "num_tokens": 9241573.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 0.6963606104254723, | |
| "epoch": 2.555678059536935, | |
| "grad_norm": 0.3341946303844452, | |
| "learning_rate": 3.2329635499207614e-05, | |
| "loss": 0.7008, | |
| "mean_token_accuracy": 0.8427884921431541, | |
| "num_tokens": 9402668.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 0.7054937720298767, | |
| "epoch": 2.599779492833517, | |
| "grad_norm": 0.2775990962982178, | |
| "learning_rate": 2.9160063391442156e-05, | |
| "loss": 0.7011, | |
| "mean_token_accuracy": 0.841389861702919, | |
| "num_tokens": 9566515.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 0.7010829344391822, | |
| "epoch": 2.643880926130099, | |
| "grad_norm": 0.35286372900009155, | |
| "learning_rate": 2.5990491283676704e-05, | |
| "loss": 0.6916, | |
| "mean_token_accuracy": 0.8444753900170326, | |
| "num_tokens": 9737713.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 2.643880926130099, | |
| "eval_entropy": 0.6011341580069891, | |
| "eval_loss": 0.5674853324890137, | |
| "eval_mean_token_accuracy": 0.8689056859158053, | |
| "eval_num_tokens": 9737713.0, | |
| "eval_runtime": 8.0369, | |
| "eval_samples_per_second": 100.287, | |
| "eval_steps_per_second": 12.567, | |
| "step": 600 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 681, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.797942892344115e+16, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |