{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.7500875043752186,
  "eval_steps": 500,
  "global_step": 5000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00035001750087504374,
      "grad_norm": 7.988319396972656,
      "learning_rate": 5e-05,
      "loss": 0.2364,
      "mean_token_accuracy": 0.8999999761581421,
      "num_tokens": 452.0,
      "step": 1
    },
    {
      "epoch": 0.0035001750087504373,
      "grad_norm": 0.8047283291816711,
      "learning_rate": 4.991e-05,
      "loss": 0.1339,
      "mean_token_accuracy": 0.966666665342119,
      "num_tokens": 4674.0,
      "step": 10
    },
    {
      "epoch": 0.007000350017500875,
      "grad_norm": 0.08694641292095184,
      "learning_rate": 4.981e-05,
      "loss": 0.0457,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 9193.0,
      "step": 20
    },
    {
      "epoch": 0.010500525026251312,
      "grad_norm": 9.807371139526367,
      "learning_rate": 4.9710000000000003e-05,
      "loss": 0.1587,
      "mean_token_accuracy": 0.9599999904632568,
      "num_tokens": 13953.0,
      "step": 30
    },
    {
      "epoch": 0.01400070003500175,
      "grad_norm": 0.12450232356786728,
      "learning_rate": 4.961e-05,
      "loss": 0.107,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 18471.0,
      "step": 40
    },
    {
      "epoch": 0.01750087504375219,
      "grad_norm": 3.2105650901794434,
      "learning_rate": 4.951e-05,
      "loss": 0.1113,
      "mean_token_accuracy": 0.9649999916553498,
      "num_tokens": 23060.0,
      "step": 50
    },
    {
      "epoch": 0.021001050052502624,
      "grad_norm": 0.6051918268203735,
      "learning_rate": 4.941e-05,
      "loss": 0.0627,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 27672.0,
      "step": 60
    },
    {
      "epoch": 0.024501225061253063,
      "grad_norm": 5.309004306793213,
      "learning_rate": 4.931e-05,
      "loss": 0.125,
      "mean_token_accuracy": 0.9649999916553498,
      "num_tokens": 32267.0,
      "step": 70
    },
    {
      "epoch": 0.0280014000700035,
      "grad_norm": 3.3586971759796143,
      "learning_rate": 4.921e-05,
      "loss": 0.0377,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 36884.0,
      "step": 80
    },
    {
      "epoch": 0.03150157507875394,
      "grad_norm": 4.870711803436279,
      "learning_rate": 4.911e-05,
      "loss": 0.0644,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 41583.0,
      "step": 90
    },
    {
      "epoch": 0.03500175008750438,
      "grad_norm": 0.014425868168473244,
      "learning_rate": 4.901e-05,
      "loss": 0.0432,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 46184.0,
      "step": 100
    },
    {
      "epoch": 0.038501925096254816,
      "grad_norm": 8.833477020263672,
      "learning_rate": 4.891e-05,
      "loss": 0.093,
      "mean_token_accuracy": 0.9749999940395355,
      "num_tokens": 50776.0,
      "step": 110
    },
    {
      "epoch": 0.04200210010500525,
      "grad_norm": 0.19166199862957,
      "learning_rate": 4.881e-05,
      "loss": 0.1521,
      "mean_token_accuracy": 0.9649999976158142,
      "num_tokens": 55443.0,
      "step": 120
    },
    {
      "epoch": 0.04550227511375569,
      "grad_norm": 0.3452470302581787,
      "learning_rate": 4.871e-05,
      "loss": 0.0972,
      "mean_token_accuracy": 0.9699999928474426,
      "num_tokens": 60036.0,
      "step": 130
    },
    {
      "epoch": 0.049002450122506126,
      "grad_norm": 4.509720802307129,
      "learning_rate": 4.861e-05,
      "loss": 0.1143,
      "mean_token_accuracy": 0.9749999940395355,
      "num_tokens": 64565.0,
      "step": 140
    },
    {
      "epoch": 0.052502625131256565,
      "grad_norm": 0.35976719856262207,
      "learning_rate": 4.851e-05,
      "loss": 0.032,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 69185.0,
      "step": 150
    },
    {
      "epoch": 0.056002800140007,
      "grad_norm": 5.863715648651123,
      "learning_rate": 4.841e-05,
      "loss": 0.0522,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 73698.0,
      "step": 160
    },
    {
      "epoch": 0.05950297514875744,
      "grad_norm": 0.1498999446630478,
      "learning_rate": 4.8309999999999997e-05,
      "loss": 0.1335,
      "mean_token_accuracy": 0.9699999928474426,
      "num_tokens": 78163.0,
      "step": 170
    },
    {
      "epoch": 0.06300315015750788,
      "grad_norm": 1.365043044090271,
      "learning_rate": 4.821e-05,
      "loss": 0.0818,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 82718.0,
      "step": 180
    },
    {
      "epoch": 0.06650332516625831,
      "grad_norm": 7.134900093078613,
      "learning_rate": 4.8110000000000005e-05,
      "loss": 0.0881,
      "mean_token_accuracy": 0.9749999940395355,
      "num_tokens": 87209.0,
      "step": 190
    },
    {
      "epoch": 0.07000350017500875,
      "grad_norm": 1.4992774724960327,
      "learning_rate": 4.801e-05,
      "loss": 0.0612,
      "mean_token_accuracy": 0.9749999940395355,
      "num_tokens": 91760.0,
      "step": 200
    },
    {
      "epoch": 0.07350367518375919,
      "grad_norm": 0.02836497873067856,
      "learning_rate": 4.791000000000001e-05,
      "loss": 0.0553,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 96423.0,
      "step": 210
    },
    {
      "epoch": 0.07700385019250963,
      "grad_norm": 0.020807797089219093,
      "learning_rate": 4.7810000000000005e-05,
      "loss": 0.0672,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 100968.0,
      "step": 220
    },
    {
      "epoch": 0.08050402520126006,
      "grad_norm": 0.03391553834080696,
      "learning_rate": 4.771e-05,
      "loss": 0.0326,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 105637.0,
      "step": 230
    },
    {
      "epoch": 0.0840042002100105,
      "grad_norm": 10.723307609558105,
      "learning_rate": 4.761000000000001e-05,
      "loss": 0.0897,
      "mean_token_accuracy": 0.9699999928474426,
      "num_tokens": 110219.0,
      "step": 240
    },
    {
      "epoch": 0.08750437521876094,
      "grad_norm": 1.3650544881820679,
      "learning_rate": 4.7510000000000004e-05,
      "loss": 0.0266,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 114796.0,
      "step": 250
    },
    {
      "epoch": 0.09100455022751137,
      "grad_norm": 0.07940108329057693,
      "learning_rate": 4.741e-05,
      "loss": 0.01,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 119315.0,
      "step": 260
    },
    {
      "epoch": 0.09450472523626181,
      "grad_norm": 0.05688886716961861,
      "learning_rate": 4.7310000000000006e-05,
      "loss": 0.0224,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 123868.0,
      "step": 270
    },
    {
      "epoch": 0.09800490024501225,
      "grad_norm": 0.00870002806186676,
      "learning_rate": 4.7210000000000004e-05,
      "loss": 0.012,
      "mean_token_accuracy": 1.0,
      "num_tokens": 128591.0,
      "step": 280
    },
    {
      "epoch": 0.10150507525376269,
      "grad_norm": 55.893104553222656,
      "learning_rate": 4.711e-05,
      "loss": 0.0297,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 133283.0,
      "step": 290
    },
    {
      "epoch": 0.10500525026251313,
      "grad_norm": 0.003983665257692337,
      "learning_rate": 4.7010000000000006e-05,
      "loss": 0.2061,
      "mean_token_accuracy": 0.9649999916553498,
      "num_tokens": 138008.0,
      "step": 300
    },
    {
      "epoch": 0.10850542527126357,
      "grad_norm": 7.029219627380371,
      "learning_rate": 4.691e-05,
      "loss": 0.0849,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 142566.0,
      "step": 310
    },
    {
      "epoch": 0.112005600280014,
      "grad_norm": 2.8149940967559814,
      "learning_rate": 4.681e-05,
      "loss": 0.0667,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 147213.0,
      "step": 320
    },
    {
      "epoch": 0.11550577528876443,
      "grad_norm": 2.192121982574463,
      "learning_rate": 4.6710000000000005e-05,
      "loss": 0.0638,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 151831.0,
      "step": 330
    },
    {
      "epoch": 0.11900595029751487,
      "grad_norm": 15.69092082977295,
      "learning_rate": 4.661e-05,
      "loss": 0.0256,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 156368.0,
      "step": 340
    },
    {
      "epoch": 0.12250612530626531,
      "grad_norm": 0.544373095035553,
      "learning_rate": 4.651e-05,
      "loss": 0.0368,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 160915.0,
      "step": 350
    },
    {
      "epoch": 0.12600630031501575,
      "grad_norm": 0.04070553556084633,
      "learning_rate": 4.6410000000000005e-05,
      "loss": 0.0729,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 165461.0,
      "step": 360
    },
    {
      "epoch": 0.1295064753237662,
      "grad_norm": 0.006062925793230534,
      "learning_rate": 4.631e-05,
      "loss": 0.0372,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 170199.0,
      "step": 370
    },
    {
      "epoch": 0.13300665033251663,
      "grad_norm": 0.04721317067742348,
      "learning_rate": 4.6210000000000006e-05,
      "loss": 0.0353,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 174899.0,
      "step": 380
    },
    {
      "epoch": 0.13650682534126707,
      "grad_norm": 0.01112948078662157,
      "learning_rate": 4.6110000000000004e-05,
      "loss": 0.0565,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 179419.0,
      "step": 390
    },
    {
      "epoch": 0.1400070003500175,
      "grad_norm": 3.867860794067383,
      "learning_rate": 4.601e-05,
      "loss": 0.0537,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 183987.0,
      "step": 400
    },
    {
      "epoch": 0.14350717535876795,
      "grad_norm": 10.329545974731445,
      "learning_rate": 4.5910000000000006e-05,
      "loss": 0.0888,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 188517.0,
      "step": 410
    },
    {
      "epoch": 0.14700735036751839,
      "grad_norm": 0.04144367575645447,
      "learning_rate": 4.5810000000000004e-05,
      "loss": 0.0723,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 193090.0,
      "step": 420
    },
    {
      "epoch": 0.15050752537626882,
      "grad_norm": 13.018311500549316,
      "learning_rate": 4.571e-05,
      "loss": 0.0799,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 197726.0,
      "step": 430
    },
    {
      "epoch": 0.15400770038501926,
      "grad_norm": 7.063663959503174,
      "learning_rate": 4.5610000000000005e-05,
      "loss": 0.0295,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 202376.0,
      "step": 440
    },
    {
      "epoch": 0.15750787539376968,
      "grad_norm": 8.98883056640625,
      "learning_rate": 4.551e-05,
      "loss": 0.0624,
      "mean_token_accuracy": 0.9749999940395355,
      "num_tokens": 207026.0,
      "step": 450
    },
    {
      "epoch": 0.16100805040252011,
      "grad_norm": 13.842345237731934,
      "learning_rate": 4.541e-05,
      "loss": 0.0715,
      "mean_token_accuracy": 0.975,
      "num_tokens": 211776.0,
      "step": 460
    },
    {
      "epoch": 0.16450822541127055,
      "grad_norm": 9.97155475616455,
      "learning_rate": 4.5310000000000005e-05,
      "loss": 0.0661,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 216432.0,
      "step": 470
    },
    {
      "epoch": 0.168008400420021,
      "grad_norm": 7.468666076660156,
      "learning_rate": 4.521e-05,
      "loss": 0.0367,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 220948.0,
      "step": 480
    },
    {
      "epoch": 0.17150857542877143,
      "grad_norm": 4.366839408874512,
      "learning_rate": 4.511e-05,
      "loss": 0.0627,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 225605.0,
      "step": 490
    },
    {
      "epoch": 0.17500875043752187,
      "grad_norm": 1.5920456647872925,
      "learning_rate": 4.5010000000000004e-05,
      "loss": 0.0625,
      "step": 500
    },
    {
      "epoch": 0.17500875043752187,
      "eval_accuracy": 0.42072015161086546,
      "eval_f1": 0.35817992606791954,
      "eval_loss": 0.052172355353832245,
      "eval_mean_token_accuracy": 0.9854798049035699,
      "eval_num_tokens": 230290.0,
      "eval_precision": 0.4422081376879308,
      "eval_recall": 0.3790056922240466,
      "eval_runtime": 244.5865,
      "eval_samples_per_second": 6.472,
      "eval_steps_per_second": 0.81,
      "step": 500
    },
    {
      "epoch": 0.1785089254462723,
      "grad_norm": 0.04973801597952843,
      "learning_rate": 4.491e-05,
      "loss": 0.082,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 234977.0,
      "step": 510
    },
    {
      "epoch": 0.18200910045502275,
      "grad_norm": 0.0434698760509491,
      "learning_rate": 4.481e-05,
      "loss": 0.0479,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 239628.0,
      "step": 520
    },
    {
      "epoch": 0.1855092754637732,
      "grad_norm": 11.246657371520996,
      "learning_rate": 4.4710000000000004e-05,
      "loss": 0.0361,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 244331.0,
      "step": 530
    },
    {
      "epoch": 0.18900945047252363,
      "grad_norm": 2.8165736198425293,
      "learning_rate": 4.461e-05,
      "loss": 0.0622,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 249178.0,
      "step": 540
    },
    {
      "epoch": 0.19250962548127407,
      "grad_norm": 1.3719075918197632,
      "learning_rate": 4.451e-05,
      "loss": 0.0498,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 253815.0,
      "step": 550
    },
    {
      "epoch": 0.1960098004900245,
      "grad_norm": 0.8937302827835083,
      "learning_rate": 4.4410000000000003e-05,
      "loss": 0.0782,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 258433.0,
      "step": 560
    },
    {
      "epoch": 0.19950997549877494,
      "grad_norm": 0.01865805685520172,
      "learning_rate": 4.431e-05,
      "loss": 0.0096,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 263057.0,
      "step": 570
    },
    {
      "epoch": 0.20301015050752538,
      "grad_norm": 0.6028000712394714,
      "learning_rate": 4.421e-05,
      "loss": 0.0144,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 267618.0,
      "step": 580
    },
    {
      "epoch": 0.20651032551627582,
      "grad_norm": 0.013873261399567127,
      "learning_rate": 4.411e-05,
      "loss": 0.0576,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 272288.0,
      "step": 590
    },
    {
      "epoch": 0.21001050052502626,
      "grad_norm": 6.103112697601318,
      "learning_rate": 4.401e-05,
      "loss": 0.1198,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 276910.0,
      "step": 600
    },
    {
      "epoch": 0.2135106755337767,
      "grad_norm": 0.640934944152832,
      "learning_rate": 4.391e-05,
      "loss": 0.0378,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 281446.0,
      "step": 610
    },
    {
      "epoch": 0.21701085054252714,
      "grad_norm": 0.31448185443878174,
      "learning_rate": 4.381e-05,
      "loss": 0.0696,
      "mean_token_accuracy": 0.9749999940395355,
      "num_tokens": 286106.0,
      "step": 620
    },
    {
      "epoch": 0.22051102555127755,
      "grad_norm": 0.03195786848664284,
      "learning_rate": 4.371e-05,
      "loss": 0.124,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 290743.0,
      "step": 630
    },
    {
      "epoch": 0.224011200560028,
      "grad_norm": 0.8114803433418274,
      "learning_rate": 4.361e-05,
      "loss": 0.1201,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 295315.0,
      "step": 640
    },
    {
      "epoch": 0.22751137556877843,
      "grad_norm": 0.16202567517757416,
      "learning_rate": 4.351e-05,
      "loss": 0.0553,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 299840.0,
      "step": 650
    },
    {
      "epoch": 0.23101155057752887,
      "grad_norm": 4.016778469085693,
      "learning_rate": 4.341e-05,
      "loss": 0.0692,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 304471.0,
      "step": 660
    },
    {
      "epoch": 0.2345117255862793,
      "grad_norm": 0.056026436388492584,
      "learning_rate": 4.3310000000000004e-05,
      "loss": 0.0395,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 309003.0,
      "step": 670
    },
    {
      "epoch": 0.23801190059502975,
      "grad_norm": 0.4657319188117981,
      "learning_rate": 4.321e-05,
      "loss": 0.0094,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 313702.0,
      "step": 680
    },
    {
      "epoch": 0.24151207560378019,
      "grad_norm": 21.3116397857666,
      "learning_rate": 4.311e-05,
      "loss": 0.0468,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 318425.0,
      "step": 690
    },
    {
      "epoch": 0.24501225061253062,
      "grad_norm": 0.024263957515358925,
      "learning_rate": 4.301e-05,
      "loss": 0.0576,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 323080.0,
      "step": 700
    },
    {
      "epoch": 0.24851242562128106,
      "grad_norm": 0.039419762790203094,
      "learning_rate": 4.291e-05,
      "loss": 0.0503,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 327692.0,
      "step": 710
    },
    {
      "epoch": 0.2520126006300315,
      "grad_norm": 0.06194750592112541,
      "learning_rate": 4.281e-05,
      "loss": 0.0379,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 332238.0,
      "step": 720
    },
    {
      "epoch": 0.25551277563878194,
      "grad_norm": 0.015114092268049717,
      "learning_rate": 4.271e-05,
      "loss": 0.0729,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 336826.0,
      "step": 730
    },
    {
      "epoch": 0.2590129506475324,
      "grad_norm": 0.05118599534034729,
      "learning_rate": 4.261e-05,
      "loss": 0.0773,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 341422.0,
      "step": 740
    },
    {
      "epoch": 0.2625131256562828,
      "grad_norm": 0.13388273119926453,
      "learning_rate": 4.251e-05,
      "loss": 0.07,
      "mean_token_accuracy": 0.9699999928474426,
      "num_tokens": 346078.0,
      "step": 750
    },
    {
      "epoch": 0.26601330066503326,
      "grad_norm": 0.043984536081552505,
      "learning_rate": 4.241e-05,
      "loss": 0.121,
      "mean_token_accuracy": 0.9649999916553498,
      "num_tokens": 350615.0,
      "step": 760
    },
    {
      "epoch": 0.2695134756737837,
      "grad_norm": 3.5789825916290283,
      "learning_rate": 4.231e-05,
      "loss": 0.0646,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 355229.0,
      "step": 770
    },
    {
      "epoch": 0.27301365068253414,
      "grad_norm": 0.15532948076725006,
      "learning_rate": 4.221e-05,
      "loss": 0.0809,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 359810.0,
      "step": 780
    },
    {
      "epoch": 0.2765138256912846,
      "grad_norm": 4.205129146575928,
      "learning_rate": 4.211e-05,
      "loss": 0.0478,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 364370.0,
      "step": 790
    },
    {
      "epoch": 0.280014000700035,
      "grad_norm": 0.06457880884408951,
      "learning_rate": 4.201e-05,
      "loss": 0.0381,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 368854.0,
      "step": 800
    },
    {
      "epoch": 0.28351417570878545,
      "grad_norm": 0.06110011041164398,
      "learning_rate": 4.191e-05,
      "loss": 0.0362,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 373402.0,
      "step": 810
    },
    {
      "epoch": 0.2870143507175359,
      "grad_norm": 0.6663037538528442,
      "learning_rate": 4.181000000000001e-05,
      "loss": 0.053,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 378067.0,
      "step": 820
    },
    {
      "epoch": 0.29051452572628633,
      "grad_norm": 0.019796814769506454,
      "learning_rate": 4.1710000000000006e-05,
      "loss": 0.0813,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 382674.0,
      "step": 830
    },
    {
      "epoch": 0.29401470073503677,
      "grad_norm": 6.284240245819092,
      "learning_rate": 4.161e-05,
      "loss": 0.1022,
      "mean_token_accuracy": 0.9599999904632568,
      "num_tokens": 387372.0,
      "step": 840
    },
    {
      "epoch": 0.2975148757437872,
      "grad_norm": 0.050411708652973175,
      "learning_rate": 4.151000000000001e-05,
      "loss": 0.0286,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 392048.0,
      "step": 850
    },
    {
      "epoch": 0.30101505075253765,
      "grad_norm": 0.13556945323944092,
      "learning_rate": 4.1410000000000005e-05,
      "loss": 0.056,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 396846.0,
      "step": 860
    },
    {
      "epoch": 0.3045152257612881,
      "grad_norm": 0.2066652923822403,
      "learning_rate": 4.131e-05,
      "loss": 0.0304,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 401427.0,
      "step": 870
    },
    {
      "epoch": 0.3080154007700385,
      "grad_norm": 0.21275383234024048,
      "learning_rate": 4.121000000000001e-05,
      "loss": 0.0971,
      "mean_token_accuracy": 0.9749999940395355,
      "num_tokens": 406019.0,
      "step": 880
    },
    {
      "epoch": 0.31151557577878897,
      "grad_norm": 0.0494910404086113,
      "learning_rate": 4.1110000000000005e-05,
      "loss": 0.0054,
      "mean_token_accuracy": 1.0,
      "num_tokens": 410600.0,
      "step": 890
    },
    {
      "epoch": 0.31501575078753935,
      "grad_norm": 0.06328645348548889,
      "learning_rate": 4.101e-05,
      "loss": 0.0584,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 415202.0,
      "step": 900
    },
    {
      "epoch": 0.3185159257962898,
      "grad_norm": 0.011447213590145111,
      "learning_rate": 4.0910000000000006e-05,
      "loss": 0.0842,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 419881.0,
      "step": 910
    },
    {
      "epoch": 0.32201610080504023,
      "grad_norm": 0.11036702245473862,
      "learning_rate": 4.0810000000000004e-05,
      "loss": 0.023,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 424461.0,
      "step": 920
    },
    {
      "epoch": 0.32551627581379067,
      "grad_norm": 5.421338081359863,
      "learning_rate": 4.071e-05,
      "loss": 0.0802,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 429057.0,
      "step": 930
    },
    {
      "epoch": 0.3290164508225411,
      "grad_norm": 0.3822776675224304,
      "learning_rate": 4.0610000000000006e-05,
      "loss": 0.0204,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 433654.0,
      "step": 940
    },
    {
      "epoch": 0.33251662583129155,
      "grad_norm": 0.39122045040130615,
      "learning_rate": 4.0510000000000003e-05,
      "loss": 0.0318,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 438323.0,
      "step": 950
    },
    {
      "epoch": 0.336016800840042,
      "grad_norm": 0.16552428901195526,
      "learning_rate": 4.041e-05,
      "loss": 0.0472,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 442957.0,
      "step": 960
    },
    {
      "epoch": 0.3395169758487924,
      "grad_norm": 0.028619434684515,
      "learning_rate": 4.0310000000000005e-05,
      "loss": 0.0433,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 447752.0,
      "step": 970
    },
    {
      "epoch": 0.34301715085754286,
      "grad_norm": 7.8463053703308105,
      "learning_rate": 4.021e-05,
      "loss": 0.0688,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 452474.0,
      "step": 980
    },
    {
      "epoch": 0.3465173258662933,
      "grad_norm": 0.012101550586521626,
      "learning_rate": 4.011e-05,
      "loss": 0.0084,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 457143.0,
      "step": 990
    },
    {
      "epoch": 0.35001750087504374,
      "grad_norm": 0.013838861137628555,
      "learning_rate": 4.0010000000000005e-05,
      "loss": 0.0593,
      "step": 1000
    },
    {
      "epoch": 0.35001750087504374,
      "eval_accuracy": 0.441566645609602,
      "eval_f1": 0.3728091111256171,
      "eval_loss": 0.057350896298885345,
      "eval_mean_token_accuracy": 0.9861111180348829,
      "eval_num_tokens": 461707.0,
      "eval_precision": 0.4527947168630948,
      "eval_recall": 0.39396244890143234,
      "eval_runtime": 244.5385,
      "eval_samples_per_second": 6.473,
      "eval_steps_per_second": 0.81,
      "step": 1000
    },
    {
      "epoch": 0.3535176758837942,
      "grad_norm": 0.0760878473520279,
      "learning_rate": 3.991e-05,
      "loss": 0.1017,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 466311.0,
      "step": 1010
    },
    {
      "epoch": 0.3570178508925446,
      "grad_norm": 6.497073173522949,
      "learning_rate": 3.981e-05,
      "loss": 0.0353,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 470821.0,
      "step": 1020
    },
    {
      "epoch": 0.36051802590129506,
      "grad_norm": 8.943822860717773,
      "learning_rate": 3.9710000000000004e-05,
      "loss": 0.0868,
      "mean_token_accuracy": 0.9749999940395355,
      "num_tokens": 475416.0,
      "step": 1030
    },
    {
      "epoch": 0.3640182009100455,
      "grad_norm": 0.10018932819366455,
      "learning_rate": 3.961e-05,
      "loss": 0.0314,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 480046.0,
      "step": 1040
    },
    {
      "epoch": 0.36751837591879594,
      "grad_norm": 0.058345384895801544,
      "learning_rate": 3.951e-05,
      "loss": 0.0198,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 484728.0,
      "step": 1050
    },
    {
      "epoch": 0.3710185509275464,
      "grad_norm": 0.059850409626960754,
      "learning_rate": 3.9410000000000004e-05,
      "loss": 0.0561,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 489334.0,
      "step": 1060
    },
    {
      "epoch": 0.3745187259362968,
      "grad_norm": 0.03875022009015083,
      "learning_rate": 3.931e-05,
      "loss": 0.0893,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 493949.0,
      "step": 1070
    },
    {
      "epoch": 0.37801890094504725,
      "grad_norm": 0.7719871997833252,
      "learning_rate": 3.921e-05,
      "loss": 0.0427,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 498518.0,
      "step": 1080
    },
    {
      "epoch": 0.3815190759537977,
      "grad_norm": 0.05535457283258438,
      "learning_rate": 3.911e-05,
      "loss": 0.0117,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 503111.0,
      "step": 1090
    },
    {
      "epoch": 0.38501925096254813,
      "grad_norm": 6.557998180389404,
      "learning_rate": 3.901e-05,
      "loss": 0.0805,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 507707.0,
      "step": 1100
    },
    {
      "epoch": 0.38851942597129857,
      "grad_norm": 0.6564468145370483,
      "learning_rate": 3.8910000000000005e-05,
      "loss": 0.0841,
      "mean_token_accuracy": 0.9749999940395355,
      "num_tokens": 512285.0,
      "step": 1110
    },
    {
      "epoch": 0.392019600980049,
      "grad_norm": 8.401987075805664,
      "learning_rate": 3.881e-05,
      "loss": 0.0721,
      "mean_token_accuracy": 0.9749999940395355,
      "num_tokens": 516852.0,
      "step": 1120
    },
    {
      "epoch": 0.39551977598879945,
      "grad_norm": 1.693769931793213,
      "learning_rate": 3.871e-05,
      "loss": 0.0807,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 521413.0,
      "step": 1130
    },
    {
      "epoch": 0.3990199509975499,
      "grad_norm": 3.10587739944458,
      "learning_rate": 3.8610000000000005e-05,
      "loss": 0.059,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 525992.0,
      "step": 1140
    },
    {
      "epoch": 0.4025201260063003,
      "grad_norm": 0.17380690574645996,
      "learning_rate": 3.851e-05,
      "loss": 0.0357,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 530679.0,
      "step": 1150
    },
    {
      "epoch": 0.40602030101505077,
      "grad_norm": 0.33141088485717773,
      "learning_rate": 3.841e-05,
      "loss": 0.0045,
      "mean_token_accuracy": 1.0,
      "num_tokens": 535218.0,
      "step": 1160
    },
    {
      "epoch": 0.4095204760238012,
      "grad_norm": 0.0494840107858181,
      "learning_rate": 3.8310000000000004e-05,
      "loss": 0.0594,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 539817.0,
      "step": 1170
    },
    {
      "epoch": 0.41302065103255164,
      "grad_norm": 0.013975823298096657,
      "learning_rate": 3.821e-05,
      "loss": 0.0022,
      "mean_token_accuracy": 1.0,
      "num_tokens": 544490.0,
      "step": 1180
    },
    {
      "epoch": 0.4165208260413021,
      "grad_norm": 0.09675566107034683,
      "learning_rate": 3.811e-05,
      "loss": 0.0019,
      "mean_token_accuracy": 1.0,
      "num_tokens": 549043.0,
      "step": 1190
    },
    {
      "epoch": 0.4200210010500525,
      "grad_norm": 0.00722131785005331,
      "learning_rate": 3.8010000000000004e-05,
      "loss": 0.0806,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 553578.0,
      "step": 1200
    },
    {
      "epoch": 0.42352117605880296,
      "grad_norm": 0.022663407027721405,
      "learning_rate": 3.791e-05,
      "loss": 0.0042,
      "mean_token_accuracy": 1.0,
      "num_tokens": 558226.0,
      "step": 1210
    },
    {
      "epoch": 0.4270213510675534,
      "grad_norm": 0.012322783470153809,
      "learning_rate": 3.781e-05,
      "loss": 0.007,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 562862.0,
      "step": 1220
    },
    {
      "epoch": 0.43052152607630384,
      "grad_norm": 0.016185415908694267,
      "learning_rate": 3.771e-05,
      "loss": 0.0943,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 567480.0,
      "step": 1230
    },
    {
      "epoch": 0.4340217010850543,
      "grad_norm": 0.0974903255701065,
      "learning_rate": 3.761e-05,
      "loss": 0.0773,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 572014.0,
      "step": 1240
    },
    {
      "epoch": 0.43752187609380466,
      "grad_norm": 0.028429092839360237,
      "learning_rate": 3.751e-05,
      "loss": 0.0779,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 576638.0,
      "step": 1250
    },
    {
      "epoch": 0.4410220511025551,
      "grad_norm": 2.4505090713500977,
      "learning_rate": 3.741e-05,
      "loss": 0.0153,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 581221.0,
      "step": 1260
    },
    {
      "epoch": 0.44452222611130554,
      "grad_norm": 0.11989375203847885,
      "learning_rate": 3.731e-05,
      "loss": 0.0748,
      "mean_token_accuracy": 0.9749999940395355,
      "num_tokens": 585878.0,
      "step": 1270
    },
    {
      "epoch": 0.448022401120056,
      "grad_norm": 0.06575898826122284,
      "learning_rate": 3.721e-05,
      "loss": 0.0247,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 590474.0,
      "step": 1280
    },
    {
      "epoch": 0.4515225761288064,
      "grad_norm": 17.148649215698242,
      "learning_rate": 3.711e-05,
      "loss": 0.0701,
      "mean_token_accuracy": 0.9699999928474426,
      "num_tokens": 594994.0,
      "step": 1290
    },
    {
      "epoch": 0.45502275113755686,
      "grad_norm": 0.022335920482873917,
      "learning_rate": 3.701e-05,
      "loss": 0.0447,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 599605.0,
      "step": 1300
    },
    {
      "epoch": 0.4585229261463073,
      "grad_norm": 0.16378021240234375,
      "learning_rate": 3.691e-05,
      "loss": 0.0934,
      "mean_token_accuracy": 0.9699999928474426,
      "num_tokens": 604331.0,
      "step": 1310
    },
    {
      "epoch": 0.46202310115505774,
      "grad_norm": 4.628612995147705,
      "learning_rate": 3.681e-05,
      "loss": 0.1114,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 608859.0,
      "step": 1320
    },
    {
      "epoch": 0.4655232761638082,
      "grad_norm": 4.558804035186768,
      "learning_rate": 3.671e-05,
      "loss": 0.0527,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 613472.0,
      "step": 1330
    },
    {
      "epoch": 0.4690234511725586,
      "grad_norm": 7.380437850952148,
      "learning_rate": 3.661e-05,
      "loss": 0.0211,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 618003.0,
      "step": 1340
    },
    {
      "epoch": 0.47252362618130905,
      "grad_norm": 0.054671116173267365,
      "learning_rate": 3.651e-05,
      "loss": 0.0896,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 622602.0,
      "step": 1350
    },
    {
      "epoch": 0.4760238011900595,
      "grad_norm": 0.22701649367809296,
      "learning_rate": 3.641e-05,
      "loss": 0.0546,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 627312.0,
      "step": 1360
    },
    {
      "epoch": 0.47952397619880993,
      "grad_norm": 9.734682083129883,
      "learning_rate": 3.6309999999999996e-05,
      "loss": 0.0576,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 632167.0,
      "step": 1370
    },
    {
      "epoch": 0.48302415120756037,
      "grad_norm": 10.223374366760254,
      "learning_rate": 3.621e-05,
      "loss": 0.0569,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 636822.0,
      "step": 1380
    },
    {
      "epoch": 0.4865243262163108,
      "grad_norm": 0.22119201719760895,
      "learning_rate": 3.611e-05,
      "loss": 0.052,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 641324.0,
      "step": 1390
    },
    {
      "epoch": 0.49002450122506125,
      "grad_norm": 0.09743613004684448,
      "learning_rate": 3.601e-05,
      "loss": 0.0326,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 645990.0,
      "step": 1400
    },
    {
      "epoch": 0.4935246762338117,
      "grad_norm": 4.46646785736084,
      "learning_rate": 3.591e-05,
      "loss": 0.0845,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 650642.0,
      "step": 1410
    },
    {
      "epoch": 0.4970248512425621,
      "grad_norm": 0.3847590386867523,
      "learning_rate": 3.581e-05,
      "loss": 0.0276,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 655313.0,
      "step": 1420
    },
    {
      "epoch": 0.5005250262513126,
      "grad_norm": 3.9029712677001953,
      "learning_rate": 3.571e-05,
      "loss": 0.0479,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 659845.0,
      "step": 1430
    },
    {
      "epoch": 0.504025201260063,
      "grad_norm": 5.140905380249023,
      "learning_rate": 3.5610000000000006e-05,
      "loss": 0.0274,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 664526.0,
      "step": 1440
    },
    {
      "epoch": 0.5075253762688134,
      "grad_norm": 0.0748833566904068,
      "learning_rate": 3.5510000000000004e-05,
      "loss": 0.112,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 669184.0,
      "step": 1450
    },
    {
      "epoch": 0.5110255512775639,
      "grad_norm": 0.06513810157775879,
      "learning_rate": 3.541e-05,
      "loss": 0.0294,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 673829.0,
      "step": 1460
    },
    {
      "epoch": 0.5145257262863143,
      "grad_norm": 0.14687716960906982,
      "learning_rate": 3.5310000000000006e-05,
      "loss": 0.039,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 678451.0,
      "step": 1470
    },
    {
      "epoch": 0.5180259012950648,
      "grad_norm": 0.04928717017173767,
      "learning_rate": 3.5210000000000003e-05,
      "loss": 0.0881,
      "mean_token_accuracy": 0.9749999940395355,
      "num_tokens": 682982.0,
      "step": 1480
    },
    {
      "epoch": 0.5215260763038152,
      "grad_norm": 0.05730545148253441,
      "learning_rate": 3.511e-05,
      "loss": 0.0094,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 687510.0,
      "step": 1490
    },
    {
      "epoch": 0.5250262513125656,
      "grad_norm": 0.024362344294786453,
      "learning_rate": 3.5010000000000005e-05,
      "loss": 0.0119,
      "step": 1500
    },
    {
      "epoch": 0.5250262513125656,
      "eval_accuracy": 0.5596967782691092,
      "eval_f1": 0.4608174078043728,
      "eval_loss": 0.047076478600502014,
      "eval_mean_token_accuracy": 0.9872474811895929,
      "eval_num_tokens": 692119.0,
      "eval_precision": 0.5081716761653752,
      "eval_recall": 0.49738319415052024,
      "eval_runtime": 244.3841,
      "eval_samples_per_second": 6.478,
      "eval_steps_per_second": 0.81,
      "step": 1500
    },
    {
      "epoch": 0.5285264263213161,
      "grad_norm": 0.03228575736284256,
      "learning_rate": 3.491e-05,
      "loss": 0.0282,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 696712.0,
      "step": 1510
    },
    {
      "epoch": 0.5320266013300665,
      "grad_norm": 15.16336441040039,
      "learning_rate": 3.481e-05,
      "loss": 0.0758,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 701390.0,
      "step": 1520
    },
    {
      "epoch": 0.535526776338817,
      "grad_norm": 19.84299087524414,
      "learning_rate": 3.4710000000000005e-05,
      "loss": 0.0911,
      "mean_token_accuracy": 0.9749999940395355,
      "num_tokens": 705965.0,
      "step": 1530
    },
    {
      "epoch": 0.5390269513475674,
      "grad_norm": 0.051229000091552734,
      "learning_rate": 3.461e-05,
      "loss": 0.0982,
      "mean_token_accuracy": 0.9599999904632568,
      "num_tokens": 710601.0,
      "step": 1540
    },
    {
      "epoch": 0.5425271263563178,
      "grad_norm": 6.5445756912231445,
      "learning_rate": 3.451000000000001e-05,
      "loss": 0.037,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 715211.0,
      "step": 1550
    },
    {
      "epoch": 0.5460273013650683,
      "grad_norm": 4.0851874351501465,
      "learning_rate": 3.4410000000000004e-05,
      "loss": 0.0403,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 719818.0,
      "step": 1560
    },
    {
      "epoch": 0.5495274763738187,
      "grad_norm": 0.20048797130584717,
      "learning_rate": 3.431e-05,
      "loss": 0.0598,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 724418.0,
      "step": 1570
    },
    {
      "epoch": 0.5530276513825692,
      "grad_norm": 8.350198745727539,
      "learning_rate": 3.4210000000000006e-05,
      "loss": 0.0261,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 729079.0,
      "step": 1580
    },
    {
      "epoch": 0.5565278263913196,
      "grad_norm": 7.64754056930542,
      "learning_rate": 3.4110000000000004e-05,
      "loss": 0.0205,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 733776.0,
      "step": 1590
    },
    {
      "epoch": 0.56002800140007,
      "grad_norm": 11.657675743103027,
      "learning_rate": 3.401e-05,
      "loss": 0.0254,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 738301.0,
      "step": 1600
    },
    {
      "epoch": 0.5635281764088205,
      "grad_norm": 0.044835012406110764,
      "learning_rate": 3.3910000000000006e-05,
      "loss": 0.0675,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 743020.0,
      "step": 1610
    },
    {
      "epoch": 0.5670283514175709,
      "grad_norm": 0.08898824453353882,
      "learning_rate": 3.381e-05,
      "loss": 0.0438,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 747522.0,
      "step": 1620
    },
    {
      "epoch": 0.5705285264263213,
      "grad_norm": 19.048906326293945,
      "learning_rate": 3.371e-05,
      "loss": 0.0462,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 752073.0,
      "step": 1630
    },
    {
      "epoch": 0.5740287014350718,
      "grad_norm": 5.376831531524658,
      "learning_rate": 3.3610000000000005e-05,
      "loss": 0.0067,
      "mean_token_accuracy": 1.0,
      "num_tokens": 756668.0,
      "step": 1640
    },
    {
      "epoch": 0.5775288764438222,
      "grad_norm": 0.003997461870312691,
      "learning_rate": 3.351e-05,
      "loss": 0.0093,
      "mean_token_accuracy": 1.0,
      "num_tokens": 761349.0,
      "step": 1650
    },
    {
      "epoch": 0.5810290514525727,
      "grad_norm": 1.1141142845153809,
      "learning_rate": 3.341e-05,
      "loss": 0.0453,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 766087.0,
      "step": 1660
    },
    {
      "epoch": 0.5845292264613231,
      "grad_norm": 0.09356174618005753,
      "learning_rate": 3.3310000000000005e-05,
      "loss": 0.0415,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 770676.0,
      "step": 1670
    },
    {
      "epoch": 0.5880294014700735,
      "grad_norm": 16.47395133972168,
      "learning_rate": 3.321e-05,
      "loss": 0.0726,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 775287.0,
      "step": 1680
    },
    {
      "epoch": 0.591529576478824,
      "grad_norm": 1.1543943881988525,
      "learning_rate": 3.311e-05,
      "loss": 0.0262,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 779907.0,
      "step": 1690
    },
    {
      "epoch": 0.5950297514875744,
      "grad_norm": 0.6417059898376465,
      "learning_rate": 3.3010000000000004e-05,
      "loss": 0.1458,
      "mean_token_accuracy": 0.9749999940395355,
      "num_tokens": 784527.0,
      "step": 1700
    },
    {
      "epoch": 0.5985299264963249,
      "grad_norm": 0.03735469654202461,
      "learning_rate": 3.291e-05,
      "loss": 0.0517,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 789037.0,
      "step": 1710
    },
    {
      "epoch": 0.6020301015050753,
      "grad_norm": 7.025692462921143,
      "learning_rate": 3.281e-05,
      "loss": 0.0912,
      "mean_token_accuracy": 0.9749999940395355,
      "num_tokens": 793581.0,
      "step": 1720
    },
    {
      "epoch": 0.6055302765138257,
      "grad_norm": 0.046697914600372314,
      "learning_rate": 3.2710000000000004e-05,
      "loss": 0.0516,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 798206.0,
      "step": 1730
    },
    {
      "epoch": 0.6090304515225762,
      "grad_norm": 1.4056965112686157,
      "learning_rate": 3.261e-05,
      "loss": 0.0658,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 802791.0,
      "step": 1740
    },
    {
      "epoch": 0.6125306265313266,
      "grad_norm": 15.819257736206055,
      "learning_rate": 3.251e-05,
      "loss": 0.0332,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 807423.0,
      "step": 1750
    },
    {
      "epoch": 0.616030801540077,
      "grad_norm": 0.15242606401443481,
      "learning_rate": 3.241e-05,
      "loss": 0.0116,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 812014.0,
      "step": 1760
    },
    {
      "epoch": 0.6195309765488275,
      "grad_norm": 0.8969595432281494,
      "learning_rate": 3.231e-05,
      "loss": 0.0697,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 816484.0,
      "step": 1770
    },
    {
      "epoch": 0.6230311515575779,
      "grad_norm": 13.24059772491455,
      "learning_rate": 3.221e-05,
      "loss": 0.0475,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 821141.0,
      "step": 1780
    },
    {
      "epoch": 0.6265313265663283,
      "grad_norm": 0.0862284004688263,
      "learning_rate": 3.211e-05,
      "loss": 0.0133,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 825825.0,
      "step": 1790
    },
    {
      "epoch": 0.6300315015750787,
      "grad_norm": 6.188477993011475,
      "learning_rate": 3.201e-05,
      "loss": 0.0941,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 830440.0,
      "step": 1800
    },
    {
      "epoch": 0.6335316765838291,
      "grad_norm": 0.047075141221284866,
      "learning_rate": 3.191e-05,
      "loss": 0.0152,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 835064.0,
      "step": 1810
    },
    {
      "epoch": 0.6370318515925796,
      "grad_norm": 8.754451751708984,
      "learning_rate": 3.181e-05,
      "loss": 0.034,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 839606.0,
      "step": 1820
    },
    {
      "epoch": 0.64053202660133,
      "grad_norm": 0.6907691955566406,
      "learning_rate": 3.171e-05,
      "loss": 0.0215,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 844205.0,
      "step": 1830
    },
    {
      "epoch": 0.6440322016100805,
      "grad_norm": 0.06890915334224701,
      "learning_rate": 3.1610000000000004e-05,
      "loss": 0.0804,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 848857.0,
      "step": 1840
    },
    {
      "epoch": 0.6475323766188309,
      "grad_norm": 0.04362496733665466,
      "learning_rate": 3.151e-05,
      "loss": 0.0015,
      "mean_token_accuracy": 1.0,
      "num_tokens": 853505.0,
      "step": 1850
    },
    {
      "epoch": 0.6510325516275813,
      "grad_norm": 0.032738834619522095,
      "learning_rate": 3.141e-05,
      "loss": 0.0738,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 857949.0,
      "step": 1860
    },
    {
      "epoch": 0.6545327266363318,
      "grad_norm": 0.0720139741897583,
      "learning_rate": 3.1310000000000003e-05,
      "loss": 0.0198,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 862481.0,
      "step": 1870
    },
    {
      "epoch": 0.6580329016450822,
      "grad_norm": 0.3373511731624603,
      "learning_rate": 3.121e-05,
      "loss": 0.0232,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 867070.0,
      "step": 1880
    },
    {
      "epoch": 0.6615330766538327,
      "grad_norm": 0.03332596644759178,
      "learning_rate": 3.111e-05,
      "loss": 0.0255,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 871710.0,
      "step": 1890
    },
    {
      "epoch": 0.6650332516625831,
      "grad_norm": 0.02673097886145115,
      "learning_rate": 3.101e-05,
      "loss": 0.1023,
      "mean_token_accuracy": 0.9749999940395355,
      "num_tokens": 876374.0,
      "step": 1900
    },
    {
      "epoch": 0.6685334266713335,
      "grad_norm": 29.00749969482422,
      "learning_rate": 3.091e-05,
      "loss": 0.0775,
      "mean_token_accuracy": 0.9749999940395355,
      "num_tokens": 880967.0,
      "step": 1910
    },
    {
      "epoch": 0.672033601680084,
      "grad_norm": 0.013920117169618607,
      "learning_rate": 3.081e-05,
      "loss": 0.0095,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 885539.0,
      "step": 1920
    },
    {
      "epoch": 0.6755337766888344,
      "grad_norm": 0.004398212768137455,
      "learning_rate": 3.071e-05,
      "loss": 0.0107,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 890118.0,
      "step": 1930
    },
    {
      "epoch": 0.6790339516975848,
      "grad_norm": 0.11914502084255219,
      "learning_rate": 3.061e-05,
      "loss": 0.0353,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 894652.0,
      "step": 1940
    },
    {
      "epoch": 0.6825341267063353,
      "grad_norm": 0.06763932853937149,
      "learning_rate": 3.051e-05,
      "loss": 0.0155,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 899187.0,
      "step": 1950
    },
    {
      "epoch": 0.6860343017150857,
      "grad_norm": 0.03659069910645485,
      "learning_rate": 3.041e-05,
      "loss": 0.0309,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 903727.0,
      "step": 1960
    },
    {
      "epoch": 0.6895344767238362,
      "grad_norm": 5.335174083709717,
      "learning_rate": 3.031e-05,
      "loss": 0.1113,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 908334.0,
      "step": 1970
    },
    {
      "epoch": 0.6930346517325866,
      "grad_norm": 2.4410805702209473,
      "learning_rate": 3.021e-05,
      "loss": 0.0128,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 913065.0,
      "step": 1980
    },
    {
      "epoch": 0.696534826741337,
      "grad_norm": 0.05332425609230995,
      "learning_rate": 3.0109999999999998e-05,
      "loss": 0.0443,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 917618.0,
      "step": 1990
    },
    {
      "epoch": 0.7000350017500875,
      "grad_norm": 0.012657753191888332,
      "learning_rate": 3.001e-05,
      "loss": 0.0267,
      "step": 2000
    },
    {
      "epoch": 0.7000350017500875,
      "eval_accuracy": 0.5710675931775111,
      "eval_f1": 0.4674227263281785,
      "eval_loss": 0.052520181983709335,
      "eval_mean_token_accuracy": 0.9876262681050734,
      "eval_num_tokens": 922286.0,
      "eval_precision": 0.5114304763470895,
      "eval_recall": 0.5012130900032767,
      "eval_runtime": 244.1164,
      "eval_samples_per_second": 6.485,
      "eval_steps_per_second": 0.811,
      "step": 2000
    },
    {
      "epoch": 0.7035351767588379,
      "grad_norm": 5.11520528793335,
      "learning_rate": 2.991e-05,
      "loss": 0.0078,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 926980.0,
      "step": 2010
    },
    {
      "epoch": 0.7070353517675884,
      "grad_norm": 0.03029199317097664,
      "learning_rate": 2.9809999999999997e-05,
      "loss": 0.0963,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 931724.0,
      "step": 2020
    },
    {
      "epoch": 0.7105355267763388,
      "grad_norm": 0.5081428289413452,
      "learning_rate": 2.971e-05,
      "loss": 0.0361,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 936266.0,
      "step": 2030
    },
    {
      "epoch": 0.7140357017850892,
      "grad_norm": 0.04822823032736778,
      "learning_rate": 2.961e-05,
      "loss": 0.1017,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 940886.0,
      "step": 2040
    },
    {
      "epoch": 0.7175358767938397,
      "grad_norm": 0.2854156494140625,
      "learning_rate": 2.951e-05,
      "loss": 0.0186,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 945433.0,
      "step": 2050
    },
    {
      "epoch": 0.7210360518025901,
      "grad_norm": 0.3434739112854004,
      "learning_rate": 2.9409999999999998e-05,
      "loss": 0.0587,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 949930.0,
      "step": 2060
    },
    {
      "epoch": 0.7245362268113406,
      "grad_norm": 0.03626574948430061,
      "learning_rate": 2.9310000000000006e-05,
      "loss": 0.1057,
      "mean_token_accuracy": 0.9749999940395355,
      "num_tokens": 954492.0,
      "step": 2070
    },
    {
      "epoch": 0.728036401820091,
      "grad_norm": 11.993911743164062,
      "learning_rate": 2.9210000000000003e-05,
      "loss": 0.1327,
      "mean_token_accuracy": 0.9699999928474426,
      "num_tokens": 958986.0,
      "step": 2080
    },
    {
      "epoch": 0.7315365768288414,
      "grad_norm": 0.07597003877162933,
      "learning_rate": 2.9110000000000004e-05,
      "loss": 0.018,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 963614.0,
      "step": 2090
    },
    {
      "epoch": 0.7350367518375919,
      "grad_norm": 10.063232421875,
      "learning_rate": 2.9010000000000005e-05,
      "loss": 0.1227,
      "mean_token_accuracy": 0.9699999928474426,
      "num_tokens": 968140.0,
      "step": 2100
    },
    {
      "epoch": 0.7385369268463423,
      "grad_norm": 0.1807040572166443,
      "learning_rate": 2.8910000000000003e-05,
      "loss": 0.0323,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 972835.0,
      "step": 2110
    },
    {
      "epoch": 0.7420371018550928,
      "grad_norm": 0.17890332639217377,
      "learning_rate": 2.8810000000000004e-05,
      "loss": 0.0207,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 977445.0,
      "step": 2120
    },
    {
      "epoch": 0.7455372768638432,
      "grad_norm": 9.020623207092285,
      "learning_rate": 2.8710000000000005e-05,
      "loss": 0.0899,
      "mean_token_accuracy": 0.9699999928474426,
      "num_tokens": 982140.0,
      "step": 2130
    },
    {
      "epoch": 0.7490374518725936,
      "grad_norm": 3.116069793701172,
      "learning_rate": 2.8610000000000002e-05,
      "loss": 0.0603,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 986817.0,
      "step": 2140
    },
    {
      "epoch": 0.7525376268813441,
      "grad_norm": 14.557893753051758,
      "learning_rate": 2.8510000000000003e-05,
      "loss": 0.0326,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 991362.0,
      "step": 2150
    },
    {
      "epoch": 0.7560378018900945,
      "grad_norm": 5.8639140129089355,
      "learning_rate": 2.8410000000000004e-05,
      "loss": 0.0533,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 995958.0,
      "step": 2160
    },
    {
      "epoch": 0.759537976898845,
      "grad_norm": 0.08902487903833389,
      "learning_rate": 2.8310000000000002e-05,
      "loss": 0.0089,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 1000561.0,
      "step": 2170
    },
    {
      "epoch": 0.7630381519075954,
      "grad_norm": 0.021990323439240456,
      "learning_rate": 2.8210000000000003e-05,
      "loss": 0.0398,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 1005102.0,
      "step": 2180
    },
    {
      "epoch": 0.7665383269163458,
      "grad_norm": 0.0434272363781929,
      "learning_rate": 2.8110000000000004e-05,
      "loss": 0.0367,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 1009721.0,
      "step": 2190
    },
    {
      "epoch": 0.7700385019250963,
      "grad_norm": 7.773507595062256,
      "learning_rate": 2.8010000000000005e-05,
      "loss": 0.0529,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 1014234.0,
      "step": 2200
    },
    {
      "epoch": 0.7735386769338467,
      "grad_norm": 11.276909828186035,
      "learning_rate": 2.7910000000000002e-05,
      "loss": 0.0828,
      "mean_token_accuracy": 0.9699999928474426,
      "num_tokens": 1018943.0,
      "step": 2210
    },
    {
      "epoch": 0.7770388519425971,
      "grad_norm": 0.2111329585313797,
      "learning_rate": 2.7810000000000003e-05,
      "loss": 0.0168,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 1023659.0,
      "step": 2220
    },
    {
      "epoch": 0.7805390269513476,
      "grad_norm": 0.09295608103275299,
      "learning_rate": 2.7710000000000004e-05,
      "loss": 0.0426,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 1028221.0,
      "step": 2230
    },
    {
      "epoch": 0.784039201960098,
      "grad_norm": 0.05695830285549164,
      "learning_rate": 2.761e-05,
      "loss": 0.0321,
      "mean_token_accuracy": 0.9899999976158143,
      "num_tokens": 1032825.0,
      "step": 2240
    },
    {
      "epoch": 0.7875393769688485,
      "grad_norm": 0.03428833931684494,
      "learning_rate": 2.7510000000000003e-05,
      "loss": 0.1096,
      "mean_token_accuracy": 0.9749999940395355,
      "num_tokens": 1037389.0,
      "step": 2250
    },
    {
      "epoch": 0.7910395519775989,
      "grad_norm": 0.052995167672634125,
      "learning_rate": 2.7410000000000004e-05,
      "loss": 0.0808,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 1041941.0,
      "step": 2260
    },
    {
      "epoch": 0.7945397269863493,
      "grad_norm": 0.1979517787694931,
      "learning_rate": 2.731e-05,
      "loss": 0.0145,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 1046451.0,
      "step": 2270
    },
    {
      "epoch": 0.7980399019950998,
      "grad_norm": 0.024557696655392647,
      "learning_rate": 2.7210000000000002e-05,
      "loss": 0.0534,
      "mean_token_accuracy": 0.9799999952316284,
      "num_tokens": 1051071.0,
      "step": 2280
    },
    {
      "epoch": 0.8015400770038502,
      "grad_norm": 7.660386085510254,
      "learning_rate": 2.7110000000000003e-05,
      "loss": 0.1042,
      "mean_token_accuracy": 0.9649999916553498,
      "num_tokens": 1055716.0,
      "step": 2290
    },
    {
      "epoch": 0.8050402520126007,
      "grad_norm": 3.119615316390991,
      "learning_rate": 2.701e-05,
      "loss": 0.061,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 1060266.0,
      "step": 2300
    },
    {
      "epoch": 0.8085404270213511,
      "grad_norm": 6.7030158042907715,
      "learning_rate": 2.691e-05,
      "loss": 0.0265,
      "mean_token_accuracy": 0.9849999964237213,
      "num_tokens": 1064858.0,
      "step": 2310
    },
    {
      "epoch": 0.8120406020301015,
      "grad_norm": 0.08051805198192596,
      "learning_rate": 2.6810000000000003e-05,
      "loss": 0.0091,
      "mean_token_accuracy": 0.9949999988079071,
      "num_tokens": 1069403.0,
      "step": 2320
    },
    {
      "epoch": 0.815540777038852,
      "grad_norm": 0.08621969074010849,
      "learning_rate": 2.671e-05,
      "loss": 0.0043,
      "mean_token_accuracy": 1.0,
      "num_tokens": 1074041.0,
      "step": 2330
    },
    {
      "epoch": 0.8190409520476024,
      "grad_norm": 7.230138778686523,
      "learning_rate": 2.661e-05,
      "loss": 0.0891,
      "mean_token_accuracy": 0.9749999940395355,
      "num_tokens": 1078603.0,
      "step": 2340
    },
    {
      "epoch": 0.8225411270563528,
      "grad_norm": 1.925933837890625,
| "learning_rate": 2.6510000000000002e-05, | |
| "loss": 0.1072, | |
| "mean_token_accuracy": 0.9749999940395355, | |
| "num_tokens": 1083208.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.8260413020651033, | |
| "grad_norm": 0.06855742633342743, | |
| "learning_rate": 2.6410000000000003e-05, | |
| "loss": 0.0987, | |
| "mean_token_accuracy": 0.9799999952316284, | |
| "num_tokens": 1087939.0, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.8295414770738537, | |
| "grad_norm": 4.232824802398682, | |
| "learning_rate": 2.631e-05, | |
| "loss": 0.0537, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1092618.0, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.8330416520826042, | |
| "grad_norm": 0.054919663816690445, | |
| "learning_rate": 2.621e-05, | |
| "loss": 0.0309, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1097232.0, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.8365418270913546, | |
| "grad_norm": 8.129829406738281, | |
| "learning_rate": 2.6110000000000002e-05, | |
| "loss": 0.0761, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1101883.0, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.840042002100105, | |
| "grad_norm": 7.850025653839111, | |
| "learning_rate": 2.601e-05, | |
| "loss": 0.0357, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1106608.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.8435421771088555, | |
| "grad_norm": 0.504169762134552, | |
| "learning_rate": 2.591e-05, | |
| "loss": 0.0419, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1111214.0, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.8470423521176059, | |
| "grad_norm": 0.02623009867966175, | |
| "learning_rate": 2.5810000000000002e-05, | |
| "loss": 0.0353, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1115761.0, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.8505425271263564, | |
| "grad_norm": 0.2593607008457184, | |
| "learning_rate": 2.571e-05, | |
| "loss": 0.0757, | |
| "mean_token_accuracy": 0.9749999940395355, | |
| "num_tokens": 1120497.0, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.8540427021351068, | |
| "grad_norm": 0.09586932510137558, | |
| "learning_rate": 2.561e-05, | |
| "loss": 0.0709, | |
| "mean_token_accuracy": 0.9799999952316284, | |
| "num_tokens": 1125050.0, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.8575428771438572, | |
| "grad_norm": 0.03755811229348183, | |
| "learning_rate": 2.551e-05, | |
| "loss": 0.0462, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1129583.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.8610430521526077, | |
| "grad_norm": 0.01429970283061266, | |
| "learning_rate": 2.541e-05, | |
| "loss": 0.0176, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1134231.0, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.8645432271613581, | |
| "grad_norm": 0.1092047318816185, | |
| "learning_rate": 2.531e-05, | |
| "loss": 0.0348, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1138841.0, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.8680434021701086, | |
| "grad_norm": 0.04420563951134682, | |
| "learning_rate": 2.521e-05, | |
| "loss": 0.0602, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1143627.0, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.871543577178859, | |
| "grad_norm": 0.056809134781360626, | |
| "learning_rate": 2.5110000000000002e-05, | |
| "loss": 0.0113, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1148140.0, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.8750437521876093, | |
| "grad_norm": 0.39837557077407837, | |
| "learning_rate": 2.501e-05, | |
| "loss": 0.0467, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.8750437521876093, | |
| "eval_accuracy": 0.44535691724573595, | |
| "eval_f1": 0.3790299221602013, | |
| "eval_loss": 0.05089215189218521, | |
| "eval_mean_token_accuracy": 0.9880050568267552, | |
| "eval_num_tokens": 1152749.0, | |
| "eval_precision": 0.454011773226288, | |
| "eval_recall": 0.4125268353595752, | |
| "eval_runtime": 244.5484, | |
| "eval_samples_per_second": 6.473, | |
| "eval_steps_per_second": 0.81, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.8785439271963598, | |
| "grad_norm": 0.028136901557445526, | |
| "learning_rate": 2.491e-05, | |
| "loss": 0.016, | |
| "mean_token_accuracy": 0.9874999970197678, | |
| "num_tokens": 1157289.0, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.8820441022051102, | |
| "grad_norm": 8.768643379211426, | |
| "learning_rate": 2.481e-05, | |
| "loss": 0.0547, | |
| "mean_token_accuracy": 0.9799999952316284, | |
| "num_tokens": 1161827.0, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.8855442772138606, | |
| "grad_norm": 4.318042755126953, | |
| "learning_rate": 2.471e-05, | |
| "loss": 0.0548, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1166441.0, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.8890444522226111, | |
| "grad_norm": 0.032455261796712875, | |
| "learning_rate": 2.4610000000000003e-05, | |
| "loss": 0.0342, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1171123.0, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.8925446272313615, | |
| "grad_norm": 1.8352007865905762, | |
| "learning_rate": 2.451e-05, | |
| "loss": 0.0399, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1175686.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.896044802240112, | |
| "grad_norm": 0.11914759129285812, | |
| "learning_rate": 2.4410000000000002e-05, | |
| "loss": 0.089, | |
| "mean_token_accuracy": 0.9799999952316284, | |
| "num_tokens": 1180208.0, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.8995449772488624, | |
| "grad_norm": 0.09686534851789474, | |
| "learning_rate": 2.4310000000000003e-05, | |
| "loss": 0.0889, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1184783.0, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.9030451522576128, | |
| "grad_norm": 0.06705299764871597, | |
| "learning_rate": 2.4210000000000004e-05, | |
| "loss": 0.0801, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1189400.0, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.9065453272663633, | |
| "grad_norm": 0.04434126242995262, | |
| "learning_rate": 2.411e-05, | |
| "loss": 0.0036, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 1194035.0, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.9100455022751137, | |
| "grad_norm": 0.03630208596587181, | |
| "learning_rate": 2.4010000000000002e-05, | |
| "loss": 0.0326, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1198629.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.9135456772838642, | |
| "grad_norm": 0.031141789630055428, | |
| "learning_rate": 2.3910000000000003e-05, | |
| "loss": 0.0436, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1203254.0, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.9170458522926146, | |
| "grad_norm": 0.33005213737487793, | |
| "learning_rate": 2.381e-05, | |
| "loss": 0.066, | |
| "mean_token_accuracy": 0.9799999952316284, | |
| "num_tokens": 1207907.0, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.920546027301365, | |
| "grad_norm": 8.107050895690918, | |
| "learning_rate": 2.371e-05, | |
| "loss": 0.0418, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1212537.0, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.9240462023101155, | |
| "grad_norm": 0.28287169337272644, | |
| "learning_rate": 2.3610000000000003e-05, | |
| "loss": 0.0683, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1217120.0, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.9275463773188659, | |
| "grad_norm": 0.03412451222538948, | |
| "learning_rate": 2.351e-05, | |
| "loss": 0.0101, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1221671.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.9310465523276164, | |
| "grad_norm": 6.6741814613342285, | |
| "learning_rate": 2.341e-05, | |
| "loss": 0.085, | |
| "mean_token_accuracy": 0.9749999940395355, | |
| "num_tokens": 1226356.0, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.9345467273363668, | |
| "grad_norm": 4.757784366607666, | |
| "learning_rate": 2.3310000000000002e-05, | |
| "loss": 0.1041, | |
| "mean_token_accuracy": 0.9649999916553498, | |
| "num_tokens": 1230966.0, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.9380469023451172, | |
| "grad_norm": 0.1483946591615677, | |
| "learning_rate": 2.321e-05, | |
| "loss": 0.0043, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 1235570.0, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.9415470773538677, | |
| "grad_norm": 4.1783976554870605, | |
| "learning_rate": 2.311e-05, | |
| "loss": 0.0297, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1240120.0, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.9450472523626181, | |
| "grad_norm": 0.006953865755349398, | |
| "learning_rate": 2.301e-05, | |
| "loss": 0.0036, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 1244744.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.9485474273713685, | |
| "grad_norm": 7.843381881713867, | |
| "learning_rate": 2.2910000000000003e-05, | |
| "loss": 0.0747, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1249304.0, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.952047602380119, | |
| "grad_norm": 2.691250801086426, | |
| "learning_rate": 2.281e-05, | |
| "loss": 0.0341, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1253884.0, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.9555477773888694, | |
| "grad_norm": 0.048404838889837265, | |
| "learning_rate": 2.271e-05, | |
| "loss": 0.0236, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1258643.0, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.9590479523976199, | |
| "grad_norm": 0.10087752342224121, | |
| "learning_rate": 2.2610000000000002e-05, | |
| "loss": 0.0454, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1263197.0, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.9625481274063703, | |
| "grad_norm": 0.66507887840271, | |
| "learning_rate": 2.251e-05, | |
| "loss": 0.0573, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1267826.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.9660483024151207, | |
| "grad_norm": 0.0337546281516552, | |
| "learning_rate": 2.241e-05, | |
| "loss": 0.0534, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1272318.0, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.9695484774238712, | |
| "grad_norm": 0.022819435223937035, | |
| "learning_rate": 2.231e-05, | |
| "loss": 0.0262, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1276943.0, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.9730486524326216, | |
| "grad_norm": 0.023641835898160934, | |
| "learning_rate": 2.221e-05, | |
| "loss": 0.0152, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1281549.0, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.9765488274413721, | |
| "grad_norm": 3.202338695526123, | |
| "learning_rate": 2.211e-05, | |
| "loss": 0.0755, | |
| "mean_token_accuracy": 0.9799999952316284, | |
| "num_tokens": 1286053.0, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.9800490024501225, | |
| "grad_norm": 0.052319396287202835, | |
| "learning_rate": 2.201e-05, | |
| "loss": 0.0376, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1290719.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.9835491774588729, | |
| "grad_norm": 10.303972244262695, | |
| "learning_rate": 2.191e-05, | |
| "loss": 0.0052, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1295220.0, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.9870493524676234, | |
| "grad_norm": 0.11650484800338745, | |
| "learning_rate": 2.181e-05, | |
| "loss": 0.0036, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 1299736.0, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.9905495274763738, | |
| "grad_norm": 0.016379429027438164, | |
| "learning_rate": 2.171e-05, | |
| "loss": 0.0403, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1304307.0, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.9940497024851243, | |
| "grad_norm": 0.07140190899372101, | |
| "learning_rate": 2.1609999999999998e-05, | |
| "loss": 0.0358, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1308761.0, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.9975498774938747, | |
| "grad_norm": 0.3014475405216217, | |
| "learning_rate": 2.1510000000000002e-05, | |
| "loss": 0.0319, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1313336.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 1.0010500525026251, | |
| "grad_norm": 0.06447609513998032, | |
| "learning_rate": 2.1410000000000003e-05, | |
| "loss": 0.0348, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1317756.0, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 1.0045502275113756, | |
| "grad_norm": 0.02268841676414013, | |
| "learning_rate": 2.131e-05, | |
| "loss": 0.0218, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1322331.0, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 1.008050402520126, | |
| "grad_norm": 0.042231637984514236, | |
| "learning_rate": 2.1210000000000002e-05, | |
| "loss": 0.0023, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 1326941.0, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 1.0115505775288764, | |
| "grad_norm": 5.811006546020508, | |
| "learning_rate": 2.1110000000000003e-05, | |
| "loss": 0.0024, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 1331630.0, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 1.0150507525376269, | |
| "grad_norm": 0.0456203818321228, | |
| "learning_rate": 2.101e-05, | |
| "loss": 0.0011, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 1336316.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.0185509275463773, | |
| "grad_norm": 0.2723388671875, | |
| "learning_rate": 2.091e-05, | |
| "loss": 0.0114, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1341005.0, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 1.0220511025551278, | |
| "grad_norm": 0.0164664164185524, | |
| "learning_rate": 2.0810000000000002e-05, | |
| "loss": 0.0016, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 1345651.0, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 1.0255512775638782, | |
| "grad_norm": 0.38593819737434387, | |
| "learning_rate": 2.0710000000000003e-05, | |
| "loss": 0.0395, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1350333.0, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 1.0290514525726286, | |
| "grad_norm": 17.3580379486084, | |
| "learning_rate": 2.061e-05, | |
| "loss": 0.103, | |
| "mean_token_accuracy": 0.9749999940395355, | |
| "num_tokens": 1354894.0, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 1.032551627581379, | |
| "grad_norm": 0.019541358575224876, | |
| "learning_rate": 2.0510000000000002e-05, | |
| "loss": 0.0496, | |
| "mean_token_accuracy": 0.9799999952316284, | |
| "num_tokens": 1359645.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 1.0360518025901295, | |
| "grad_norm": 1.1783517599105835, | |
| "learning_rate": 2.0410000000000003e-05, | |
| "loss": 0.06, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1364197.0, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 1.03955197759888, | |
| "grad_norm": 0.03887060657143593, | |
| "learning_rate": 2.031e-05, | |
| "loss": 0.0118, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1368857.0, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 1.0430521526076304, | |
| "grad_norm": 0.11780918389558792, | |
| "learning_rate": 2.021e-05, | |
| "loss": 0.0336, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1373483.0, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 1.0465523276163808, | |
| "grad_norm": 0.06139334291219711, | |
| "learning_rate": 2.0110000000000002e-05, | |
| "loss": 0.0924, | |
| "mean_token_accuracy": 0.9799999952316284, | |
| "num_tokens": 1378101.0, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 1.0500525026251313, | |
| "grad_norm": 0.0635937973856926, | |
| "learning_rate": 2.001e-05, | |
| "loss": 0.0472, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.0500525026251313, | |
| "eval_accuracy": 0.44662034112444726, | |
| "eval_f1": 0.3874674882536518, | |
| "eval_loss": 0.041092198342084885, | |
| "eval_mean_token_accuracy": 0.9890151577766495, | |
| "eval_num_tokens": 1382635.0, | |
| "eval_precision": 0.4642550079051411, | |
| "eval_recall": 0.4463958399837153, | |
| "eval_runtime": 243.8353, | |
| "eval_samples_per_second": 6.492, | |
| "eval_steps_per_second": 0.812, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.0535526776338817, | |
| "grad_norm": 0.05304880812764168, | |
| "learning_rate": 1.991e-05, | |
| "loss": 0.0065, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1387247.0, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 1.0570528526426322, | |
| "grad_norm": 0.09396978467702866, | |
| "learning_rate": 1.9810000000000002e-05, | |
| "loss": 0.03, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1391905.0, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 1.0605530276513826, | |
| "grad_norm": 0.02283914014697075, | |
| "learning_rate": 1.971e-05, | |
| "loss": 0.0412, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1396551.0, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 1.064053202660133, | |
| "grad_norm": 0.14994072914123535, | |
| "learning_rate": 1.961e-05, | |
| "loss": 0.0387, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1401184.0, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 1.0675533776688835, | |
| "grad_norm": 0.04891595244407654, | |
| "learning_rate": 1.951e-05, | |
| "loss": 0.04, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1405794.0, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 1.071053552677634, | |
| "grad_norm": 8.5429105758667, | |
| "learning_rate": 1.941e-05, | |
| "loss": 0.0338, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1410352.0, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 1.0745537276863844, | |
| "grad_norm": 6.096926212310791, | |
| "learning_rate": 1.931e-05, | |
| "loss": 0.0823, | |
| "mean_token_accuracy": 0.9749999940395355, | |
| "num_tokens": 1414922.0, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 1.0780539026951348, | |
| "grad_norm": 0.01014970988035202, | |
| "learning_rate": 1.921e-05, | |
| "loss": 0.0596, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1419518.0, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 1.0815540777038852, | |
| "grad_norm": 8.101456642150879, | |
| "learning_rate": 1.911e-05, | |
| "loss": 0.0309, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1424063.0, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 1.0850542527126357, | |
| "grad_norm": 0.03248458355665207, | |
| "learning_rate": 1.901e-05, | |
| "loss": 0.0081, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1428657.0, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.088554427721386, | |
| "grad_norm": 0.284970760345459, | |
| "learning_rate": 1.891e-05, | |
| "loss": 0.0035, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 1433340.0, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 1.0920546027301365, | |
| "grad_norm": 5.219287872314453, | |
| "learning_rate": 1.881e-05, | |
| "loss": 0.0339, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1437947.0, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 1.095554777738887, | |
| "grad_norm": 0.021635359153151512, | |
| "learning_rate": 1.871e-05, | |
| "loss": 0.0533, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1442589.0, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 1.0990549527476374, | |
| "grad_norm": 0.05187542736530304, | |
| "learning_rate": 1.861e-05, | |
| "loss": 0.0297, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1447150.0, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 1.1025551277563879, | |
| "grad_norm": 0.05536261200904846, | |
| "learning_rate": 1.851e-05, | |
| "loss": 0.0013, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 1451702.0, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 1.1060553027651383, | |
| "grad_norm": 4.796628475189209, | |
| "learning_rate": 1.841e-05, | |
| "loss": 0.0073, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 1456224.0, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 1.1095554777738887, | |
| "grad_norm": 4.390865325927734, | |
| "learning_rate": 1.8310000000000003e-05, | |
| "loss": 0.0532, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1460755.0, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 1.1130556527826392, | |
| "grad_norm": 0.12759913504123688, | |
| "learning_rate": 1.8210000000000004e-05, | |
| "loss": 0.0183, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1465282.0, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 1.1165558277913896, | |
| "grad_norm": 0.023097023367881775, | |
| "learning_rate": 1.811e-05, | |
| "loss": 0.0012, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 1469858.0, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 1.12005600280014, | |
| "grad_norm": 0.013977882452309132, | |
| "learning_rate": 1.8010000000000002e-05, | |
| "loss": 0.0598, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1474377.0, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.1235561778088905, | |
| "grad_norm": 0.03361167758703232, | |
| "learning_rate": 1.7910000000000003e-05, | |
| "loss": 0.0238, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1478946.0, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 1.127056352817641, | |
| "grad_norm": 0.08658773452043533, | |
| "learning_rate": 1.781e-05, | |
| "loss": 0.0542, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1483522.0, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 1.1305565278263914, | |
| "grad_norm": 0.030420592054724693, | |
| "learning_rate": 1.771e-05, | |
| "loss": 0.0509, | |
| "mean_token_accuracy": 0.9799999952316284, | |
| "num_tokens": 1488282.0, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 1.1340567028351418, | |
| "grad_norm": 0.10281772166490555, | |
| "learning_rate": 1.7610000000000002e-05, | |
| "loss": 0.0018, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 1492851.0, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 1.1375568778438923, | |
| "grad_norm": 0.061314165592193604, | |
| "learning_rate": 1.751e-05, | |
| "loss": 0.0397, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1497490.0, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.1410570528526427, | |
| "grad_norm": 0.05558156967163086, | |
| "learning_rate": 1.741e-05, | |
| "loss": 0.0341, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1502033.0, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 1.1445572278613931, | |
| "grad_norm": 0.7785694003105164, | |
| "learning_rate": 1.7310000000000002e-05, | |
| "loss": 0.0501, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1506574.0, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 1.1480574028701436, | |
| "grad_norm": 1.054373025894165, | |
| "learning_rate": 1.721e-05, | |
| "loss": 0.0073, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1511166.0, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 1.151557577878894, | |
| "grad_norm": 13.361648559570312, | |
| "learning_rate": 1.711e-05, | |
| "loss": 0.0203, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1515897.0, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 1.1550577528876445, | |
| "grad_norm": 9.834617614746094, | |
| "learning_rate": 1.701e-05, | |
| "loss": 0.0487, | |
| "mean_token_accuracy": 0.9749999940395355, | |
| "num_tokens": 1520628.0, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.158557927896395, | |
| "grad_norm": 0.03448121249675751, | |
| "learning_rate": 1.6910000000000002e-05, | |
| "loss": 0.0257, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1525210.0, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 1.1620581029051453, | |
| "grad_norm": 0.11401532590389252, | |
| "learning_rate": 1.681e-05, | |
| "loss": 0.0379, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1529651.0, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 1.1655582779138958, | |
| "grad_norm": 3.9310457706451416, | |
| "learning_rate": 1.671e-05, | |
| "loss": 0.0212, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1534286.0, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 1.1690584529226462, | |
| "grad_norm": 0.012804349884390831, | |
| "learning_rate": 1.6610000000000002e-05, | |
| "loss": 0.0012, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 1538944.0, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 1.1725586279313966, | |
| "grad_norm": 0.7828325033187866, | |
| "learning_rate": 1.651e-05, | |
| "loss": 0.0346, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1543591.0, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.176058802940147, | |
| "grad_norm": 0.027147287502884865, | |
| "learning_rate": 1.641e-05, | |
| "loss": 0.0021, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 1548149.0, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 1.1795589779488975, | |
| "grad_norm": 0.05930430442094803, | |
| "learning_rate": 1.631e-05, | |
| "loss": 0.0335, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1552807.0, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 1.183059152957648, | |
| "grad_norm": 0.03868912532925606, | |
| "learning_rate": 1.621e-05, | |
| "loss": 0.0871, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1557375.0, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 1.1865593279663984, | |
| "grad_norm": 0.038131892681121826, | |
| "learning_rate": 1.611e-05, | |
| "loss": 0.0129, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1561984.0, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 1.1900595029751488, | |
| "grad_norm": 0.3329053521156311, | |
| "learning_rate": 1.601e-05, | |
| "loss": 0.0173, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1566645.0, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.1935596779838993, | |
| "grad_norm": 0.08648809045553207, | |
| "learning_rate": 1.591e-05, | |
| "loss": 0.0916, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1571275.0, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 1.1970598529926497, | |
| "grad_norm": 9.256734848022461, | |
| "learning_rate": 1.581e-05, | |
| "loss": 0.0156, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1575848.0, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 1.2005600280014002, | |
| "grad_norm": 0.20919840037822723, | |
| "learning_rate": 1.571e-05, | |
| "loss": 0.028, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1580383.0, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 1.2040602030101506, | |
| "grad_norm": 8.012375831604004, | |
| "learning_rate": 1.561e-05, | |
| "loss": 0.0042, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 1584986.0, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 1.207560378018901, | |
| "grad_norm": 0.024143755435943604, | |
| "learning_rate": 1.551e-05, | |
| "loss": 0.0021, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 1589541.0, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.2110605530276515, | |
| "grad_norm": 3.953441619873047, | |
| "learning_rate": 1.541e-05, | |
| "loss": 0.043, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1594177.0, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 1.214560728036402, | |
| "grad_norm": 3.8087575435638428, | |
| "learning_rate": 1.531e-05, | |
| "loss": 0.0538, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1598836.0, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 1.2180609030451524, | |
| "grad_norm": 0.01786259561777115, | |
| "learning_rate": 1.5210000000000002e-05, | |
| "loss": 0.0583, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1603561.0, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 1.2215610780539028, | |
| "grad_norm": 0.026305489242076874, | |
| "learning_rate": 1.5110000000000003e-05, | |
| "loss": 0.0121, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1608176.0, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 1.2250612530626532, | |
| "grad_norm": 0.026074456050992012, | |
| "learning_rate": 1.5010000000000002e-05, | |
| "loss": 0.0616, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.2250612530626532, | |
| "eval_accuracy": 0.4403032217308907, | |
| "eval_f1": 0.3717722608671718, | |
| "eval_loss": 0.04926339536905289, | |
| "eval_mean_token_accuracy": 0.9881313193326045, | |
| "eval_num_tokens": 1612812.0, | |
| "eval_precision": 0.44875078714612676, | |
| "eval_recall": 0.39254694591331135, | |
| "eval_runtime": 245.1895, | |
| "eval_samples_per_second": 6.456, | |
| "eval_steps_per_second": 0.808, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.2285614280714037, | |
| "grad_norm": 0.0484970398247242, | |
| "learning_rate": 1.4910000000000001e-05, | |
| "loss": 0.0593, | |
| "mean_token_accuracy": 0.9874999970197678, | |
| "num_tokens": 1617379.0, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 1.232061603080154, | |
| "grad_norm": 12.298089981079102, | |
| "learning_rate": 1.4810000000000002e-05, | |
| "loss": 0.0659, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1622039.0, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 1.2355617780889045, | |
| "grad_norm": 0.022822504863142967, | |
| "learning_rate": 1.4710000000000001e-05, | |
| "loss": 0.0392, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1626629.0, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 1.239061953097655, | |
| "grad_norm": 0.19993631541728973, | |
| "learning_rate": 1.461e-05, | |
| "loss": 0.003, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 1631184.0, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 1.2425621281064054, | |
| "grad_norm": 0.01650061272084713, | |
| "learning_rate": 1.4510000000000002e-05, | |
| "loss": 0.0016, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 1635787.0, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.2460623031151559, | |
| "grad_norm": 0.09447409212589264, | |
| "learning_rate": 1.4410000000000001e-05, | |
| "loss": 0.0111, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1640489.0, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 1.2495624781239063, | |
| "grad_norm": 1.348664402961731, | |
| "learning_rate": 1.4310000000000002e-05, | |
| "loss": 0.0037, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 1645122.0, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 1.2530626531326567, | |
| "grad_norm": 0.02807781472802162, | |
| "learning_rate": 1.4210000000000001e-05, | |
| "loss": 0.0101, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1649714.0, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 1.2565628281414072, | |
| "grad_norm": 0.0278321523219347, | |
| "learning_rate": 1.411e-05, | |
| "loss": 0.0317, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1654253.0, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 1.2600630031501576, | |
| "grad_norm": 0.05552316829562187, | |
| "learning_rate": 1.4010000000000001e-05, | |
| "loss": 0.0338, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1658870.0, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.263563178158908, | |
| "grad_norm": 0.5879592895507812, | |
| "learning_rate": 1.391e-05, | |
| "loss": 0.0768, | |
| "mean_token_accuracy": 0.9799999952316284, | |
| "num_tokens": 1663540.0, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 1.2670633531676585, | |
| "grad_norm": 0.23051026463508606, | |
| "learning_rate": 1.381e-05, | |
| "loss": 0.0366, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1668164.0, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 1.270563528176409, | |
| "grad_norm": 7.013516426086426, | |
| "learning_rate": 1.3710000000000001e-05, | |
| "loss": 0.0831, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1672778.0, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 1.2740637031851594, | |
| "grad_norm": 0.20101211965084076, | |
| "learning_rate": 1.361e-05, | |
| "loss": 0.0336, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1677340.0, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 1.2775638781939098, | |
| "grad_norm": 0.3626852035522461, | |
| "learning_rate": 1.3510000000000001e-05, | |
| "loss": 0.043, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1681942.0, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.2810640532026603, | |
| "grad_norm": 0.053018514066934586, | |
| "learning_rate": 1.341e-05, | |
| "loss": 0.045, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1686613.0, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 1.2845642282114107, | |
| "grad_norm": 7.274749755859375, | |
| "learning_rate": 1.331e-05, | |
| "loss": 0.1322, | |
| "mean_token_accuracy": 0.9749999940395355, | |
| "num_tokens": 1691194.0, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 1.2880644032201611, | |
| "grad_norm": 0.05607298016548157, | |
| "learning_rate": 1.321e-05, | |
| "loss": 0.0384, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1695910.0, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 1.2915645782289116, | |
| "grad_norm": 0.03872371464967728, | |
| "learning_rate": 1.311e-05, | |
| "loss": 0.0138, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1700440.0, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 1.295064753237662, | |
| "grad_norm": 0.042605865746736526, | |
| "learning_rate": 1.301e-05, | |
| "loss": 0.0845, | |
| "mean_token_accuracy": 0.9799999952316284, | |
| "num_tokens": 1705135.0, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.2985649282464125, | |
| "grad_norm": 4.082870006561279, | |
| "learning_rate": 1.291e-05, | |
| "loss": 0.0633, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1709718.0, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 1.302065103255163, | |
| "grad_norm": 5.0214691162109375, | |
| "learning_rate": 1.281e-05, | |
| "loss": 0.0522, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1714292.0, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 1.3055652782639133, | |
| "grad_norm": 0.05840720981359482, | |
| "learning_rate": 1.271e-05, | |
| "loss": 0.0667, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1718744.0, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 1.3090654532726638, | |
| "grad_norm": 0.3673993647098541, | |
| "learning_rate": 1.261e-05, | |
| "loss": 0.0463, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1723373.0, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 1.312565628281414, | |
| "grad_norm": 0.06860412657260895, | |
| "learning_rate": 1.2509999999999999e-05, | |
| "loss": 0.0414, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1727984.0, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.3160658032901644, | |
| "grad_norm": 0.03777327015995979, | |
| "learning_rate": 1.2410000000000001e-05, | |
| "loss": 0.0116, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1732531.0, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 1.3195659782989149, | |
| "grad_norm": 0.7017369270324707, | |
| "learning_rate": 1.231e-05, | |
| "loss": 0.0685, | |
| "mean_token_accuracy": 0.9799999952316284, | |
| "num_tokens": 1737147.0, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 1.3230661533076653, | |
| "grad_norm": 8.006946563720703, | |
| "learning_rate": 1.221e-05, | |
| "loss": 0.0483, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1741794.0, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 1.3265663283164157, | |
| "grad_norm": 7.42986536026001, | |
| "learning_rate": 1.2110000000000001e-05, | |
| "loss": 0.0691, | |
| "mean_token_accuracy": 0.9799999952316284, | |
| "num_tokens": 1746457.0, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 1.3300665033251662, | |
| "grad_norm": 0.08513722568750381, | |
| "learning_rate": 1.201e-05, | |
| "loss": 0.0096, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1751031.0, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.3335666783339166, | |
| "grad_norm": 5.149372577667236, | |
| "learning_rate": 1.1910000000000001e-05, | |
| "loss": 0.1008, | |
| "mean_token_accuracy": 0.9749999940395355, | |
| "num_tokens": 1755649.0, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 1.337066853342667, | |
| "grad_norm": 0.10420811176300049, | |
| "learning_rate": 1.181e-05, | |
| "loss": 0.0901, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1760094.0, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 1.3405670283514175, | |
| "grad_norm": 0.15396250784397125, | |
| "learning_rate": 1.171e-05, | |
| "loss": 0.006, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 1764739.0, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 1.344067203360168, | |
| "grad_norm": 0.08703949302434921, | |
| "learning_rate": 1.161e-05, | |
| "loss": 0.0277, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1769426.0, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 1.3475673783689184, | |
| "grad_norm": 2.2800724506378174, | |
| "learning_rate": 1.151e-05, | |
| "loss": 0.0215, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1774042.0, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.3510675533776688, | |
| "grad_norm": 7.864820957183838, | |
| "learning_rate": 1.141e-05, | |
| "loss": 0.0396, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1778764.0, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 1.3545677283864193, | |
| "grad_norm": 18.59937286376953, | |
| "learning_rate": 1.1310000000000002e-05, | |
| "loss": 0.0501, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1783410.0, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 1.3580679033951697, | |
| "grad_norm": 0.06370130181312561, | |
| "learning_rate": 1.1210000000000001e-05, | |
| "loss": 0.0804, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1788086.0, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 1.3615680784039201, | |
| "grad_norm": 3.136486053466797, | |
| "learning_rate": 1.111e-05, | |
| "loss": 0.0549, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1792576.0, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 1.3650682534126706, | |
| "grad_norm": 0.080386683344841, | |
| "learning_rate": 1.1010000000000001e-05, | |
| "loss": 0.0105, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1797137.0, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.368568428421421, | |
| "grad_norm": 0.311697393655777, | |
| "learning_rate": 1.091e-05, | |
| "loss": 0.0016, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 1801751.0, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 1.3720686034301715, | |
| "grad_norm": 0.04613969102501869, | |
| "learning_rate": 1.081e-05, | |
| "loss": 0.0236, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1806217.0, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 1.375568778438922, | |
| "grad_norm": 2.0834603309631348, | |
| "learning_rate": 1.071e-05, | |
| "loss": 0.0064, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1810797.0, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 1.3790689534476723, | |
| "grad_norm": 4.339105129241943, | |
| "learning_rate": 1.061e-05, | |
| "loss": 0.0572, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1815366.0, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 1.3825691284564228, | |
| "grad_norm": 0.03018569014966488, | |
| "learning_rate": 1.0510000000000001e-05, | |
| "loss": 0.0121, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1819910.0, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.3860693034651732, | |
| "grad_norm": 0.02608495019376278, | |
| "learning_rate": 1.041e-05, | |
| "loss": 0.0203, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1824620.0, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 1.3895694784739236, | |
| "grad_norm": 0.028722476214170456, | |
| "learning_rate": 1.031e-05, | |
| "loss": 0.0524, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1829297.0, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 1.393069653482674, | |
| "grad_norm": 3.794125556945801, | |
| "learning_rate": 1.021e-05, | |
| "loss": 0.1004, | |
| "mean_token_accuracy": 0.9799999952316284, | |
| "num_tokens": 1833884.0, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 1.3965698284914245, | |
| "grad_norm": 0.0639004036784172, | |
| "learning_rate": 1.011e-05, | |
| "loss": 0.0432, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1838429.0, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 1.400070003500175, | |
| "grad_norm": 0.08853046596050262, | |
| "learning_rate": 1.001e-05, | |
| "loss": 0.0531, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.400070003500175, | |
| "eval_accuracy": 0.4491471888818699, | |
| "eval_f1": 0.3831962155491568, | |
| "eval_loss": 0.04581384360790253, | |
| "eval_mean_token_accuracy": 0.988113282003788, | |
| "eval_num_tokens": 1843093.0, | |
| "eval_precision": 0.4551209732080744, | |
| "eval_recall": 0.42343303927833725, | |
| "eval_runtime": 243.4032, | |
| "eval_samples_per_second": 6.504, | |
| "eval_steps_per_second": 0.813, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.4035701785089254, | |
| "grad_norm": 1.6858179569244385, | |
| "learning_rate": 9.91e-06, | |
| "loss": 0.0544, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1847649.0, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 1.4070703535176758, | |
| "grad_norm": 4.908888339996338, | |
| "learning_rate": 9.810000000000001e-06, | |
| "loss": 0.0405, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1852259.0, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 1.4105705285264263, | |
| "grad_norm": 0.08344841003417969, | |
| "learning_rate": 9.71e-06, | |
| "loss": 0.0251, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1856863.0, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 1.4140707035351767, | |
| "grad_norm": 0.43226656317710876, | |
| "learning_rate": 9.610000000000001e-06, | |
| "loss": 0.0363, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1861413.0, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 1.4175708785439272, | |
| "grad_norm": 5.785868167877197, | |
| "learning_rate": 9.51e-06, | |
| "loss": 0.069, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1865973.0, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.4210710535526776, | |
| "grad_norm": 0.880620002746582, | |
| "learning_rate": 9.410000000000001e-06, | |
| "loss": 0.0436, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1870553.0, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 1.424571228561428, | |
| "grad_norm": 6.6892218589782715, | |
| "learning_rate": 9.31e-06, | |
| "loss": 0.0627, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1875126.0, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 1.4280714035701785, | |
| "grad_norm": 0.048246119171381, | |
| "learning_rate": 9.21e-06, | |
| "loss": 0.0428, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1879726.0, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 1.431571578578929, | |
| "grad_norm": 0.07305438071489334, | |
| "learning_rate": 9.110000000000001e-06, | |
| "loss": 0.0387, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1884286.0, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 1.4350717535876794, | |
| "grad_norm": 11.415247917175293, | |
| "learning_rate": 9.01e-06, | |
| "loss": 0.0821, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1888772.0, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.4385719285964298, | |
| "grad_norm": 0.0724155455827713, | |
| "learning_rate": 8.910000000000001e-06, | |
| "loss": 0.0189, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1893335.0, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 1.4420721036051802, | |
| "grad_norm": 0.11276718974113464, | |
| "learning_rate": 8.81e-06, | |
| "loss": 0.0218, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1898013.0, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 1.4455722786139307, | |
| "grad_norm": 0.07353251427412033, | |
| "learning_rate": 8.71e-06, | |
| "loss": 0.0018, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 1902625.0, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 1.4490724536226811, | |
| "grad_norm": 0.031495820730924606, | |
| "learning_rate": 8.61e-06, | |
| "loss": 0.025, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1907228.0, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 1.4525726286314316, | |
| "grad_norm": 13.788881301879883, | |
| "learning_rate": 8.51e-06, | |
| "loss": 0.0804, | |
| "mean_token_accuracy": 0.975, | |
| "num_tokens": 1911797.0, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.456072803640182, | |
| "grad_norm": 0.05939871817827225, | |
| "learning_rate": 8.409999999999999e-06, | |
| "loss": 0.0432, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1916403.0, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 1.4595729786489324, | |
| "grad_norm": 11.028568267822266, | |
| "learning_rate": 8.31e-06, | |
| "loss": 0.0425, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1920937.0, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 1.4630731536576829, | |
| "grad_norm": 0.048251356929540634, | |
| "learning_rate": 8.210000000000001e-06, | |
| "loss": 0.0328, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1925503.0, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 1.4665733286664333, | |
| "grad_norm": 0.05744925141334534, | |
| "learning_rate": 8.11e-06, | |
| "loss": 0.0581, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1930161.0, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 1.4700735036751837, | |
| "grad_norm": 4.82538366317749, | |
| "learning_rate": 8.010000000000001e-06, | |
| "loss": 0.0637, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1934851.0, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.4735736786839342, | |
| "grad_norm": 0.015897316858172417, | |
| "learning_rate": 7.91e-06, | |
| "loss": 0.0553, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1939517.0, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 1.4770738536926846, | |
| "grad_norm": 0.4943805932998657, | |
| "learning_rate": 7.810000000000001e-06, | |
| "loss": 0.064, | |
| "mean_token_accuracy": 0.9799999952316284, | |
| "num_tokens": 1944221.0, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 1.480574028701435, | |
| "grad_norm": 0.8401426672935486, | |
| "learning_rate": 7.71e-06, | |
| "loss": 0.004, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 1948737.0, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 1.4840742037101855, | |
| "grad_norm": 8.93281364440918, | |
| "learning_rate": 7.610000000000001e-06, | |
| "loss": 0.0687, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1953245.0, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 1.487574378718936, | |
| "grad_norm": 24.4106388092041, | |
| "learning_rate": 7.51e-06, | |
| "loss": 0.0928, | |
| "mean_token_accuracy": 0.9749999940395355, | |
| "num_tokens": 1957794.0, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.4910745537276864, | |
| "grad_norm": 0.38299062848091125, | |
| "learning_rate": 7.41e-06, | |
| "loss": 0.0376, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1962327.0, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 1.4945747287364368, | |
| "grad_norm": 0.06252578645944595, | |
| "learning_rate": 7.31e-06, | |
| "loss": 0.0196, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1967028.0, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 1.4980749037451873, | |
| "grad_norm": 1.5602178573608398, | |
| "learning_rate": 7.2100000000000004e-06, | |
| "loss": 0.023, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 1971740.0, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 1.5015750787539377, | |
| "grad_norm": 0.031557030975818634, | |
| "learning_rate": 7.11e-06, | |
| "loss": 0.0233, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1976313.0, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 1.5050752537626881, | |
| "grad_norm": 0.027841169387102127, | |
| "learning_rate": 7.01e-06, | |
| "loss": 0.022, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1980963.0, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.5085754287714386, | |
| "grad_norm": 0.62822425365448, | |
| "learning_rate": 6.91e-06, | |
| "loss": 0.0481, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 1985494.0, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 1.512075603780189, | |
| "grad_norm": 0.05204153060913086, | |
| "learning_rate": 6.81e-06, | |
| "loss": 0.0123, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 1990229.0, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 1.5155757787889395, | |
| "grad_norm": 0.030517544597387314, | |
| "learning_rate": 6.710000000000001e-06, | |
| "loss": 0.0037, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 1994888.0, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 1.51907595379769, | |
| "grad_norm": 0.03292595595121384, | |
| "learning_rate": 6.610000000000001e-06, | |
| "loss": 0.0739, | |
| "mean_token_accuracy": 0.9799999952316284, | |
| "num_tokens": 1999584.0, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 1.5225761288064403, | |
| "grad_norm": 0.04422605782747269, | |
| "learning_rate": 6.510000000000001e-06, | |
| "loss": 0.0677, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 2004217.0, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.5260763038151908, | |
| "grad_norm": 0.03554658591747284, | |
| "learning_rate": 6.4100000000000005e-06, | |
| "loss": 0.0764, | |
| "mean_token_accuracy": 0.9749999940395355, | |
| "num_tokens": 2008889.0, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 1.5295764788239412, | |
| "grad_norm": 3.288350820541382, | |
| "learning_rate": 6.3100000000000006e-06, | |
| "loss": 0.0361, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 2013499.0, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 1.5330766538326916, | |
| "grad_norm": 0.06462374329566956, | |
| "learning_rate": 6.210000000000001e-06, | |
| "loss": 0.0048, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 2018094.0, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 1.536576828841442, | |
| "grad_norm": 0.1262829601764679, | |
| "learning_rate": 6.110000000000001e-06, | |
| "loss": 0.0165, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 2022654.0, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 1.5400770038501925, | |
| "grad_norm": 1.0548720359802246, | |
| "learning_rate": 6.01e-06, | |
| "loss": 0.0459, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 2027308.0, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.543577178858943, | |
| "grad_norm": 10.028485298156738, | |
| "learning_rate": 5.91e-06, | |
| "loss": 0.0683, | |
| "mean_token_accuracy": 0.9799999952316284, | |
| "num_tokens": 2031892.0, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 1.5470773538676934, | |
| "grad_norm": 9.958955764770508, | |
| "learning_rate": 5.81e-06, | |
| "loss": 0.0637, | |
| "mean_token_accuracy": 0.9799999952316284, | |
| "num_tokens": 2036478.0, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 1.5505775288764438, | |
| "grad_norm": 0.04276181757450104, | |
| "learning_rate": 5.71e-06, | |
| "loss": 0.0782, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 2041074.0, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 1.5540777038851943, | |
| "grad_norm": 0.047367651015520096, | |
| "learning_rate": 5.61e-06, | |
| "loss": 0.0377, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 2045668.0, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 1.5575778788939447, | |
| "grad_norm": 4.863480091094971, | |
| "learning_rate": 5.510000000000001e-06, | |
| "loss": 0.0285, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 2050297.0, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.5610780539026952, | |
| "grad_norm": 6.1144537925720215, | |
| "learning_rate": 5.410000000000001e-06, | |
| "loss": 0.0646, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 2054920.0, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 1.5645782289114456, | |
| "grad_norm": 0.05516400188207626, | |
| "learning_rate": 5.31e-06, | |
| "loss": 0.0097, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 2059412.0, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 1.568078403920196, | |
| "grad_norm": 0.06957421451807022, | |
| "learning_rate": 5.21e-06, | |
| "loss": 0.0453, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 2063933.0, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 1.5715785789289465, | |
| "grad_norm": 0.59195876121521, | |
| "learning_rate": 5.11e-06, | |
| "loss": 0.0545, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 2068549.0, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 1.575078753937697, | |
| "grad_norm": 4.435554504394531, | |
| "learning_rate": 5.01e-06, | |
| "loss": 0.0137, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.575078753937697, | |
| "eval_accuracy": 0.44662034112444726, | |
| "eval_f1": 0.3821042468723695, | |
| "eval_loss": 0.04538816958665848, | |
| "eval_mean_token_accuracy": 0.9878607569920896, | |
| "eval_num_tokens": 2073095.0, | |
| "eval_precision": 0.45302304542991134, | |
| "eval_recall": 0.42551771400250105, | |
| "eval_runtime": 243.4481, | |
| "eval_samples_per_second": 6.502, | |
| "eval_steps_per_second": 0.813, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.5785789289464474, | |
| "grad_norm": 0.03762364760041237, | |
| "learning_rate": 4.9100000000000004e-06, | |
| "loss": 0.026, | |
| "mean_token_accuracy": 0.9924999982118606, | |
| "num_tokens": 2077816.0, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 1.5820791039551978, | |
| "grad_norm": 0.038492508232593536, | |
| "learning_rate": 4.81e-06, | |
| "loss": 0.037, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 2082369.0, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 1.5855792789639482, | |
| "grad_norm": 0.04851048067212105, | |
| "learning_rate": 4.710000000000001e-06, | |
| "loss": 0.1037, | |
| "mean_token_accuracy": 0.9799999952316284, | |
| "num_tokens": 2086980.0, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 1.5890794539726987, | |
| "grad_norm": 0.06226026266813278, | |
| "learning_rate": 4.610000000000001e-06, | |
| "loss": 0.0316, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 2091582.0, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 1.5925796289814491, | |
| "grad_norm": 0.012553819455206394, | |
| "learning_rate": 4.51e-06, | |
| "loss": 0.0426, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 2096137.0, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.5960798039901996, | |
| "grad_norm": 0.12090373784303665, | |
| "learning_rate": 4.41e-06, | |
| "loss": 0.0035, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 2100974.0, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 1.59957997899895, | |
| "grad_norm": 0.024477414786815643, | |
| "learning_rate": 4.31e-06, | |
| "loss": 0.0199, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 2105553.0, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 1.6030801540077004, | |
| "grad_norm": 9.273329734802246, | |
| "learning_rate": 4.21e-06, | |
| "loss": 0.0197, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 2110182.0, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 1.6065803290164509, | |
| "grad_norm": 6.43629264831543, | |
| "learning_rate": 4.11e-06, | |
| "loss": 0.0163, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 2114818.0, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 1.6100805040252013, | |
| "grad_norm": 0.047764312475919724, | |
| "learning_rate": 4.01e-06, | |
| "loss": 0.0112, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 2119456.0, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.6135806790339517, | |
| "grad_norm": 3.2197811603546143, | |
| "learning_rate": 3.910000000000001e-06, | |
| "loss": 0.0357, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 2124191.0, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 1.6170808540427022, | |
| "grad_norm": 0.04559561237692833, | |
| "learning_rate": 3.8100000000000004e-06, | |
| "loss": 0.0896, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 2128813.0, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 1.6205810290514526, | |
| "grad_norm": 0.03245115652680397, | |
| "learning_rate": 3.7100000000000005e-06, | |
| "loss": 0.0167, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 2133363.0, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 1.624081204060203, | |
| "grad_norm": 0.0637376606464386, | |
| "learning_rate": 3.61e-06, | |
| "loss": 0.0161, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 2137919.0, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 1.6275813790689533, | |
| "grad_norm": 0.10170795023441315, | |
| "learning_rate": 3.5100000000000003e-06, | |
| "loss": 0.0619, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 2142475.0, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.6310815540777037, | |
| "grad_norm": 0.11928985267877579, | |
| "learning_rate": 3.41e-06, | |
| "loss": 0.0317, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 2147153.0, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 1.6345817290864542, | |
| "grad_norm": 0.31450000405311584, | |
| "learning_rate": 3.31e-06, | |
| "loss": 0.0262, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 2151803.0, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 1.6380819040952046, | |
| "grad_norm": 0.025399642065167427, | |
| "learning_rate": 3.2099999999999998e-06, | |
| "loss": 0.029, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 2156302.0, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 1.641582079103955, | |
| "grad_norm": 0.07148288935422897, | |
| "learning_rate": 3.11e-06, | |
| "loss": 0.058, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 2160964.0, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 1.6450822541127055, | |
| "grad_norm": 0.043584585189819336, | |
| "learning_rate": 3.01e-06, | |
| "loss": 0.0118, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 2165614.0, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.648582429121456, | |
| "grad_norm": 0.021303439512848854, | |
| "learning_rate": 2.91e-06, | |
| "loss": 0.0593, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 2170098.0, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 1.6520826041302064, | |
| "grad_norm": 3.4671308994293213, | |
| "learning_rate": 2.81e-06, | |
| "loss": 0.0461, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 2174607.0, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 1.6555827791389568, | |
| "grad_norm": 0.03900500759482384, | |
| "learning_rate": 2.71e-06, | |
| "loss": 0.0029, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 2179286.0, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 1.6590829541477072, | |
| "grad_norm": 0.02918989770114422, | |
| "learning_rate": 2.6100000000000004e-06, | |
| "loss": 0.0892, | |
| "mean_token_accuracy": 0.9799999952316284, | |
| "num_tokens": 2183836.0, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 1.6625831291564577, | |
| "grad_norm": 0.029129987582564354, | |
| "learning_rate": 2.51e-06, | |
| "loss": 0.0861, | |
| "mean_token_accuracy": 0.9699999928474426, | |
| "num_tokens": 2188430.0, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.6660833041652081, | |
| "grad_norm": 0.08811552822589874, | |
| "learning_rate": 2.4100000000000002e-06, | |
| "loss": 0.007, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 2193026.0, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 1.6695834791739586, | |
| "grad_norm": 3.6819851398468018, | |
| "learning_rate": 2.31e-06, | |
| "loss": 0.016, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 2197606.0, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 1.673083654182709, | |
| "grad_norm": 0.0475095734000206, | |
| "learning_rate": 2.2100000000000004e-06, | |
| "loss": 0.0318, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 2202131.0, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 1.6765838291914594, | |
| "grad_norm": 5.221133708953857, | |
| "learning_rate": 2.11e-06, | |
| "loss": 0.0138, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 2206887.0, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 1.6800840042002099, | |
| "grad_norm": 8.505605697631836, | |
| "learning_rate": 2.0100000000000002e-06, | |
| "loss": 0.0216, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 2211562.0, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.6835841792089603, | |
| "grad_norm": 0.05636419355869293, | |
| "learning_rate": 1.91e-06, | |
| "loss": 0.0568, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 2216043.0, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 1.6870843542177107, | |
| "grad_norm": 0.06148410961031914, | |
| "learning_rate": 1.8100000000000002e-06, | |
| "loss": 0.0313, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 2220686.0, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 1.6905845292264612, | |
| "grad_norm": 0.1644497960805893, | |
| "learning_rate": 1.7100000000000001e-06, | |
| "loss": 0.0611, | |
| "mean_token_accuracy": 0.9799999952316284, | |
| "num_tokens": 2225264.0, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 1.6940847042352116, | |
| "grad_norm": 8.40280532836914, | |
| "learning_rate": 1.61e-06, | |
| "loss": 0.0134, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 2229777.0, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 1.697584879243962, | |
| "grad_norm": 0.4285930097103119, | |
| "learning_rate": 1.5100000000000002e-06, | |
| "loss": 0.0016, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 2234477.0, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.7010850542527125, | |
| "grad_norm": 0.05217473581433296, | |
| "learning_rate": 1.41e-06, | |
| "loss": 0.0159, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 2239087.0, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 1.704585229261463, | |
| "grad_norm": 17.20269012451172, | |
| "learning_rate": 1.3100000000000002e-06, | |
| "loss": 0.0332, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 2243751.0, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 1.7080854042702134, | |
| "grad_norm": 0.019595852121710777, | |
| "learning_rate": 1.21e-06, | |
| "loss": 0.0438, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 2248371.0, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 1.7115855792789638, | |
| "grad_norm": 0.021653831005096436, | |
| "learning_rate": 1.1100000000000002e-06, | |
| "loss": 0.0014, | |
| "mean_token_accuracy": 1.0, | |
| "num_tokens": 2252963.0, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 1.7150857542877143, | |
| "grad_norm": 0.06774129718542099, | |
| "learning_rate": 1.01e-06, | |
| "loss": 0.0086, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 2257627.0, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.7185859292964647, | |
| "grad_norm": 0.17008963227272034, | |
| "learning_rate": 9.100000000000001e-07, | |
| "loss": 0.0511, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 2262204.0, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 1.7220861043052151, | |
| "grad_norm": 0.05181132256984711, | |
| "learning_rate": 8.1e-07, | |
| "loss": 0.102, | |
| "mean_token_accuracy": 0.9699999928474426, | |
| "num_tokens": 2266782.0, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 1.7255862793139656, | |
| "grad_norm": 0.019397318363189697, | |
| "learning_rate": 7.100000000000001e-07, | |
| "loss": 0.0653, | |
| "mean_token_accuracy": 0.9799999952316284, | |
| "num_tokens": 2271407.0, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 1.729086454322716, | |
| "grad_norm": 12.307579040527344, | |
| "learning_rate": 6.100000000000001e-07, | |
| "loss": 0.0463, | |
| "mean_token_accuracy": 0.9899999976158143, | |
| "num_tokens": 2276067.0, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 1.7325866293314665, | |
| "grad_norm": 0.9262644648551941, | |
| "learning_rate": 5.100000000000001e-07, | |
| "loss": 0.1174, | |
| "mean_token_accuracy": 0.9749999940395355, | |
| "num_tokens": 2280720.0, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.736086804340217, | |
| "grad_norm": 10.878531455993652, | |
| "learning_rate": 4.1000000000000004e-07, | |
| "loss": 0.0366, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 2285258.0, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 1.7395869793489673, | |
| "grad_norm": 0.015575112774968147, | |
| "learning_rate": 3.1e-07, | |
| "loss": 0.013, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 2289807.0, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 1.7430871543577178, | |
| "grad_norm": 6.684645652770996, | |
| "learning_rate": 2.1e-07, | |
| "loss": 0.048, | |
| "mean_token_accuracy": 0.9849999964237213, | |
| "num_tokens": 2294427.0, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 1.7465873293664682, | |
| "grad_norm": 0.2887522280216217, | |
| "learning_rate": 1.1e-07, | |
| "loss": 0.0268, | |
| "mean_token_accuracy": 0.9949999988079071, | |
| "num_tokens": 2299106.0, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 1.7500875043752186, | |
| "grad_norm": 3.4086289405822754, | |
| "learning_rate": 1e-08, | |
| "loss": 0.0635, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.7500875043752186, | |
| "eval_accuracy": 0.4516740366392925, | |
| "eval_f1": 0.3847962989063844, | |
| "eval_loss": 0.04607350006699562, | |
| "eval_mean_token_accuracy": 0.9878607566910561, | |
| "eval_num_tokens": 2303714.0, | |
| "eval_precision": 0.4551334955315969, | |
| "eval_recall": 0.42526555766959756, | |
| "eval_runtime": 245.308, | |
| "eval_samples_per_second": 6.453, | |
| "eval_steps_per_second": 0.807, | |
| "step": 5000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 5000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.477983256915968e+16, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
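
The file above is the state the Hugging Face `Trainer` checkpoints alongside model weights: `log_history` interleaves training records (keyed by `loss`) with evaluation records (keyed by `eval_loss`), and the trailer fields (`logging_steps`, `save_steps`, `max_steps`, `train_batch_size`) record the run configuration. As a minimal sketch of how such a file can be consumed — assuming it is saved under the Trainer's default name `trainer_state.json`, which is not stated in the file itself — the two record types can be separated by which loss key they carry:

```python
import json

# Minimal sketch, assuming the state above was saved as "trainer_state.json"
# (the Hugging Face Trainer's default filename inside a checkpoint directory;
# the path is an assumption, not taken from the file).
with open("trainer_state.json") as f:
    state = json.load(f)

# Training records carry "loss"; evaluation records carry "eval_loss".
# This matches the two record shapes visible in log_history above.
train_logs = [r for r in state["log_history"] if "loss" in r]
eval_logs = [r for r in state["log_history"] if "eval_loss" in r]

print(f"train records: {len(train_logs)}, eval records: {len(eval_logs)}")

# The last eval record holds the final step-5000 metrics
# (eval_loss, eval_f1, eval_precision, eval_recall, ...).
last_eval = eval_logs[-1]
for key in ("step", "eval_loss", "eval_f1", "eval_precision", "eval_recall"):
    print(f"{key}: {last_eval[key]}")
```

Note that a few training records logged at evaluation steps (here, steps 4500 and 5000) omit `mean_token_accuracy` and `num_tokens`, so any per-field extraction should use `r.get(...)` rather than direct indexing.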