{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.7500875043752186, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00035001750087504374, "grad_norm": 7.988319396972656, "learning_rate": 5e-05, "loss": 0.2364, "mean_token_accuracy": 0.8999999761581421, "num_tokens": 452.0, "step": 1 }, { "epoch": 0.0035001750087504373, "grad_norm": 0.8047283291816711, "learning_rate": 4.991e-05, "loss": 0.1339, "mean_token_accuracy": 0.966666665342119, "num_tokens": 4674.0, "step": 10 }, { "epoch": 0.007000350017500875, "grad_norm": 0.08694641292095184, "learning_rate": 4.981e-05, "loss": 0.0457, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 9193.0, "step": 20 }, { "epoch": 0.010500525026251312, "grad_norm": 9.807371139526367, "learning_rate": 4.9710000000000003e-05, "loss": 0.1587, "mean_token_accuracy": 0.9599999904632568, "num_tokens": 13953.0, "step": 30 }, { "epoch": 0.01400070003500175, "grad_norm": 0.12450232356786728, "learning_rate": 4.961e-05, "loss": 0.107, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 18471.0, "step": 40 }, { "epoch": 0.01750087504375219, "grad_norm": 3.2105650901794434, "learning_rate": 4.951e-05, "loss": 0.1113, "mean_token_accuracy": 0.9649999916553498, "num_tokens": 23060.0, "step": 50 }, { "epoch": 0.021001050052502624, "grad_norm": 0.6051918268203735, "learning_rate": 4.941e-05, "loss": 0.0627, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 27672.0, "step": 60 }, { "epoch": 0.024501225061253063, "grad_norm": 5.309004306793213, "learning_rate": 4.931e-05, "loss": 0.125, "mean_token_accuracy": 0.9649999916553498, "num_tokens": 32267.0, "step": 70 }, { "epoch": 0.0280014000700035, "grad_norm": 3.3586971759796143, "learning_rate": 4.921e-05, "loss": 0.0377, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 36884.0, "step": 80 }, { "epoch": 0.03150157507875394, "grad_norm": 4.870711803436279, "learning_rate": 4.911e-05, "loss": 0.0644, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 41583.0, "step": 90 }, { "epoch": 0.03500175008750438, "grad_norm": 0.014425868168473244, "learning_rate": 4.901e-05, "loss": 0.0432, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 46184.0, "step": 100 }, { "epoch": 0.038501925096254816, "grad_norm": 8.833477020263672, "learning_rate": 4.891e-05, "loss": 0.093, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 50776.0, "step": 110 }, { "epoch": 0.04200210010500525, "grad_norm": 0.19166199862957, "learning_rate": 4.881e-05, "loss": 0.1521, "mean_token_accuracy": 0.9649999976158142, "num_tokens": 55443.0, "step": 120 }, { "epoch": 0.04550227511375569, "grad_norm": 0.3452470302581787, "learning_rate": 4.871e-05, "loss": 0.0972, "mean_token_accuracy": 0.9699999928474426, "num_tokens": 60036.0, "step": 130 }, { "epoch": 0.049002450122506126, "grad_norm": 4.509720802307129, "learning_rate": 4.861e-05, "loss": 0.1143, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 64565.0, "step": 140 }, { "epoch": 0.052502625131256565, "grad_norm": 0.35976719856262207, "learning_rate": 4.851e-05, "loss": 0.032, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 69185.0, "step": 150 }, { "epoch": 0.056002800140007, "grad_norm": 5.863715648651123, "learning_rate": 4.841e-05, "loss": 0.0522, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 73698.0, "step": 160 }, { "epoch": 0.05950297514875744, "grad_norm": 0.1498999446630478, "learning_rate": 4.8309999999999997e-05, "loss": 0.1335, "mean_token_accuracy": 0.9699999928474426, "num_tokens": 78163.0, "step": 170 }, { "epoch": 0.06300315015750788, "grad_norm": 1.365043044090271, "learning_rate": 4.821e-05, "loss": 0.0818, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 82718.0, "step": 180 }, { "epoch": 0.06650332516625831, "grad_norm": 7.134900093078613, "learning_rate": 4.8110000000000005e-05, "loss": 0.0881, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 87209.0, "step": 190 }, { "epoch": 0.07000350017500875, "grad_norm": 1.4992774724960327, "learning_rate": 4.801e-05, "loss": 0.0612, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 91760.0, "step": 200 }, { "epoch": 0.07350367518375919, "grad_norm": 0.02836497873067856, "learning_rate": 4.791000000000001e-05, "loss": 0.0553, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 96423.0, "step": 210 }, { "epoch": 0.07700385019250963, "grad_norm": 0.020807797089219093, "learning_rate": 4.7810000000000005e-05, "loss": 0.0672, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 100968.0, "step": 220 }, { "epoch": 0.08050402520126006, "grad_norm": 0.03391553834080696, "learning_rate": 4.771e-05, "loss": 0.0326, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 105637.0, "step": 230 }, { "epoch": 0.0840042002100105, "grad_norm": 10.723307609558105, "learning_rate": 4.761000000000001e-05, "loss": 0.0897, "mean_token_accuracy": 0.9699999928474426, "num_tokens": 110219.0, "step": 240 }, { "epoch": 0.08750437521876094, "grad_norm": 1.3650544881820679, "learning_rate": 4.7510000000000004e-05, "loss": 0.0266, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 114796.0, "step": 250 }, { "epoch": 0.09100455022751137, "grad_norm": 0.07940108329057693, "learning_rate": 4.741e-05, "loss": 0.01, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 119315.0, "step": 260 }, { "epoch": 0.09450472523626181, "grad_norm": 0.05688886716961861, "learning_rate": 4.7310000000000006e-05, "loss": 0.0224, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 123868.0, "step": 270 }, { "epoch": 0.09800490024501225, "grad_norm": 0.00870002806186676, "learning_rate": 4.7210000000000004e-05, "loss": 0.012, "mean_token_accuracy": 1.0, "num_tokens": 128591.0, "step": 280 }, { "epoch": 0.10150507525376269, "grad_norm": 55.893104553222656, "learning_rate": 4.711e-05, "loss": 0.0297, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 133283.0, "step": 290 }, { "epoch": 0.10500525026251313, "grad_norm": 0.003983665257692337, "learning_rate": 4.7010000000000006e-05, "loss": 0.2061, "mean_token_accuracy": 0.9649999916553498, "num_tokens": 138008.0, "step": 300 }, { "epoch": 0.10850542527126357, "grad_norm": 7.029219627380371, "learning_rate": 4.691e-05, "loss": 0.0849, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 142566.0, "step": 310 }, { "epoch": 0.112005600280014, "grad_norm": 2.8149940967559814, "learning_rate": 4.681e-05, "loss": 0.0667, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 147213.0, "step": 320 }, { "epoch": 0.11550577528876443, "grad_norm": 2.192121982574463, "learning_rate": 4.6710000000000005e-05, "loss": 0.0638, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 151831.0, "step": 330 }, { "epoch": 0.11900595029751487, "grad_norm": 15.69092082977295, "learning_rate": 4.661e-05, "loss": 0.0256, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 156368.0, "step": 340 }, { "epoch": 0.12250612530626531, "grad_norm": 0.544373095035553, "learning_rate": 4.651e-05, "loss": 0.0368, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 160915.0, "step": 350 }, { "epoch": 0.12600630031501575, "grad_norm": 0.04070553556084633, "learning_rate": 4.6410000000000005e-05, "loss": 0.0729, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 165461.0, "step": 360 }, { "epoch": 0.1295064753237662, "grad_norm": 0.006062925793230534, "learning_rate": 4.631e-05, "loss": 0.0372, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 170199.0, "step": 370 }, { "epoch": 0.13300665033251663, "grad_norm": 0.04721317067742348, "learning_rate": 4.6210000000000006e-05, "loss": 0.0353, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 174899.0, "step": 380 }, { "epoch": 0.13650682534126707, "grad_norm": 0.01112948078662157, "learning_rate": 4.6110000000000004e-05, "loss": 0.0565, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 179419.0, "step": 390 }, { "epoch": 0.1400070003500175, "grad_norm": 3.867860794067383, "learning_rate": 4.601e-05, "loss": 0.0537, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 183987.0, "step": 400 }, { "epoch": 0.14350717535876795, "grad_norm": 10.329545974731445, "learning_rate": 4.5910000000000006e-05, "loss": 0.0888, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 188517.0, "step": 410 }, { "epoch": 0.14700735036751839, "grad_norm": 0.04144367575645447, "learning_rate": 4.5810000000000004e-05, "loss": 0.0723, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 193090.0, "step": 420 }, { "epoch": 0.15050752537626882, "grad_norm": 13.018311500549316, "learning_rate": 4.571e-05, "loss": 0.0799, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 197726.0, "step": 430 }, { "epoch": 0.15400770038501926, "grad_norm": 7.063663959503174, "learning_rate": 4.5610000000000005e-05, "loss": 0.0295, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 202376.0, "step": 440 }, { "epoch": 0.15750787539376968, "grad_norm": 8.98883056640625, "learning_rate": 4.551e-05, "loss": 0.0624, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 207026.0, "step": 450 }, { "epoch": 0.16100805040252011, "grad_norm": 13.842345237731934, "learning_rate": 4.541e-05, "loss": 0.0715, "mean_token_accuracy": 0.975, "num_tokens": 211776.0, "step": 460 }, { "epoch": 0.16450822541127055, "grad_norm": 9.97155475616455, "learning_rate": 4.5310000000000005e-05, "loss": 0.0661, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 216432.0, "step": 470 }, { "epoch": 0.168008400420021, "grad_norm": 7.468666076660156, "learning_rate": 4.521e-05, "loss": 0.0367, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 220948.0, "step": 480 }, { "epoch": 0.17150857542877143, "grad_norm": 4.366839408874512, "learning_rate": 4.511e-05, "loss": 0.0627, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 225605.0, "step": 490 }, { "epoch": 0.17500875043752187, "grad_norm": 1.5920456647872925, "learning_rate": 4.5010000000000004e-05, "loss": 0.0625, "step": 500 }, { "epoch": 0.17500875043752187, "eval_accuracy": 0.42072015161086546, "eval_f1": 0.35817992606791954, "eval_loss": 0.052172355353832245, "eval_mean_token_accuracy": 0.9854798049035699, "eval_num_tokens": 230290.0, "eval_precision": 0.4422081376879308, "eval_recall": 0.3790056922240466, "eval_runtime": 244.5865, "eval_samples_per_second": 6.472, "eval_steps_per_second": 0.81, "step": 500 }, { "epoch": 0.1785089254462723, "grad_norm": 0.04973801597952843, "learning_rate": 4.491e-05, "loss": 0.082, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 234977.0, "step": 510 }, { "epoch": 0.18200910045502275, "grad_norm": 0.0434698760509491, "learning_rate": 4.481e-05, "loss": 0.0479, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 239628.0, "step": 520 }, { "epoch": 0.1855092754637732, "grad_norm": 11.246657371520996, "learning_rate": 4.4710000000000004e-05, "loss": 0.0361, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 244331.0, "step": 530 }, { "epoch": 0.18900945047252363, "grad_norm": 2.8165736198425293, "learning_rate": 4.461e-05, "loss": 0.0622, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 249178.0, "step": 540 }, { "epoch": 0.19250962548127407, "grad_norm": 1.3719075918197632, "learning_rate": 4.451e-05, "loss": 0.0498, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 253815.0, "step": 550 }, { "epoch": 0.1960098004900245, "grad_norm": 0.8937302827835083, "learning_rate": 4.4410000000000003e-05, "loss": 0.0782, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 258433.0, "step": 560 }, { "epoch": 0.19950997549877494, "grad_norm": 0.01865805685520172, "learning_rate": 4.431e-05, "loss": 0.0096, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 263057.0, "step": 570 }, { "epoch": 0.20301015050752538, "grad_norm": 0.6028000712394714, "learning_rate": 4.421e-05, "loss": 0.0144, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 267618.0, "step": 580 }, { "epoch": 0.20651032551627582, "grad_norm": 0.013873261399567127, "learning_rate": 4.411e-05, "loss": 0.0576, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 272288.0, "step": 590 }, { "epoch": 0.21001050052502626, "grad_norm": 6.103112697601318, "learning_rate": 4.401e-05, "loss": 0.1198, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 276910.0, "step": 600 }, { "epoch": 0.2135106755337767, "grad_norm": 0.640934944152832, "learning_rate": 4.391e-05, "loss": 0.0378, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 281446.0, "step": 610 }, { "epoch": 0.21701085054252714, "grad_norm": 0.31448185443878174, "learning_rate": 4.381e-05, "loss": 0.0696, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 286106.0, "step": 620 }, { "epoch": 0.22051102555127755, "grad_norm": 0.03195786848664284, "learning_rate": 4.371e-05, "loss": 0.124, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 290743.0, "step": 630 }, { "epoch": 0.224011200560028, "grad_norm": 0.8114803433418274, "learning_rate": 4.361e-05, "loss": 0.1201, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 295315.0, "step": 640 }, { "epoch": 0.22751137556877843, "grad_norm": 0.16202567517757416, "learning_rate": 4.351e-05, "loss": 0.0553, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 299840.0, "step": 650 }, { "epoch": 0.23101155057752887, "grad_norm": 4.016778469085693, "learning_rate": 4.341e-05, "loss": 0.0692, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 304471.0, "step": 660 }, { "epoch": 0.2345117255862793, "grad_norm": 0.056026436388492584, "learning_rate": 4.3310000000000004e-05, "loss": 0.0395, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 309003.0, "step": 670 }, { "epoch": 0.23801190059502975, "grad_norm": 0.4657319188117981, "learning_rate": 4.321e-05, "loss": 0.0094, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 313702.0, "step": 680 }, { "epoch": 0.24151207560378019, "grad_norm": 21.3116397857666, "learning_rate": 4.311e-05, "loss": 0.0468, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 318425.0, "step": 690 }, { "epoch": 0.24501225061253062, "grad_norm": 0.024263957515358925, "learning_rate": 4.301e-05, "loss": 0.0576, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 323080.0, "step": 700 }, { "epoch": 0.24851242562128106, "grad_norm": 0.039419762790203094, "learning_rate": 4.291e-05, "loss": 0.0503, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 327692.0, "step": 710 }, { "epoch": 0.2520126006300315, "grad_norm": 0.06194750592112541, "learning_rate": 4.281e-05, "loss": 0.0379, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 332238.0, "step": 720 }, { "epoch": 0.25551277563878194, "grad_norm": 0.015114092268049717, "learning_rate": 4.271e-05, "loss": 0.0729, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 336826.0, "step": 730 }, { "epoch": 0.2590129506475324, "grad_norm": 0.05118599534034729, "learning_rate": 4.261e-05, "loss": 0.0773, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 341422.0, "step": 740 }, { "epoch": 0.2625131256562828, "grad_norm": 0.13388273119926453, "learning_rate": 4.251e-05, "loss": 0.07, "mean_token_accuracy": 0.9699999928474426, "num_tokens": 346078.0, "step": 750 }, { "epoch": 0.26601330066503326, "grad_norm": 0.043984536081552505, "learning_rate": 4.241e-05, "loss": 0.121, "mean_token_accuracy": 0.9649999916553498, "num_tokens": 350615.0, "step": 760 }, { "epoch": 0.2695134756737837, "grad_norm": 3.5789825916290283, "learning_rate": 4.231e-05, "loss": 0.0646, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 355229.0, "step": 770 }, { "epoch": 0.27301365068253414, "grad_norm": 0.15532948076725006, "learning_rate": 4.221e-05, "loss": 0.0809, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 359810.0, "step": 780 }, { "epoch": 0.2765138256912846, "grad_norm": 4.205129146575928, "learning_rate": 4.211e-05, "loss": 0.0478, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 364370.0, "step": 790 }, { "epoch": 0.280014000700035, "grad_norm": 0.06457880884408951, "learning_rate": 4.201e-05, "loss": 0.0381, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 368854.0, "step": 800 }, { "epoch": 0.28351417570878545, "grad_norm": 0.06110011041164398, "learning_rate": 4.191e-05, "loss": 0.0362, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 373402.0, "step": 810 }, { "epoch": 0.2870143507175359, "grad_norm": 0.6663037538528442, "learning_rate": 4.181000000000001e-05, "loss": 0.053, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 378067.0, "step": 820 }, { "epoch": 0.29051452572628633, "grad_norm": 0.019796814769506454, "learning_rate": 4.1710000000000006e-05, "loss": 0.0813, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 382674.0, "step": 830 }, { "epoch": 0.29401470073503677, "grad_norm": 6.284240245819092, "learning_rate": 4.161e-05, "loss": 0.1022, "mean_token_accuracy": 0.9599999904632568, "num_tokens": 387372.0, "step": 840 }, { "epoch": 0.2975148757437872, "grad_norm": 0.050411708652973175, "learning_rate": 4.151000000000001e-05, "loss": 0.0286, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 392048.0, "step": 850 }, { "epoch": 0.30101505075253765, "grad_norm": 0.13556945323944092, "learning_rate": 4.1410000000000005e-05, "loss": 0.056, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 396846.0, "step": 860 }, { "epoch": 0.3045152257612881, "grad_norm": 0.2066652923822403, "learning_rate": 4.131e-05, "loss": 0.0304, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 401427.0, "step": 870 }, { "epoch": 0.3080154007700385, "grad_norm": 0.21275383234024048, "learning_rate": 4.121000000000001e-05, "loss": 0.0971, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 406019.0, "step": 880 }, { "epoch": 0.31151557577878897, "grad_norm": 0.0494910404086113, "learning_rate": 4.1110000000000005e-05, "loss": 0.0054, "mean_token_accuracy": 1.0, "num_tokens": 410600.0, "step": 890 }, { "epoch": 0.31501575078753935, "grad_norm": 0.06328645348548889, "learning_rate": 4.101e-05, "loss": 0.0584, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 415202.0, "step": 900 }, { "epoch": 0.3185159257962898, "grad_norm": 0.011447213590145111, "learning_rate": 4.0910000000000006e-05, "loss": 0.0842, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 419881.0, "step": 910 }, { "epoch": 0.32201610080504023, "grad_norm": 0.11036702245473862, "learning_rate": 4.0810000000000004e-05, "loss": 0.023, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 424461.0, "step": 920 }, { "epoch": 0.32551627581379067, "grad_norm": 5.421338081359863, "learning_rate": 4.071e-05, "loss": 0.0802, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 429057.0, "step": 930 }, { "epoch": 0.3290164508225411, "grad_norm": 0.3822776675224304, "learning_rate": 4.0610000000000006e-05, "loss": 0.0204, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 433654.0, "step": 940 }, { "epoch": 0.33251662583129155, "grad_norm": 0.39122045040130615, "learning_rate": 4.0510000000000003e-05, "loss": 0.0318, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 438323.0, "step": 950 }, { "epoch": 0.336016800840042, "grad_norm": 0.16552428901195526, "learning_rate": 4.041e-05, "loss": 0.0472, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 442957.0, "step": 960 }, { "epoch": 0.3395169758487924, "grad_norm": 0.028619434684515, "learning_rate": 4.0310000000000005e-05, "loss": 0.0433, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 447752.0, "step": 970 }, { "epoch": 0.34301715085754286, "grad_norm": 7.8463053703308105, "learning_rate": 4.021e-05, "loss": 0.0688, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 452474.0, "step": 980 }, { "epoch": 0.3465173258662933, "grad_norm": 0.012101550586521626, "learning_rate": 4.011e-05, "loss": 0.0084, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 457143.0, "step": 990 }, { "epoch": 0.35001750087504374, "grad_norm": 0.013838861137628555, "learning_rate": 4.0010000000000005e-05, "loss": 0.0593, "step": 1000 }, { "epoch": 0.35001750087504374, "eval_accuracy": 0.441566645609602, "eval_f1": 0.3728091111256171, "eval_loss": 0.057350896298885345, "eval_mean_token_accuracy": 0.9861111180348829, "eval_num_tokens": 461707.0, "eval_precision": 0.4527947168630948, "eval_recall": 0.39396244890143234, "eval_runtime": 244.5385, "eval_samples_per_second": 6.473, "eval_steps_per_second": 0.81, "step": 1000 }, { "epoch": 0.3535176758837942, "grad_norm": 0.0760878473520279, "learning_rate": 3.991e-05, "loss": 0.1017, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 466311.0, "step": 1010 }, { "epoch": 0.3570178508925446, "grad_norm": 6.497073173522949, "learning_rate": 3.981e-05, "loss": 0.0353, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 470821.0, "step": 1020 }, { "epoch": 0.36051802590129506, "grad_norm": 8.943822860717773, "learning_rate": 3.9710000000000004e-05, "loss": 0.0868, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 475416.0, "step": 1030 }, { "epoch": 0.3640182009100455, "grad_norm": 0.10018932819366455, "learning_rate": 3.961e-05, "loss": 0.0314, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 480046.0, "step": 1040 }, { "epoch": 0.36751837591879594, "grad_norm": 0.058345384895801544, "learning_rate": 3.951e-05, "loss": 0.0198, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 484728.0, "step": 1050 }, { "epoch": 0.3710185509275464, "grad_norm": 0.059850409626960754, "learning_rate": 3.9410000000000004e-05, "loss": 0.0561, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 489334.0, "step": 1060 }, { "epoch": 0.3745187259362968, "grad_norm": 0.03875022009015083, "learning_rate": 3.931e-05, "loss": 0.0893, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 493949.0, "step": 1070 }, { "epoch": 0.37801890094504725, "grad_norm": 0.7719871997833252, "learning_rate": 3.921e-05, "loss": 0.0427, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 498518.0, "step": 1080 }, { "epoch": 0.3815190759537977, "grad_norm": 0.05535457283258438, "learning_rate": 3.911e-05, "loss": 0.0117, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 503111.0, "step": 1090 }, { "epoch": 0.38501925096254813, "grad_norm": 6.557998180389404, "learning_rate": 3.901e-05, "loss": 0.0805, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 507707.0, "step": 1100 }, { "epoch": 0.38851942597129857, "grad_norm": 0.6564468145370483, "learning_rate": 3.8910000000000005e-05, "loss": 0.0841, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 512285.0, "step": 1110 }, { "epoch": 0.392019600980049, "grad_norm": 8.401987075805664, "learning_rate": 3.881e-05, "loss": 0.0721, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 516852.0, "step": 1120 }, { "epoch": 0.39551977598879945, "grad_norm": 1.693769931793213, "learning_rate": 3.871e-05, "loss": 0.0807, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 521413.0, "step": 1130 }, { "epoch": 0.3990199509975499, "grad_norm": 3.10587739944458, "learning_rate": 3.8610000000000005e-05, "loss": 0.059, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 525992.0, "step": 1140 }, { "epoch": 0.4025201260063003, "grad_norm": 0.17380690574645996, "learning_rate": 3.851e-05, "loss": 0.0357, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 530679.0, "step": 1150 }, { "epoch": 0.40602030101505077, "grad_norm": 0.33141088485717773, "learning_rate": 3.841e-05, "loss": 0.0045, "mean_token_accuracy": 1.0, "num_tokens": 535218.0, "step": 1160 }, { "epoch": 0.4095204760238012, "grad_norm": 0.0494840107858181, "learning_rate": 3.8310000000000004e-05, "loss": 0.0594, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 539817.0, "step": 1170 }, { "epoch": 0.41302065103255164, "grad_norm": 0.013975823298096657, "learning_rate": 3.821e-05, "loss": 0.0022, "mean_token_accuracy": 1.0, "num_tokens": 544490.0, "step": 1180 }, { "epoch": 0.4165208260413021, "grad_norm": 0.09675566107034683, "learning_rate": 3.811e-05, "loss": 0.0019, "mean_token_accuracy": 1.0, "num_tokens": 549043.0, "step": 1190 }, { "epoch": 0.4200210010500525, "grad_norm": 0.00722131785005331, "learning_rate": 3.8010000000000004e-05, "loss": 0.0806, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 553578.0, "step": 1200 }, { "epoch": 0.42352117605880296, "grad_norm": 0.022663407027721405, "learning_rate": 3.791e-05, "loss": 0.0042, "mean_token_accuracy": 1.0, "num_tokens": 558226.0, "step": 1210 }, { "epoch": 0.4270213510675534, "grad_norm": 0.012322783470153809, "learning_rate": 3.781e-05, "loss": 0.007, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 562862.0, "step": 1220 }, { "epoch": 0.43052152607630384, "grad_norm": 0.016185415908694267, "learning_rate": 3.771e-05, "loss": 0.0943, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 567480.0, "step": 1230 }, { "epoch": 0.4340217010850543, "grad_norm": 0.0974903255701065, "learning_rate": 3.761e-05, "loss": 0.0773, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 572014.0, "step": 1240 }, { "epoch": 0.43752187609380466, "grad_norm": 0.028429092839360237, "learning_rate": 3.751e-05, "loss": 0.0779, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 576638.0, "step": 1250 }, { "epoch": 0.4410220511025551, "grad_norm": 2.4505090713500977, "learning_rate": 3.741e-05, "loss": 0.0153, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 581221.0, "step": 1260 }, { "epoch": 0.44452222611130554, "grad_norm": 0.11989375203847885, "learning_rate": 3.731e-05, "loss": 0.0748, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 585878.0, "step": 1270 }, { "epoch": 0.448022401120056, "grad_norm": 0.06575898826122284, "learning_rate": 3.721e-05, "loss": 0.0247, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 590474.0, "step": 1280 }, { "epoch": 0.4515225761288064, "grad_norm": 17.148649215698242, "learning_rate": 3.711e-05, "loss": 0.0701, "mean_token_accuracy": 0.9699999928474426, "num_tokens": 594994.0, "step": 1290 }, { "epoch": 0.45502275113755686, "grad_norm": 0.022335920482873917, "learning_rate": 3.701e-05, "loss": 0.0447, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 599605.0, "step": 1300 }, { "epoch": 0.4585229261463073, "grad_norm": 0.16378021240234375, "learning_rate": 3.691e-05, "loss": 0.0934, "mean_token_accuracy": 0.9699999928474426, "num_tokens": 604331.0, "step": 1310 }, { "epoch": 0.46202310115505774, "grad_norm": 4.628612995147705, "learning_rate": 3.681e-05, "loss": 0.1114, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 608859.0, "step": 1320 }, { "epoch": 0.4655232761638082, "grad_norm": 4.558804035186768, "learning_rate": 3.671e-05, "loss": 0.0527, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 613472.0, "step": 1330 }, { "epoch": 0.4690234511725586, "grad_norm": 7.380437850952148, "learning_rate": 3.661e-05, "loss": 0.0211, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 618003.0, "step": 1340 }, { "epoch": 0.47252362618130905, "grad_norm": 0.054671116173267365, "learning_rate": 3.651e-05, "loss": 0.0896, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 622602.0, "step": 1350 }, { "epoch": 0.4760238011900595, "grad_norm": 0.22701649367809296, "learning_rate": 3.641e-05, "loss": 0.0546, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 627312.0, "step": 1360 }, { "epoch": 0.47952397619880993, "grad_norm": 9.734682083129883, "learning_rate": 3.6309999999999996e-05, "loss": 0.0576, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 632167.0, "step": 1370 }, { "epoch": 0.48302415120756037, "grad_norm": 10.223374366760254, "learning_rate": 3.621e-05, "loss": 0.0569, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 636822.0, "step": 1380 }, { "epoch": 0.4865243262163108, "grad_norm": 0.22119201719760895, "learning_rate": 3.611e-05, "loss": 0.052, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 641324.0, "step": 1390 }, { "epoch": 0.49002450122506125, "grad_norm": 0.09743613004684448, "learning_rate": 3.601e-05, "loss": 0.0326, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 645990.0, "step": 1400 }, { "epoch": 0.4935246762338117, "grad_norm": 4.46646785736084, "learning_rate": 3.591e-05, "loss": 0.0845, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 650642.0, "step": 1410 }, { "epoch": 0.4970248512425621, "grad_norm": 0.3847590386867523, "learning_rate": 3.581e-05, "loss": 0.0276, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 655313.0, "step": 1420 }, { "epoch": 0.5005250262513126, "grad_norm": 3.9029712677001953, "learning_rate": 3.571e-05, "loss": 0.0479, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 659845.0, "step": 1430 }, { "epoch": 0.504025201260063, "grad_norm": 5.140905380249023, "learning_rate": 3.5610000000000006e-05, "loss": 0.0274, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 664526.0, "step": 1440 }, { "epoch": 0.5075253762688134, "grad_norm": 0.0748833566904068, "learning_rate": 3.5510000000000004e-05, "loss": 0.112, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 669184.0, "step": 1450 }, { "epoch": 0.5110255512775639, "grad_norm": 0.06513810157775879, "learning_rate": 3.541e-05, "loss": 0.0294, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 673829.0, "step": 1460 }, { "epoch": 0.5145257262863143, "grad_norm": 0.14687716960906982, "learning_rate": 3.5310000000000006e-05, "loss": 0.039, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 678451.0, "step": 1470 }, { "epoch": 0.5180259012950648, "grad_norm": 0.04928717017173767, "learning_rate": 3.5210000000000003e-05, "loss": 0.0881, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 682982.0, "step": 1480 }, { "epoch": 0.5215260763038152, "grad_norm": 0.05730545148253441, "learning_rate": 3.511e-05, "loss": 0.0094, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 687510.0, "step": 1490 }, { "epoch": 0.5250262513125656, "grad_norm": 0.024362344294786453, "learning_rate": 3.5010000000000005e-05, "loss": 0.0119, "step": 1500 }, { "epoch": 0.5250262513125656, "eval_accuracy": 0.5596967782691092, "eval_f1": 0.4608174078043728, "eval_loss": 0.047076478600502014, "eval_mean_token_accuracy": 0.9872474811895929, "eval_num_tokens": 692119.0, "eval_precision": 0.5081716761653752, "eval_recall": 0.49738319415052024, "eval_runtime": 244.3841, "eval_samples_per_second": 6.478, "eval_steps_per_second": 0.81, "step": 1500 }, { "epoch": 0.5285264263213161, "grad_norm": 0.03228575736284256, "learning_rate": 3.491e-05, "loss": 0.0282, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 696712.0, "step": 1510 }, { "epoch": 0.5320266013300665, "grad_norm": 15.16336441040039, "learning_rate": 3.481e-05, "loss": 0.0758, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 701390.0, "step": 1520 }, { "epoch": 0.535526776338817, "grad_norm": 19.84299087524414, "learning_rate": 3.4710000000000005e-05, "loss": 0.0911, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 705965.0, "step": 1530 }, { "epoch": 0.5390269513475674, "grad_norm": 0.051229000091552734, "learning_rate": 3.461e-05, "loss": 0.0982, "mean_token_accuracy": 0.9599999904632568, "num_tokens": 710601.0, "step": 1540 }, { "epoch": 0.5425271263563178, "grad_norm": 6.5445756912231445, "learning_rate": 3.451000000000001e-05, "loss": 0.037, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 715211.0, "step": 1550 }, { "epoch": 0.5460273013650683, "grad_norm": 4.0851874351501465, "learning_rate": 3.4410000000000004e-05, "loss": 0.0403, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 719818.0, "step": 1560 }, { "epoch": 0.5495274763738187, "grad_norm": 0.20048797130584717, "learning_rate": 3.431e-05, "loss": 0.0598, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 724418.0, "step": 1570 }, { "epoch": 0.5530276513825692, "grad_norm": 8.350198745727539, "learning_rate": 3.4210000000000006e-05, "loss": 0.0261, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 729079.0, "step": 1580 }, { "epoch": 0.5565278263913196, "grad_norm": 7.64754056930542, "learning_rate": 3.4110000000000004e-05, "loss": 0.0205, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 733776.0, "step": 1590 }, { "epoch": 0.56002800140007, "grad_norm": 11.657675743103027, "learning_rate": 3.401e-05, "loss": 0.0254, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 738301.0, "step": 1600 }, { "epoch": 0.5635281764088205, "grad_norm": 0.044835012406110764, "learning_rate": 3.3910000000000006e-05, "loss": 0.0675, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 743020.0, "step": 1610 }, { "epoch": 0.5670283514175709, "grad_norm": 0.08898824453353882, "learning_rate": 3.381e-05, "loss": 0.0438, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 747522.0, "step": 1620 }, { "epoch": 0.5705285264263213, "grad_norm": 19.048906326293945, "learning_rate": 3.371e-05, "loss": 0.0462, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 752073.0, "step": 1630 }, { "epoch": 0.5740287014350718, "grad_norm": 5.376831531524658, "learning_rate": 3.3610000000000005e-05, "loss": 0.0067, "mean_token_accuracy": 1.0, "num_tokens": 756668.0, "step": 1640 }, { "epoch": 0.5775288764438222, "grad_norm": 0.003997461870312691, "learning_rate": 3.351e-05, "loss": 0.0093, "mean_token_accuracy": 1.0, "num_tokens": 761349.0, "step": 1650 }, { "epoch": 0.5810290514525727, "grad_norm": 1.1141142845153809, "learning_rate": 3.341e-05, "loss": 0.0453, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 766087.0, "step": 1660 }, { "epoch": 0.5845292264613231, "grad_norm": 0.09356174618005753, "learning_rate": 3.3310000000000005e-05, "loss": 0.0415, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 770676.0, "step": 1670 }, { "epoch": 0.5880294014700735, "grad_norm": 16.47395133972168, "learning_rate": 3.321e-05, "loss": 0.0726, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 775287.0, "step": 1680 }, { "epoch": 0.591529576478824, "grad_norm": 1.1543943881988525, "learning_rate": 3.311e-05, "loss": 0.0262, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 779907.0, "step": 1690 }, { "epoch": 0.5950297514875744, "grad_norm": 0.6417059898376465, "learning_rate": 3.3010000000000004e-05, "loss": 0.1458, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 784527.0, "step": 1700 }, { "epoch": 0.5985299264963249, "grad_norm": 0.03735469654202461, "learning_rate": 3.291e-05, "loss": 0.0517, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 789037.0, "step": 1710 }, { "epoch": 0.6020301015050753, "grad_norm": 7.025692462921143, "learning_rate": 3.281e-05, "loss": 0.0912, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 793581.0, "step": 1720 }, { "epoch": 0.6055302765138257, "grad_norm": 0.046697914600372314, "learning_rate": 3.2710000000000004e-05, "loss": 0.0516, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 798206.0, "step": 1730 }, { "epoch": 0.6090304515225762, "grad_norm": 1.4056965112686157, "learning_rate": 3.261e-05, "loss": 0.0658, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 802791.0, "step": 1740 }, { "epoch": 0.6125306265313266, "grad_norm": 15.819257736206055, "learning_rate": 3.251e-05, "loss": 0.0332, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 807423.0, "step": 1750 }, { "epoch": 0.616030801540077, "grad_norm": 0.15242606401443481, "learning_rate": 3.241e-05, "loss": 0.0116, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 812014.0, "step": 1760 }, { "epoch": 0.6195309765488275, "grad_norm": 0.8969595432281494, "learning_rate": 3.231e-05, "loss": 0.0697, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 816484.0, "step": 1770 }, { "epoch": 0.6230311515575779, "grad_norm": 13.24059772491455, "learning_rate": 3.221e-05, "loss": 0.0475, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 821141.0, "step": 1780 }, { "epoch": 0.6265313265663283, "grad_norm": 0.0862284004688263, "learning_rate": 3.211e-05, "loss": 0.0133, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 825825.0, "step": 1790 }, { "epoch": 0.6300315015750787, "grad_norm": 6.188477993011475, "learning_rate": 3.201e-05, "loss": 0.0941, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 830440.0, "step": 1800 }, { "epoch": 0.6335316765838291, "grad_norm": 0.047075141221284866, "learning_rate": 3.191e-05, "loss": 0.0152, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 835064.0, "step": 1810 }, { "epoch": 0.6370318515925796, "grad_norm": 8.754451751708984, "learning_rate": 3.181e-05, "loss": 0.034, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 839606.0, "step": 1820 }, { "epoch": 0.64053202660133, "grad_norm": 0.6907691955566406, "learning_rate": 3.171e-05, "loss": 0.0215, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 844205.0, "step": 1830 }, { "epoch": 0.6440322016100805, "grad_norm": 0.06890915334224701, "learning_rate": 3.1610000000000004e-05, "loss": 0.0804, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 848857.0, "step": 1840 }, { "epoch": 0.6475323766188309, "grad_norm": 0.04362496733665466, "learning_rate": 3.151e-05, "loss": 0.0015, "mean_token_accuracy": 1.0, "num_tokens": 853505.0, "step": 1850 }, { "epoch": 0.6510325516275813, "grad_norm": 0.032738834619522095, "learning_rate": 3.141e-05, "loss": 0.0738, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 857949.0, "step": 1860 }, { "epoch": 0.6545327266363318, "grad_norm": 0.0720139741897583, "learning_rate": 3.1310000000000003e-05, "loss": 0.0198, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 862481.0, "step": 1870 }, { "epoch": 0.6580329016450822, "grad_norm": 0.3373511731624603, "learning_rate": 3.121e-05, "loss": 0.0232, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 867070.0, "step": 1880 }, { "epoch": 0.6615330766538327, "grad_norm": 0.03332596644759178, "learning_rate": 3.111e-05, "loss": 0.0255, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 871710.0, "step": 1890 }, { "epoch": 0.6650332516625831, "grad_norm": 0.02673097886145115, "learning_rate": 3.101e-05, "loss": 0.1023, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 876374.0, "step": 1900 }, { "epoch": 0.6685334266713335, "grad_norm": 29.00749969482422, "learning_rate": 3.091e-05, "loss": 0.0775, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 880967.0, "step": 1910 }, { "epoch": 0.672033601680084, "grad_norm": 0.013920117169618607, "learning_rate": 3.081e-05, "loss": 0.0095, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 885539.0, "step": 1920 }, { "epoch": 0.6755337766888344, "grad_norm": 0.004398212768137455, "learning_rate": 3.071e-05, "loss": 0.0107, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 890118.0, "step": 1930 }, { "epoch": 0.6790339516975848, "grad_norm": 0.11914502084255219, "learning_rate": 3.061e-05, "loss": 0.0353, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 894652.0, "step": 1940 }, { "epoch": 0.6825341267063353, "grad_norm": 0.06763932853937149, "learning_rate": 3.051e-05, "loss": 0.0155, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 899187.0, "step": 1950 }, { "epoch": 0.6860343017150857, "grad_norm": 0.03659069910645485, "learning_rate": 3.041e-05, "loss": 0.0309, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 903727.0, "step": 1960 }, { "epoch": 0.6895344767238362, "grad_norm": 5.335174083709717, "learning_rate": 3.031e-05, "loss": 0.1113, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 908334.0, "step": 1970 }, { "epoch": 0.6930346517325866, "grad_norm": 2.4410805702209473, "learning_rate": 3.021e-05, "loss": 0.0128, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 913065.0, "step": 1980 }, { "epoch": 0.696534826741337, "grad_norm": 0.05332425609230995, "learning_rate": 3.0109999999999998e-05, "loss": 0.0443, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 917618.0, "step": 1990 }, { "epoch": 0.7000350017500875, "grad_norm": 0.012657753191888332, "learning_rate": 3.001e-05, "loss": 0.0267, "step": 2000 }, { "epoch": 0.7000350017500875, "eval_accuracy": 0.5710675931775111, "eval_f1": 0.4674227263281785, "eval_loss": 0.052520181983709335, "eval_mean_token_accuracy": 0.9876262681050734, "eval_num_tokens": 922286.0, "eval_precision": 0.5114304763470895, "eval_recall": 0.5012130900032767, "eval_runtime": 244.1164, "eval_samples_per_second": 6.485, "eval_steps_per_second": 0.811, "step": 2000 }, { "epoch": 0.7035351767588379, "grad_norm": 5.11520528793335, "learning_rate": 2.991e-05, "loss": 0.0078, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 926980.0, "step": 2010 }, { "epoch": 0.7070353517675884, "grad_norm": 0.03029199317097664, "learning_rate": 2.9809999999999997e-05, "loss": 0.0963, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 931724.0, "step": 2020 }, { "epoch": 0.7105355267763388, "grad_norm": 0.5081428289413452, "learning_rate": 2.971e-05, "loss": 0.0361, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 936266.0, "step": 2030 }, { "epoch": 0.7140357017850892, "grad_norm": 0.04822823032736778, "learning_rate": 2.961e-05, "loss": 0.1017, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 940886.0, "step": 2040 }, { "epoch": 0.7175358767938397, "grad_norm": 0.2854156494140625, "learning_rate": 2.951e-05, "loss": 0.0186, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 945433.0, "step": 2050 }, { "epoch": 0.7210360518025901, "grad_norm": 0.3434739112854004, "learning_rate": 2.9409999999999998e-05, "loss": 0.0587, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 949930.0, "step": 2060 }, { "epoch": 0.7245362268113406, "grad_norm": 0.03626574948430061, "learning_rate": 2.9310000000000006e-05, "loss": 0.1057, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 954492.0, "step": 2070 }, { "epoch": 0.728036401820091, "grad_norm": 11.993911743164062, "learning_rate": 2.9210000000000003e-05, "loss": 0.1327, "mean_token_accuracy": 0.9699999928474426, "num_tokens": 958986.0, "step": 2080 }, { "epoch": 0.7315365768288414, "grad_norm": 0.07597003877162933, "learning_rate": 2.9110000000000004e-05, "loss": 0.018, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 963614.0, "step": 2090 }, { "epoch": 0.7350367518375919, "grad_norm": 10.063232421875, "learning_rate": 2.9010000000000005e-05, "loss": 0.1227, "mean_token_accuracy": 0.9699999928474426, "num_tokens": 968140.0, "step": 2100 }, { "epoch": 0.7385369268463423, "grad_norm": 0.1807040572166443, "learning_rate": 2.8910000000000003e-05, "loss": 0.0323, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 972835.0, "step": 2110 }, { "epoch": 0.7420371018550928, "grad_norm": 0.17890332639217377, "learning_rate": 2.8810000000000004e-05, "loss": 0.0207, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 977445.0, "step": 2120 }, { "epoch": 0.7455372768638432, "grad_norm": 9.020623207092285, "learning_rate": 2.8710000000000005e-05, "loss": 0.0899, "mean_token_accuracy": 0.9699999928474426, "num_tokens": 982140.0, "step": 2130 }, { "epoch": 0.7490374518725936, "grad_norm": 3.116069793701172, "learning_rate": 2.8610000000000002e-05, "loss": 0.0603, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 986817.0, "step": 2140 }, { "epoch": 0.7525376268813441, "grad_norm": 14.557893753051758, "learning_rate": 2.8510000000000003e-05, "loss": 0.0326, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 991362.0, "step": 2150 }, { "epoch": 0.7560378018900945, "grad_norm": 5.8639140129089355, "learning_rate": 2.8410000000000004e-05, "loss": 0.0533, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 995958.0, "step": 2160 }, { "epoch": 0.759537976898845, "grad_norm": 0.08902487903833389, "learning_rate": 2.8310000000000002e-05, "loss": 0.0089, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1000561.0, "step": 2170 }, { "epoch": 0.7630381519075954, "grad_norm": 0.021990323439240456, "learning_rate": 2.8210000000000003e-05, "loss": 0.0398, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1005102.0, "step": 2180 }, { "epoch": 0.7665383269163458, "grad_norm": 0.0434272363781929, "learning_rate": 2.8110000000000004e-05, "loss": 0.0367, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1009721.0, "step": 2190 }, { "epoch": 0.7700385019250963, "grad_norm": 7.773507595062256, "learning_rate": 2.8010000000000005e-05, "loss": 0.0529, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1014234.0, "step": 2200 }, { "epoch": 0.7735386769338467, "grad_norm": 11.276909828186035, "learning_rate": 2.7910000000000002e-05, "loss": 0.0828, "mean_token_accuracy": 0.9699999928474426, "num_tokens": 1018943.0, "step": 2210 }, { "epoch": 0.7770388519425971, "grad_norm": 0.2111329585313797, "learning_rate": 2.7810000000000003e-05, "loss": 0.0168, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1023659.0, "step": 2220 }, { "epoch": 0.7805390269513476, "grad_norm": 0.09295608103275299, "learning_rate": 2.7710000000000004e-05, "loss": 0.0426, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1028221.0, "step": 2230 }, { "epoch": 0.784039201960098, "grad_norm": 0.05695830285549164, "learning_rate": 2.761e-05, "loss": 0.0321, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1032825.0, "step": 2240 }, { "epoch": 0.7875393769688485, "grad_norm": 0.03428833931684494, "learning_rate": 2.7510000000000003e-05, "loss": 0.1096, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 1037389.0, "step": 2250 }, { "epoch": 0.7910395519775989, "grad_norm": 0.052995167672634125, "learning_rate": 2.7410000000000004e-05, "loss": 0.0808, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 1041941.0, "step": 2260 }, { "epoch": 0.7945397269863493, "grad_norm": 0.1979517787694931, "learning_rate": 2.731e-05, "loss": 0.0145, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1046451.0, "step": 2270 }, { "epoch": 0.7980399019950998, "grad_norm": 0.024557696655392647, "learning_rate": 2.7210000000000002e-05, "loss": 0.0534, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 1051071.0, "step": 2280 }, { "epoch": 0.8015400770038502, "grad_norm": 7.660386085510254, "learning_rate": 2.7110000000000003e-05, "loss": 0.1042, "mean_token_accuracy": 0.9649999916553498, "num_tokens": 1055716.0, "step": 2290 }, { "epoch": 0.8050402520126007, "grad_norm": 3.119615316390991, "learning_rate": 2.701e-05, "loss": 0.061, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1060266.0, "step": 2300 }, { "epoch": 0.8085404270213511, "grad_norm": 6.7030158042907715, "learning_rate": 2.691e-05, "loss": 0.0265, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1064858.0, "step": 2310 }, { "epoch": 0.8120406020301015, "grad_norm": 0.08051805198192596, "learning_rate": 2.6810000000000003e-05, "loss": 0.0091, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1069403.0, "step": 2320 }, { "epoch": 0.815540777038852, "grad_norm": 0.08621969074010849, "learning_rate": 2.671e-05, "loss": 0.0043, "mean_token_accuracy": 1.0, "num_tokens": 1074041.0, "step": 2330 }, { "epoch": 0.8190409520476024, "grad_norm": 7.230138778686523, "learning_rate": 2.661e-05, "loss": 0.0891, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 1078603.0, "step": 2340 }, { "epoch": 0.8225411270563528, "grad_norm": 1.925933837890625, "learning_rate": 2.6510000000000002e-05, "loss": 0.1072, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 1083208.0, "step": 2350 }, { "epoch": 0.8260413020651033, "grad_norm": 0.06855742633342743, "learning_rate": 2.6410000000000003e-05, "loss": 0.0987, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 1087939.0, "step": 2360 }, { "epoch": 0.8295414770738537, "grad_norm": 4.232824802398682, "learning_rate": 2.631e-05, "loss": 0.0537, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1092618.0, "step": 2370 }, { "epoch": 0.8330416520826042, "grad_norm": 0.054919663816690445, "learning_rate": 2.621e-05, "loss": 0.0309, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1097232.0, "step": 2380 }, { "epoch": 0.8365418270913546, "grad_norm": 8.129829406738281, "learning_rate": 2.6110000000000002e-05, "loss": 0.0761, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1101883.0, "step": 2390 }, { "epoch": 0.840042002100105, "grad_norm": 7.850025653839111, "learning_rate": 2.601e-05, "loss": 0.0357, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1106608.0, "step": 2400 }, { "epoch": 0.8435421771088555, "grad_norm": 0.504169762134552, "learning_rate": 2.591e-05, "loss": 0.0419, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1111214.0, "step": 2410 }, { "epoch": 0.8470423521176059, "grad_norm": 0.02623009867966175, "learning_rate": 2.5810000000000002e-05, "loss": 0.0353, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1115761.0, "step": 2420 }, { "epoch": 0.8505425271263564, "grad_norm": 0.2593607008457184, "learning_rate": 2.571e-05, "loss": 0.0757, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 1120497.0, "step": 2430 }, { "epoch": 0.8540427021351068, "grad_norm": 0.09586932510137558, "learning_rate": 2.561e-05, "loss": 0.0709, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 1125050.0, "step": 2440 }, { "epoch": 0.8575428771438572, "grad_norm": 0.03755811229348183, "learning_rate": 2.551e-05, "loss": 0.0462, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1129583.0, "step": 2450 }, { "epoch": 0.8610430521526077, "grad_norm": 0.01429970283061266, "learning_rate": 2.541e-05, "loss": 0.0176, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1134231.0, "step": 2460 }, { "epoch": 0.8645432271613581, "grad_norm": 0.1092047318816185, "learning_rate": 2.531e-05, "loss": 0.0348, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1138841.0, "step": 2470 }, { "epoch": 0.8680434021701086, "grad_norm": 0.04420563951134682, "learning_rate": 2.521e-05, "loss": 0.0602, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1143627.0, "step": 2480 }, { "epoch": 0.871543577178859, "grad_norm": 0.056809134781360626, "learning_rate": 2.5110000000000002e-05, "loss": 0.0113, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1148140.0, "step": 2490 }, { "epoch": 0.8750437521876093, "grad_norm": 0.39837557077407837, "learning_rate": 2.501e-05, "loss": 0.0467, "step": 2500 }, { "epoch": 0.8750437521876093, "eval_accuracy": 0.44535691724573595, "eval_f1": 0.3790299221602013, "eval_loss": 0.05089215189218521, "eval_mean_token_accuracy": 0.9880050568267552, "eval_num_tokens": 1152749.0, "eval_precision": 0.454011773226288, "eval_recall": 0.4125268353595752, "eval_runtime": 244.5484, "eval_samples_per_second": 6.473, "eval_steps_per_second": 0.81, "step": 2500 }, { "epoch": 0.8785439271963598, "grad_norm": 0.028136901557445526, "learning_rate": 2.491e-05, "loss": 0.016, "mean_token_accuracy": 0.9874999970197678, "num_tokens": 1157289.0, "step": 2510 }, { "epoch": 0.8820441022051102, "grad_norm": 8.768643379211426, "learning_rate": 2.481e-05, "loss": 0.0547, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 1161827.0, "step": 2520 }, { "epoch": 0.8855442772138606, "grad_norm": 4.318042755126953, "learning_rate": 2.471e-05, "loss": 0.0548, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1166441.0, "step": 2530 }, { "epoch": 0.8890444522226111, "grad_norm": 0.032455261796712875, "learning_rate": 2.4610000000000003e-05, "loss": 0.0342, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1171123.0, "step": 2540 }, { "epoch": 0.8925446272313615, "grad_norm": 1.8352007865905762, "learning_rate": 2.451e-05, "loss": 0.0399, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1175686.0, "step": 2550 }, { "epoch": 0.896044802240112, "grad_norm": 0.11914759129285812, "learning_rate": 2.4410000000000002e-05, "loss": 0.089, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 1180208.0, "step": 2560 }, { "epoch": 0.8995449772488624, "grad_norm": 0.09686534851789474, "learning_rate": 2.4310000000000003e-05, "loss": 0.0889, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1184783.0, "step": 2570 }, { "epoch": 0.9030451522576128, "grad_norm": 0.06705299764871597, "learning_rate": 2.4210000000000004e-05, "loss": 0.0801, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1189400.0, "step": 2580 }, { "epoch": 0.9065453272663633, "grad_norm": 0.04434126242995262, "learning_rate": 2.411e-05, "loss": 0.0036, "mean_token_accuracy": 1.0, "num_tokens": 1194035.0, "step": 2590 }, { "epoch": 0.9100455022751137, "grad_norm": 0.03630208596587181, "learning_rate": 2.4010000000000002e-05, "loss": 0.0326, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1198629.0, "step": 2600 }, { "epoch": 0.9135456772838642, "grad_norm": 0.031141789630055428, "learning_rate": 2.3910000000000003e-05, "loss": 0.0436, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1203254.0, "step": 2610 }, { "epoch": 0.9170458522926146, "grad_norm": 0.33005213737487793, "learning_rate": 2.381e-05, "loss": 0.066, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 1207907.0, "step": 2620 }, { "epoch": 0.920546027301365, "grad_norm": 8.107050895690918, "learning_rate": 2.371e-05, "loss": 0.0418, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1212537.0, "step": 2630 }, { "epoch": 0.9240462023101155, "grad_norm": 0.28287169337272644, "learning_rate": 2.3610000000000003e-05, "loss": 0.0683, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1217120.0, "step": 2640 }, { "epoch": 0.9275463773188659, "grad_norm": 0.03412451222538948, "learning_rate": 2.351e-05, "loss": 0.0101, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1221671.0, "step": 2650 }, { "epoch": 0.9310465523276164, "grad_norm": 6.6741814613342285, "learning_rate": 2.341e-05, "loss": 0.085, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 1226356.0, "step": 2660 }, { "epoch": 0.9345467273363668, "grad_norm": 4.757784366607666, "learning_rate": 2.3310000000000002e-05, "loss": 0.1041, "mean_token_accuracy": 0.9649999916553498, "num_tokens": 1230966.0, "step": 2670 }, { "epoch": 0.9380469023451172, "grad_norm": 0.1483946591615677, "learning_rate": 2.321e-05, "loss": 0.0043, "mean_token_accuracy": 1.0, "num_tokens": 1235570.0, "step": 2680 }, { "epoch": 0.9415470773538677, "grad_norm": 4.1783976554870605, "learning_rate": 2.311e-05, "loss": 0.0297, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1240120.0, "step": 2690 }, { "epoch": 0.9450472523626181, "grad_norm": 0.006953865755349398, "learning_rate": 2.301e-05, "loss": 0.0036, "mean_token_accuracy": 1.0, "num_tokens": 1244744.0, "step": 2700 }, { "epoch": 0.9485474273713685, "grad_norm": 7.843381881713867, "learning_rate": 2.2910000000000003e-05, "loss": 0.0747, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1249304.0, "step": 2710 }, { "epoch": 0.952047602380119, "grad_norm": 2.691250801086426, "learning_rate": 2.281e-05, "loss": 0.0341, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1253884.0, "step": 2720 }, { "epoch": 0.9555477773888694, "grad_norm": 0.048404838889837265, "learning_rate": 2.271e-05, "loss": 0.0236, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1258643.0, "step": 2730 }, { "epoch": 0.9590479523976199, "grad_norm": 0.10087752342224121, "learning_rate": 2.2610000000000002e-05, "loss": 0.0454, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1263197.0, "step": 2740 }, { "epoch": 0.9625481274063703, "grad_norm": 0.66507887840271, "learning_rate": 2.251e-05, "loss": 0.0573, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1267826.0, "step": 2750 }, { "epoch": 0.9660483024151207, "grad_norm": 0.0337546281516552, "learning_rate": 2.241e-05, "loss": 0.0534, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1272318.0, "step": 2760 }, { "epoch": 0.9695484774238712, "grad_norm": 0.022819435223937035, "learning_rate": 2.231e-05, "loss": 0.0262, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1276943.0, "step": 2770 }, { "epoch": 0.9730486524326216, "grad_norm": 0.023641835898160934, "learning_rate": 2.221e-05, "loss": 0.0152, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1281549.0, "step": 2780 }, { "epoch": 0.9765488274413721, "grad_norm": 3.202338695526123, "learning_rate": 2.211e-05, "loss": 0.0755, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 1286053.0, "step": 2790 }, { "epoch": 0.9800490024501225, "grad_norm": 0.052319396287202835, "learning_rate": 2.201e-05, "loss": 0.0376, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1290719.0, "step": 2800 }, { "epoch": 0.9835491774588729, "grad_norm": 10.303972244262695, "learning_rate": 2.191e-05, "loss": 0.0052, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1295220.0, "step": 2810 }, { "epoch": 0.9870493524676234, "grad_norm": 0.11650484800338745, "learning_rate": 2.181e-05, "loss": 0.0036, "mean_token_accuracy": 1.0, "num_tokens": 1299736.0, "step": 2820 }, { "epoch": 0.9905495274763738, "grad_norm": 0.016379429027438164, "learning_rate": 2.171e-05, "loss": 0.0403, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1304307.0, "step": 2830 }, { "epoch": 0.9940497024851243, "grad_norm": 0.07140190899372101, "learning_rate": 2.1609999999999998e-05, "loss": 0.0358, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1308761.0, "step": 2840 }, { "epoch": 0.9975498774938747, "grad_norm": 0.3014475405216217, "learning_rate": 2.1510000000000002e-05, "loss": 0.0319, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1313336.0, "step": 2850 }, { "epoch": 1.0010500525026251, "grad_norm": 0.06447609513998032, "learning_rate": 2.1410000000000003e-05, "loss": 0.0348, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1317756.0, "step": 2860 }, { "epoch": 1.0045502275113756, "grad_norm": 0.02268841676414013, "learning_rate": 2.131e-05, "loss": 0.0218, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1322331.0, "step": 2870 }, { "epoch": 1.008050402520126, "grad_norm": 0.042231637984514236, "learning_rate": 2.1210000000000002e-05, "loss": 0.0023, "mean_token_accuracy": 1.0, "num_tokens": 1326941.0, "step": 2880 }, { "epoch": 1.0115505775288764, "grad_norm": 5.811006546020508, "learning_rate": 2.1110000000000003e-05, "loss": 0.0024, "mean_token_accuracy": 1.0, "num_tokens": 1331630.0, "step": 2890 }, { "epoch": 1.0150507525376269, "grad_norm": 0.0456203818321228, "learning_rate": 2.101e-05, "loss": 0.0011, "mean_token_accuracy": 1.0, "num_tokens": 1336316.0, "step": 2900 }, { "epoch": 1.0185509275463773, "grad_norm": 0.2723388671875, "learning_rate": 2.091e-05, "loss": 0.0114, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1341005.0, "step": 2910 }, { "epoch": 1.0220511025551278, "grad_norm": 0.0164664164185524, "learning_rate": 2.0810000000000002e-05, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 1345651.0, "step": 2920 }, { "epoch": 1.0255512775638782, "grad_norm": 0.38593819737434387, "learning_rate": 2.0710000000000003e-05, "loss": 0.0395, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1350333.0, "step": 2930 }, { "epoch": 1.0290514525726286, "grad_norm": 17.3580379486084, "learning_rate": 2.061e-05, "loss": 0.103, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 1354894.0, "step": 2940 }, { "epoch": 1.032551627581379, "grad_norm": 0.019541358575224876, "learning_rate": 2.0510000000000002e-05, "loss": 0.0496, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 1359645.0, "step": 2950 }, { "epoch": 1.0360518025901295, "grad_norm": 1.1783517599105835, "learning_rate": 2.0410000000000003e-05, "loss": 0.06, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1364197.0, "step": 2960 }, { "epoch": 1.03955197759888, "grad_norm": 0.03887060657143593, "learning_rate": 2.031e-05, "loss": 0.0118, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1368857.0, "step": 2970 }, { "epoch": 1.0430521526076304, "grad_norm": 0.11780918389558792, "learning_rate": 2.021e-05, "loss": 0.0336, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1373483.0, "step": 2980 }, { "epoch": 1.0465523276163808, "grad_norm": 0.06139334291219711, "learning_rate": 2.0110000000000002e-05, "loss": 0.0924, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 1378101.0, "step": 2990 }, { "epoch": 1.0500525026251313, "grad_norm": 0.0635937973856926, "learning_rate": 2.001e-05, "loss": 0.0472, "step": 3000 }, { "epoch": 1.0500525026251313, "eval_accuracy": 0.44662034112444726, "eval_f1": 0.3874674882536518, "eval_loss": 0.041092198342084885, "eval_mean_token_accuracy": 0.9890151577766495, "eval_num_tokens": 1382635.0, "eval_precision": 0.4642550079051411, "eval_recall": 0.4463958399837153, "eval_runtime": 243.8353, "eval_samples_per_second": 6.492, "eval_steps_per_second": 0.812, "step": 3000 }, { "epoch": 1.0535526776338817, "grad_norm": 0.05304880812764168, "learning_rate": 1.991e-05, "loss": 0.0065, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1387247.0, "step": 3010 }, { "epoch": 1.0570528526426322, "grad_norm": 0.09396978467702866, "learning_rate": 1.9810000000000002e-05, "loss": 0.03, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1391905.0, "step": 3020 }, { "epoch": 1.0605530276513826, "grad_norm": 0.02283914014697075, "learning_rate": 1.971e-05, "loss": 0.0412, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1396551.0, "step": 3030 }, { "epoch": 1.064053202660133, "grad_norm": 0.14994072914123535, "learning_rate": 1.961e-05, "loss": 0.0387, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1401184.0, "step": 3040 }, { "epoch": 1.0675533776688835, "grad_norm": 0.04891595244407654, "learning_rate": 1.951e-05, "loss": 0.04, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1405794.0, "step": 3050 }, { "epoch": 1.071053552677634, "grad_norm": 8.5429105758667, "learning_rate": 1.941e-05, "loss": 0.0338, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1410352.0, "step": 3060 }, { "epoch": 1.0745537276863844, "grad_norm": 6.096926212310791, "learning_rate": 1.931e-05, "loss": 0.0823, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 1414922.0, "step": 3070 }, { "epoch": 1.0780539026951348, "grad_norm": 0.01014970988035202, "learning_rate": 1.921e-05, "loss": 0.0596, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1419518.0, "step": 3080 }, { "epoch": 1.0815540777038852, "grad_norm": 8.101456642150879, "learning_rate": 1.911e-05, "loss": 0.0309, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1424063.0, "step": 3090 }, { "epoch": 1.0850542527126357, "grad_norm": 0.03248458355665207, "learning_rate": 1.901e-05, "loss": 0.0081, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1428657.0, "step": 3100 }, { "epoch": 1.088554427721386, "grad_norm": 0.284970760345459, "learning_rate": 1.891e-05, "loss": 0.0035, "mean_token_accuracy": 1.0, "num_tokens": 1433340.0, "step": 3110 }, { "epoch": 1.0920546027301365, "grad_norm": 5.219287872314453, "learning_rate": 1.881e-05, "loss": 0.0339, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1437947.0, "step": 3120 }, { "epoch": 1.095554777738887, "grad_norm": 0.021635359153151512, "learning_rate": 1.871e-05, "loss": 0.0533, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1442589.0, "step": 3130 }, { "epoch": 1.0990549527476374, "grad_norm": 0.05187542736530304, "learning_rate": 1.861e-05, "loss": 0.0297, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1447150.0, "step": 3140 }, { "epoch": 1.1025551277563879, "grad_norm": 0.05536261200904846, "learning_rate": 1.851e-05, "loss": 0.0013, "mean_token_accuracy": 1.0, "num_tokens": 1451702.0, "step": 3150 }, { "epoch": 1.1060553027651383, "grad_norm": 4.796628475189209, "learning_rate": 1.841e-05, "loss": 0.0073, "mean_token_accuracy": 1.0, "num_tokens": 1456224.0, "step": 3160 }, { "epoch": 1.1095554777738887, "grad_norm": 4.390865325927734, "learning_rate": 1.8310000000000003e-05, "loss": 0.0532, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1460755.0, "step": 3170 }, { "epoch": 1.1130556527826392, "grad_norm": 0.12759913504123688, "learning_rate": 1.8210000000000004e-05, "loss": 0.0183, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1465282.0, "step": 3180 }, { "epoch": 1.1165558277913896, "grad_norm": 0.023097023367881775, "learning_rate": 1.811e-05, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 1469858.0, "step": 3190 }, { "epoch": 1.12005600280014, "grad_norm": 0.013977882452309132, "learning_rate": 1.8010000000000002e-05, "loss": 0.0598, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1474377.0, "step": 3200 }, { "epoch": 1.1235561778088905, "grad_norm": 0.03361167758703232, "learning_rate": 1.7910000000000003e-05, "loss": 0.0238, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1478946.0, "step": 3210 }, { "epoch": 1.127056352817641, "grad_norm": 0.08658773452043533, "learning_rate": 1.781e-05, "loss": 0.0542, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1483522.0, "step": 3220 }, { "epoch": 1.1305565278263914, "grad_norm": 0.030420592054724693, "learning_rate": 1.771e-05, "loss": 0.0509, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 1488282.0, "step": 3230 }, { "epoch": 1.1340567028351418, "grad_norm": 0.10281772166490555, "learning_rate": 1.7610000000000002e-05, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 1492851.0, "step": 3240 }, { "epoch": 1.1375568778438923, "grad_norm": 0.061314165592193604, "learning_rate": 1.751e-05, "loss": 0.0397, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1497490.0, "step": 3250 }, { "epoch": 1.1410570528526427, "grad_norm": 0.05558156967163086, "learning_rate": 1.741e-05, "loss": 0.0341, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1502033.0, "step": 3260 }, { "epoch": 1.1445572278613931, "grad_norm": 0.7785694003105164, "learning_rate": 1.7310000000000002e-05, "loss": 0.0501, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1506574.0, "step": 3270 }, { "epoch": 1.1480574028701436, "grad_norm": 1.054373025894165, "learning_rate": 1.721e-05, "loss": 0.0073, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1511166.0, "step": 3280 }, { "epoch": 1.151557577878894, "grad_norm": 13.361648559570312, "learning_rate": 1.711e-05, "loss": 0.0203, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1515897.0, "step": 3290 }, { "epoch": 1.1550577528876445, "grad_norm": 9.834617614746094, "learning_rate": 1.701e-05, "loss": 0.0487, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 1520628.0, "step": 3300 }, { "epoch": 1.158557927896395, "grad_norm": 0.03448121249675751, "learning_rate": 1.6910000000000002e-05, "loss": 0.0257, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1525210.0, "step": 3310 }, { "epoch": 1.1620581029051453, "grad_norm": 0.11401532590389252, "learning_rate": 1.681e-05, "loss": 0.0379, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1529651.0, "step": 3320 }, { "epoch": 1.1655582779138958, "grad_norm": 3.9310457706451416, "learning_rate": 1.671e-05, "loss": 0.0212, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1534286.0, "step": 3330 }, { "epoch": 1.1690584529226462, "grad_norm": 0.012804349884390831, "learning_rate": 1.6610000000000002e-05, "loss": 0.0012, "mean_token_accuracy": 1.0, "num_tokens": 1538944.0, "step": 3340 }, { "epoch": 1.1725586279313966, "grad_norm": 0.7828325033187866, "learning_rate": 1.651e-05, "loss": 0.0346, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1543591.0, "step": 3350 }, { "epoch": 1.176058802940147, "grad_norm": 0.027147287502884865, "learning_rate": 1.641e-05, "loss": 0.0021, "mean_token_accuracy": 1.0, "num_tokens": 1548149.0, "step": 3360 }, { "epoch": 1.1795589779488975, "grad_norm": 0.05930430442094803, "learning_rate": 1.631e-05, "loss": 0.0335, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1552807.0, "step": 3370 }, { "epoch": 1.183059152957648, "grad_norm": 0.03868912532925606, "learning_rate": 1.621e-05, "loss": 0.0871, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1557375.0, "step": 3380 }, { "epoch": 1.1865593279663984, "grad_norm": 0.038131892681121826, "learning_rate": 1.611e-05, "loss": 0.0129, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1561984.0, "step": 3390 }, { "epoch": 1.1900595029751488, "grad_norm": 0.3329053521156311, "learning_rate": 1.601e-05, "loss": 0.0173, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1566645.0, "step": 3400 }, { "epoch": 1.1935596779838993, "grad_norm": 0.08648809045553207, "learning_rate": 1.591e-05, "loss": 0.0916, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1571275.0, "step": 3410 }, { "epoch": 1.1970598529926497, "grad_norm": 9.256734848022461, "learning_rate": 1.581e-05, "loss": 0.0156, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1575848.0, "step": 3420 }, { "epoch": 1.2005600280014002, "grad_norm": 0.20919840037822723, "learning_rate": 1.571e-05, "loss": 0.028, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1580383.0, "step": 3430 }, { "epoch": 1.2040602030101506, "grad_norm": 8.012375831604004, "learning_rate": 1.561e-05, "loss": 0.0042, "mean_token_accuracy": 1.0, "num_tokens": 1584986.0, "step": 3440 }, { "epoch": 1.207560378018901, "grad_norm": 0.024143755435943604, "learning_rate": 1.551e-05, "loss": 0.0021, "mean_token_accuracy": 1.0, "num_tokens": 1589541.0, "step": 3450 }, { "epoch": 1.2110605530276515, "grad_norm": 3.953441619873047, "learning_rate": 1.541e-05, "loss": 0.043, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1594177.0, "step": 3460 }, { "epoch": 1.214560728036402, "grad_norm": 3.8087575435638428, "learning_rate": 1.531e-05, "loss": 0.0538, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1598836.0, "step": 3470 }, { "epoch": 1.2180609030451524, "grad_norm": 0.01786259561777115, "learning_rate": 1.5210000000000002e-05, "loss": 0.0583, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1603561.0, "step": 3480 }, { "epoch": 1.2215610780539028, "grad_norm": 0.026305489242076874, "learning_rate": 1.5110000000000003e-05, "loss": 0.0121, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1608176.0, "step": 3490 }, { "epoch": 1.2250612530626532, "grad_norm": 0.026074456050992012, "learning_rate": 1.5010000000000002e-05, "loss": 0.0616, "step": 3500 }, { "epoch": 1.2250612530626532, "eval_accuracy": 0.4403032217308907, "eval_f1": 0.3717722608671718, "eval_loss": 0.04926339536905289, "eval_mean_token_accuracy": 0.9881313193326045, "eval_num_tokens": 1612812.0, "eval_precision": 0.44875078714612676, "eval_recall": 0.39254694591331135, "eval_runtime": 245.1895, "eval_samples_per_second": 6.456, "eval_steps_per_second": 0.808, "step": 3500 }, { "epoch": 1.2285614280714037, "grad_norm": 0.0484970398247242, "learning_rate": 1.4910000000000001e-05, "loss": 0.0593, "mean_token_accuracy": 0.9874999970197678, "num_tokens": 1617379.0, "step": 3510 }, { "epoch": 1.232061603080154, "grad_norm": 12.298089981079102, "learning_rate": 1.4810000000000002e-05, "loss": 0.0659, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1622039.0, "step": 3520 }, { "epoch": 1.2355617780889045, "grad_norm": 0.022822504863142967, "learning_rate": 1.4710000000000001e-05, "loss": 0.0392, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1626629.0, "step": 3530 }, { "epoch": 1.239061953097655, "grad_norm": 0.19993631541728973, "learning_rate": 1.461e-05, "loss": 0.003, "mean_token_accuracy": 1.0, "num_tokens": 1631184.0, "step": 3540 }, { "epoch": 1.2425621281064054, "grad_norm": 0.01650061272084713, "learning_rate": 1.4510000000000002e-05, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 1635787.0, "step": 3550 }, { "epoch": 1.2460623031151559, "grad_norm": 0.09447409212589264, "learning_rate": 1.4410000000000001e-05, "loss": 0.0111, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1640489.0, "step": 3560 }, { "epoch": 1.2495624781239063, "grad_norm": 1.348664402961731, "learning_rate": 1.4310000000000002e-05, "loss": 0.0037, "mean_token_accuracy": 1.0, "num_tokens": 1645122.0, "step": 3570 }, { "epoch": 1.2530626531326567, "grad_norm": 0.02807781472802162, "learning_rate": 1.4210000000000001e-05, "loss": 0.0101, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1649714.0, "step": 3580 }, { "epoch": 1.2565628281414072, "grad_norm": 0.0278321523219347, "learning_rate": 1.411e-05, "loss": 0.0317, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1654253.0, "step": 3590 }, { "epoch": 1.2600630031501576, "grad_norm": 0.05552316829562187, "learning_rate": 1.4010000000000001e-05, "loss": 0.0338, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1658870.0, "step": 3600 }, { "epoch": 1.263563178158908, "grad_norm": 0.5879592895507812, "learning_rate": 1.391e-05, "loss": 0.0768, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 1663540.0, "step": 3610 }, { "epoch": 1.2670633531676585, "grad_norm": 0.23051026463508606, "learning_rate": 1.381e-05, "loss": 0.0366, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1668164.0, "step": 3620 }, { "epoch": 1.270563528176409, "grad_norm": 7.013516426086426, "learning_rate": 1.3710000000000001e-05, "loss": 0.0831, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1672778.0, "step": 3630 }, { "epoch": 1.2740637031851594, "grad_norm": 0.20101211965084076, "learning_rate": 1.361e-05, "loss": 0.0336, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1677340.0, "step": 3640 }, { "epoch": 1.2775638781939098, "grad_norm": 0.3626852035522461, "learning_rate": 1.3510000000000001e-05, "loss": 0.043, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1681942.0, "step": 3650 }, { "epoch": 1.2810640532026603, "grad_norm": 0.053018514066934586, "learning_rate": 1.341e-05, "loss": 0.045, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1686613.0, "step": 3660 }, { "epoch": 1.2845642282114107, "grad_norm": 7.274749755859375, "learning_rate": 1.331e-05, "loss": 0.1322, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 1691194.0, "step": 3670 }, { "epoch": 1.2880644032201611, "grad_norm": 0.05607298016548157, "learning_rate": 1.321e-05, "loss": 0.0384, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1695910.0, "step": 3680 }, { "epoch": 1.2915645782289116, "grad_norm": 0.03872371464967728, "learning_rate": 1.311e-05, "loss": 0.0138, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1700440.0, "step": 3690 }, { "epoch": 1.295064753237662, "grad_norm": 0.042605865746736526, "learning_rate": 1.301e-05, "loss": 0.0845, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 1705135.0, "step": 3700 }, { "epoch": 1.2985649282464125, "grad_norm": 4.082870006561279, "learning_rate": 1.291e-05, "loss": 0.0633, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1709718.0, "step": 3710 }, { "epoch": 1.302065103255163, "grad_norm": 5.0214691162109375, "learning_rate": 1.281e-05, "loss": 0.0522, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1714292.0, "step": 3720 }, { "epoch": 1.3055652782639133, "grad_norm": 0.05840720981359482, "learning_rate": 1.271e-05, "loss": 0.0667, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1718744.0, "step": 3730 }, { "epoch": 1.3090654532726638, "grad_norm": 0.3673993647098541, "learning_rate": 1.261e-05, "loss": 0.0463, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1723373.0, "step": 3740 }, { "epoch": 1.312565628281414, "grad_norm": 0.06860412657260895, "learning_rate": 1.2509999999999999e-05, "loss": 0.0414, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1727984.0, "step": 3750 }, { "epoch": 1.3160658032901644, "grad_norm": 0.03777327015995979, "learning_rate": 1.2410000000000001e-05, "loss": 0.0116, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1732531.0, "step": 3760 }, { "epoch": 1.3195659782989149, "grad_norm": 0.7017369270324707, "learning_rate": 1.231e-05, "loss": 0.0685, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 1737147.0, "step": 3770 }, { "epoch": 1.3230661533076653, "grad_norm": 8.006946563720703, "learning_rate": 1.221e-05, "loss": 0.0483, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1741794.0, "step": 3780 }, { "epoch": 1.3265663283164157, "grad_norm": 7.42986536026001, "learning_rate": 1.2110000000000001e-05, "loss": 0.0691, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 1746457.0, "step": 3790 }, { "epoch": 1.3300665033251662, "grad_norm": 0.08513722568750381, "learning_rate": 1.201e-05, "loss": 0.0096, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1751031.0, "step": 3800 }, { "epoch": 1.3335666783339166, "grad_norm": 5.149372577667236, "learning_rate": 1.1910000000000001e-05, "loss": 0.1008, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 1755649.0, "step": 3810 }, { "epoch": 1.337066853342667, "grad_norm": 0.10420811176300049, "learning_rate": 1.181e-05, "loss": 0.0901, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1760094.0, "step": 3820 }, { "epoch": 1.3405670283514175, "grad_norm": 0.15396250784397125, "learning_rate": 1.171e-05, "loss": 0.006, "mean_token_accuracy": 1.0, "num_tokens": 1764739.0, "step": 3830 }, { "epoch": 1.344067203360168, "grad_norm": 0.08703949302434921, "learning_rate": 1.161e-05, "loss": 0.0277, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1769426.0, "step": 3840 }, { "epoch": 1.3475673783689184, "grad_norm": 2.2800724506378174, "learning_rate": 1.151e-05, "loss": 0.0215, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1774042.0, "step": 3850 }, { "epoch": 1.3510675533776688, "grad_norm": 7.864820957183838, "learning_rate": 1.141e-05, "loss": 0.0396, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1778764.0, "step": 3860 }, { "epoch": 1.3545677283864193, "grad_norm": 18.59937286376953, "learning_rate": 1.1310000000000002e-05, "loss": 0.0501, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1783410.0, "step": 3870 }, { "epoch": 1.3580679033951697, "grad_norm": 0.06370130181312561, "learning_rate": 1.1210000000000001e-05, "loss": 0.0804, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1788086.0, "step": 3880 }, { "epoch": 1.3615680784039201, "grad_norm": 3.136486053466797, "learning_rate": 1.111e-05, "loss": 0.0549, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1792576.0, "step": 3890 }, { "epoch": 1.3650682534126706, "grad_norm": 0.080386683344841, "learning_rate": 1.1010000000000001e-05, "loss": 0.0105, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1797137.0, "step": 3900 }, { "epoch": 1.368568428421421, "grad_norm": 0.311697393655777, "learning_rate": 1.091e-05, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 1801751.0, "step": 3910 }, { "epoch": 1.3720686034301715, "grad_norm": 0.04613969102501869, "learning_rate": 1.081e-05, "loss": 0.0236, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1806217.0, "step": 3920 }, { "epoch": 1.375568778438922, "grad_norm": 2.0834603309631348, "learning_rate": 1.071e-05, "loss": 0.0064, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1810797.0, "step": 3930 }, { "epoch": 1.3790689534476723, "grad_norm": 4.339105129241943, "learning_rate": 1.061e-05, "loss": 0.0572, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1815366.0, "step": 3940 }, { "epoch": 1.3825691284564228, "grad_norm": 0.03018569014966488, "learning_rate": 1.0510000000000001e-05, "loss": 0.0121, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1819910.0, "step": 3950 }, { "epoch": 1.3860693034651732, "grad_norm": 0.02608495019376278, "learning_rate": 1.041e-05, "loss": 0.0203, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1824620.0, "step": 3960 }, { "epoch": 1.3895694784739236, "grad_norm": 0.028722476214170456, "learning_rate": 1.031e-05, "loss": 0.0524, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1829297.0, "step": 3970 }, { "epoch": 1.393069653482674, "grad_norm": 3.794125556945801, "learning_rate": 1.021e-05, "loss": 0.1004, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 1833884.0, "step": 3980 }, { "epoch": 1.3965698284914245, "grad_norm": 0.0639004036784172, "learning_rate": 1.011e-05, "loss": 0.0432, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1838429.0, "step": 3990 }, { "epoch": 1.400070003500175, "grad_norm": 0.08853046596050262, "learning_rate": 1.001e-05, "loss": 0.0531, "step": 4000 }, { "epoch": 1.400070003500175, "eval_accuracy": 0.4491471888818699, "eval_f1": 0.3831962155491568, "eval_loss": 0.04581384360790253, "eval_mean_token_accuracy": 0.988113282003788, "eval_num_tokens": 1843093.0, "eval_precision": 0.4551209732080744, "eval_recall": 0.42343303927833725, "eval_runtime": 243.4032, "eval_samples_per_second": 6.504, "eval_steps_per_second": 0.813, "step": 4000 }, { "epoch": 1.4035701785089254, "grad_norm": 1.6858179569244385, "learning_rate": 9.91e-06, "loss": 0.0544, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1847649.0, "step": 4010 }, { "epoch": 1.4070703535176758, "grad_norm": 4.908888339996338, "learning_rate": 9.810000000000001e-06, "loss": 0.0405, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1852259.0, "step": 4020 }, { "epoch": 1.4105705285264263, "grad_norm": 0.08344841003417969, "learning_rate": 9.71e-06, "loss": 0.0251, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1856863.0, "step": 4030 }, { "epoch": 1.4140707035351767, "grad_norm": 0.43226656317710876, "learning_rate": 9.610000000000001e-06, "loss": 0.0363, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1861413.0, "step": 4040 }, { "epoch": 1.4175708785439272, "grad_norm": 5.785868167877197, "learning_rate": 9.51e-06, "loss": 0.069, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1865973.0, "step": 4050 }, { "epoch": 1.4210710535526776, "grad_norm": 0.880620002746582, "learning_rate": 9.410000000000001e-06, "loss": 0.0436, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1870553.0, "step": 4060 }, { "epoch": 1.424571228561428, "grad_norm": 6.6892218589782715, "learning_rate": 9.31e-06, "loss": 0.0627, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1875126.0, "step": 4070 }, { "epoch": 1.4280714035701785, "grad_norm": 0.048246119171381, "learning_rate": 9.21e-06, "loss": 0.0428, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1879726.0, "step": 4080 }, { "epoch": 1.431571578578929, "grad_norm": 0.07305438071489334, "learning_rate": 9.110000000000001e-06, "loss": 0.0387, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1884286.0, "step": 4090 }, { "epoch": 1.4350717535876794, "grad_norm": 11.415247917175293, "learning_rate": 9.01e-06, "loss": 0.0821, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1888772.0, "step": 4100 }, { "epoch": 1.4385719285964298, "grad_norm": 0.0724155455827713, "learning_rate": 8.910000000000001e-06, "loss": 0.0189, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1893335.0, "step": 4110 }, { "epoch": 1.4420721036051802, "grad_norm": 0.11276718974113464, "learning_rate": 8.81e-06, "loss": 0.0218, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1898013.0, "step": 4120 }, { "epoch": 1.4455722786139307, "grad_norm": 0.07353251427412033, "learning_rate": 8.71e-06, "loss": 0.0018, "mean_token_accuracy": 1.0, "num_tokens": 1902625.0, "step": 4130 }, { "epoch": 1.4490724536226811, "grad_norm": 0.031495820730924606, "learning_rate": 8.61e-06, "loss": 0.025, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1907228.0, "step": 4140 }, { "epoch": 1.4525726286314316, "grad_norm": 13.788881301879883, "learning_rate": 8.51e-06, "loss": 0.0804, "mean_token_accuracy": 0.975, "num_tokens": 1911797.0, "step": 4150 }, { "epoch": 1.456072803640182, "grad_norm": 0.05939871817827225, "learning_rate": 8.409999999999999e-06, "loss": 0.0432, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1916403.0, "step": 4160 }, { "epoch": 1.4595729786489324, "grad_norm": 11.028568267822266, "learning_rate": 8.31e-06, "loss": 0.0425, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1920937.0, "step": 4170 }, { "epoch": 1.4630731536576829, "grad_norm": 0.048251356929540634, "learning_rate": 8.210000000000001e-06, "loss": 0.0328, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1925503.0, "step": 4180 }, { "epoch": 1.4665733286664333, "grad_norm": 0.05744925141334534, "learning_rate": 8.11e-06, "loss": 0.0581, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1930161.0, "step": 4190 }, { "epoch": 1.4700735036751837, "grad_norm": 4.82538366317749, "learning_rate": 8.010000000000001e-06, "loss": 0.0637, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1934851.0, "step": 4200 }, { "epoch": 1.4735736786839342, "grad_norm": 0.015897316858172417, "learning_rate": 7.91e-06, "loss": 0.0553, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1939517.0, "step": 4210 }, { "epoch": 1.4770738536926846, "grad_norm": 0.4943805932998657, "learning_rate": 7.810000000000001e-06, "loss": 0.064, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 1944221.0, "step": 4220 }, { "epoch": 1.480574028701435, "grad_norm": 0.8401426672935486, "learning_rate": 7.71e-06, "loss": 0.004, "mean_token_accuracy": 1.0, "num_tokens": 1948737.0, "step": 4230 }, { "epoch": 1.4840742037101855, "grad_norm": 8.93281364440918, "learning_rate": 7.610000000000001e-06, "loss": 0.0687, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1953245.0, "step": 4240 }, { "epoch": 1.487574378718936, "grad_norm": 24.4106388092041, "learning_rate": 7.51e-06, "loss": 0.0928, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 1957794.0, "step": 4250 }, { "epoch": 1.4910745537276864, "grad_norm": 0.38299062848091125, "learning_rate": 7.41e-06, "loss": 0.0376, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1962327.0, "step": 4260 }, { "epoch": 1.4945747287364368, "grad_norm": 0.06252578645944595, "learning_rate": 7.31e-06, "loss": 0.0196, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1967028.0, "step": 4270 }, { "epoch": 1.4980749037451873, "grad_norm": 1.5602178573608398, "learning_rate": 7.2100000000000004e-06, "loss": 0.023, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 1971740.0, "step": 4280 }, { "epoch": 1.5015750787539377, "grad_norm": 0.031557030975818634, "learning_rate": 7.11e-06, "loss": 0.0233, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1976313.0, "step": 4290 }, { "epoch": 1.5050752537626881, "grad_norm": 0.027841169387102127, "learning_rate": 7.01e-06, "loss": 0.022, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1980963.0, "step": 4300 }, { "epoch": 1.5085754287714386, "grad_norm": 0.62822425365448, "learning_rate": 6.91e-06, "loss": 0.0481, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 1985494.0, "step": 4310 }, { "epoch": 1.512075603780189, "grad_norm": 0.05204153060913086, "learning_rate": 6.81e-06, "loss": 0.0123, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 1990229.0, "step": 4320 }, { "epoch": 1.5155757787889395, "grad_norm": 0.030517544597387314, "learning_rate": 6.710000000000001e-06, "loss": 0.0037, "mean_token_accuracy": 1.0, "num_tokens": 1994888.0, "step": 4330 }, { "epoch": 1.51907595379769, "grad_norm": 0.03292595595121384, "learning_rate": 6.610000000000001e-06, "loss": 0.0739, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 1999584.0, "step": 4340 }, { "epoch": 1.5225761288064403, "grad_norm": 0.04422605782747269, "learning_rate": 6.510000000000001e-06, "loss": 0.0677, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 2004217.0, "step": 4350 }, { "epoch": 1.5260763038151908, "grad_norm": 0.03554658591747284, "learning_rate": 6.4100000000000005e-06, "loss": 0.0764, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 2008889.0, "step": 4360 }, { "epoch": 1.5295764788239412, "grad_norm": 3.288350820541382, "learning_rate": 6.3100000000000006e-06, "loss": 0.0361, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 2013499.0, "step": 4370 }, { "epoch": 1.5330766538326916, "grad_norm": 0.06462374329566956, "learning_rate": 6.210000000000001e-06, "loss": 0.0048, "mean_token_accuracy": 1.0, "num_tokens": 2018094.0, "step": 4380 }, { "epoch": 1.536576828841442, "grad_norm": 0.1262829601764679, "learning_rate": 6.110000000000001e-06, "loss": 0.0165, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 2022654.0, "step": 4390 }, { "epoch": 1.5400770038501925, "grad_norm": 1.0548720359802246, "learning_rate": 6.01e-06, "loss": 0.0459, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 2027308.0, "step": 4400 }, { "epoch": 1.543577178858943, "grad_norm": 10.028485298156738, "learning_rate": 5.91e-06, "loss": 0.0683, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 2031892.0, "step": 4410 }, { "epoch": 1.5470773538676934, "grad_norm": 9.958955764770508, "learning_rate": 5.81e-06, "loss": 0.0637, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 2036478.0, "step": 4420 }, { "epoch": 1.5505775288764438, "grad_norm": 0.04276181757450104, "learning_rate": 5.71e-06, "loss": 0.0782, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 2041074.0, "step": 4430 }, { "epoch": 1.5540777038851943, "grad_norm": 0.047367651015520096, "learning_rate": 5.61e-06, "loss": 0.0377, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 2045668.0, "step": 4440 }, { "epoch": 1.5575778788939447, "grad_norm": 4.863480091094971, "learning_rate": 5.510000000000001e-06, "loss": 0.0285, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 2050297.0, "step": 4450 }, { "epoch": 1.5610780539026952, "grad_norm": 6.1144537925720215, "learning_rate": 5.410000000000001e-06, "loss": 0.0646, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 2054920.0, "step": 4460 }, { "epoch": 1.5645782289114456, "grad_norm": 0.05516400188207626, "learning_rate": 5.31e-06, "loss": 0.0097, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 2059412.0, "step": 4470 }, { "epoch": 1.568078403920196, "grad_norm": 0.06957421451807022, "learning_rate": 5.21e-06, "loss": 0.0453, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 2063933.0, "step": 4480 }, { "epoch": 1.5715785789289465, "grad_norm": 0.59195876121521, "learning_rate": 5.11e-06, "loss": 0.0545, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 2068549.0, "step": 4490 }, { "epoch": 1.575078753937697, "grad_norm": 4.435554504394531, "learning_rate": 5.01e-06, "loss": 0.0137, "step": 4500 }, { "epoch": 1.575078753937697, "eval_accuracy": 0.44662034112444726, "eval_f1": 0.3821042468723695, "eval_loss": 0.04538816958665848, "eval_mean_token_accuracy": 0.9878607569920896, "eval_num_tokens": 2073095.0, "eval_precision": 0.45302304542991134, "eval_recall": 0.42551771400250105, "eval_runtime": 243.4481, "eval_samples_per_second": 6.502, "eval_steps_per_second": 0.813, "step": 4500 }, { "epoch": 1.5785789289464474, "grad_norm": 0.03762364760041237, "learning_rate": 4.9100000000000004e-06, "loss": 0.026, "mean_token_accuracy": 0.9924999982118606, "num_tokens": 2077816.0, "step": 4510 }, { "epoch": 1.5820791039551978, "grad_norm": 0.038492508232593536, "learning_rate": 4.81e-06, "loss": 0.037, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 2082369.0, "step": 4520 }, { "epoch": 1.5855792789639482, "grad_norm": 0.04851048067212105, "learning_rate": 4.710000000000001e-06, "loss": 0.1037, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 2086980.0, "step": 4530 }, { "epoch": 1.5890794539726987, "grad_norm": 0.06226026266813278, "learning_rate": 4.610000000000001e-06, "loss": 0.0316, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 2091582.0, "step": 4540 }, { "epoch": 1.5925796289814491, "grad_norm": 0.012553819455206394, "learning_rate": 4.51e-06, "loss": 0.0426, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 2096137.0, "step": 4550 }, { "epoch": 1.5960798039901996, "grad_norm": 0.12090373784303665, "learning_rate": 4.41e-06, "loss": 0.0035, "mean_token_accuracy": 1.0, "num_tokens": 2100974.0, "step": 4560 }, { "epoch": 1.59957997899895, "grad_norm": 0.024477414786815643, "learning_rate": 4.31e-06, "loss": 0.0199, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 2105553.0, "step": 4570 }, { "epoch": 1.6030801540077004, "grad_norm": 9.273329734802246, "learning_rate": 4.21e-06, "loss": 0.0197, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 2110182.0, "step": 4580 }, { "epoch": 1.6065803290164509, "grad_norm": 6.43629264831543, "learning_rate": 4.11e-06, "loss": 0.0163, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 2114818.0, "step": 4590 }, { "epoch": 1.6100805040252013, "grad_norm": 0.047764312475919724, "learning_rate": 4.01e-06, "loss": 0.0112, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 2119456.0, "step": 4600 }, { "epoch": 1.6135806790339517, "grad_norm": 3.2197811603546143, "learning_rate": 3.910000000000001e-06, "loss": 0.0357, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 2124191.0, "step": 4610 }, { "epoch": 1.6170808540427022, "grad_norm": 0.04559561237692833, "learning_rate": 3.8100000000000004e-06, "loss": 0.0896, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 2128813.0, "step": 4620 }, { "epoch": 1.6205810290514526, "grad_norm": 0.03245115652680397, "learning_rate": 3.7100000000000005e-06, "loss": 0.0167, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 2133363.0, "step": 4630 }, { "epoch": 1.624081204060203, "grad_norm": 0.0637376606464386, "learning_rate": 3.61e-06, "loss": 0.0161, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 2137919.0, "step": 4640 }, { "epoch": 1.6275813790689533, "grad_norm": 0.10170795023441315, "learning_rate": 3.5100000000000003e-06, "loss": 0.0619, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 2142475.0, "step": 4650 }, { "epoch": 1.6310815540777037, "grad_norm": 0.11928985267877579, "learning_rate": 3.41e-06, "loss": 0.0317, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 2147153.0, "step": 4660 }, { "epoch": 1.6345817290864542, "grad_norm": 0.31450000405311584, "learning_rate": 3.31e-06, "loss": 0.0262, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 2151803.0, "step": 4670 }, { "epoch": 1.6380819040952046, "grad_norm": 0.025399642065167427, "learning_rate": 3.2099999999999998e-06, "loss": 0.029, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 2156302.0, "step": 4680 }, { "epoch": 1.641582079103955, "grad_norm": 0.07148288935422897, "learning_rate": 3.11e-06, "loss": 0.058, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 2160964.0, "step": 4690 }, { "epoch": 1.6450822541127055, "grad_norm": 0.043584585189819336, "learning_rate": 3.01e-06, "loss": 0.0118, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 2165614.0, "step": 4700 }, { "epoch": 1.648582429121456, "grad_norm": 0.021303439512848854, "learning_rate": 2.91e-06, "loss": 0.0593, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 2170098.0, "step": 4710 }, { "epoch": 1.6520826041302064, "grad_norm": 3.4671308994293213, "learning_rate": 2.81e-06, "loss": 0.0461, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 2174607.0, "step": 4720 }, { "epoch": 1.6555827791389568, "grad_norm": 0.03900500759482384, "learning_rate": 2.71e-06, "loss": 0.0029, "mean_token_accuracy": 1.0, "num_tokens": 2179286.0, "step": 4730 }, { "epoch": 1.6590829541477072, "grad_norm": 0.02918989770114422, "learning_rate": 2.6100000000000004e-06, "loss": 0.0892, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 2183836.0, "step": 4740 }, { "epoch": 1.6625831291564577, "grad_norm": 0.029129987582564354, "learning_rate": 2.51e-06, "loss": 0.0861, "mean_token_accuracy": 0.9699999928474426, "num_tokens": 2188430.0, "step": 4750 }, { "epoch": 1.6660833041652081, "grad_norm": 0.08811552822589874, "learning_rate": 2.4100000000000002e-06, "loss": 0.007, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 2193026.0, "step": 4760 }, { "epoch": 1.6695834791739586, "grad_norm": 3.6819851398468018, "learning_rate": 2.31e-06, "loss": 0.016, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 2197606.0, "step": 4770 }, { "epoch": 1.673083654182709, "grad_norm": 0.0475095734000206, "learning_rate": 2.2100000000000004e-06, "loss": 0.0318, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 2202131.0, "step": 4780 }, { "epoch": 1.6765838291914594, "grad_norm": 5.221133708953857, "learning_rate": 2.11e-06, "loss": 0.0138, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 2206887.0, "step": 4790 }, { "epoch": 1.6800840042002099, "grad_norm": 8.505605697631836, "learning_rate": 2.0100000000000002e-06, "loss": 0.0216, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 2211562.0, "step": 4800 }, { "epoch": 1.6835841792089603, "grad_norm": 0.05636419355869293, "learning_rate": 1.91e-06, "loss": 0.0568, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 2216043.0, "step": 4810 }, { "epoch": 1.6870843542177107, "grad_norm": 0.06148410961031914, "learning_rate": 1.8100000000000002e-06, "loss": 0.0313, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 2220686.0, "step": 4820 }, { "epoch": 1.6905845292264612, "grad_norm": 0.1644497960805893, "learning_rate": 1.7100000000000001e-06, "loss": 0.0611, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 2225264.0, "step": 4830 }, { "epoch": 1.6940847042352116, "grad_norm": 8.40280532836914, "learning_rate": 1.61e-06, "loss": 0.0134, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 2229777.0, "step": 4840 }, { "epoch": 1.697584879243962, "grad_norm": 0.4285930097103119, "learning_rate": 1.5100000000000002e-06, "loss": 0.0016, "mean_token_accuracy": 1.0, "num_tokens": 2234477.0, "step": 4850 }, { "epoch": 1.7010850542527125, "grad_norm": 0.05217473581433296, "learning_rate": 1.41e-06, "loss": 0.0159, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 2239087.0, "step": 4860 }, { "epoch": 1.704585229261463, "grad_norm": 17.20269012451172, "learning_rate": 1.3100000000000002e-06, "loss": 0.0332, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 2243751.0, "step": 4870 }, { "epoch": 1.7080854042702134, "grad_norm": 0.019595852121710777, "learning_rate": 1.21e-06, "loss": 0.0438, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 2248371.0, "step": 4880 }, { "epoch": 1.7115855792789638, "grad_norm": 0.021653831005096436, "learning_rate": 1.1100000000000002e-06, "loss": 0.0014, "mean_token_accuracy": 1.0, "num_tokens": 2252963.0, "step": 4890 }, { "epoch": 1.7150857542877143, "grad_norm": 0.06774129718542099, "learning_rate": 1.01e-06, "loss": 0.0086, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 2257627.0, "step": 4900 }, { "epoch": 1.7185859292964647, "grad_norm": 0.17008963227272034, "learning_rate": 9.100000000000001e-07, "loss": 0.0511, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 2262204.0, "step": 4910 }, { "epoch": 1.7220861043052151, "grad_norm": 0.05181132256984711, "learning_rate": 8.1e-07, "loss": 0.102, "mean_token_accuracy": 0.9699999928474426, "num_tokens": 2266782.0, "step": 4920 }, { "epoch": 1.7255862793139656, "grad_norm": 0.019397318363189697, "learning_rate": 7.100000000000001e-07, "loss": 0.0653, "mean_token_accuracy": 0.9799999952316284, "num_tokens": 2271407.0, "step": 4930 }, { "epoch": 1.729086454322716, "grad_norm": 12.307579040527344, "learning_rate": 6.100000000000001e-07, "loss": 0.0463, "mean_token_accuracy": 0.9899999976158143, "num_tokens": 2276067.0, "step": 4940 }, { "epoch": 1.7325866293314665, "grad_norm": 0.9262644648551941, "learning_rate": 5.100000000000001e-07, "loss": 0.1174, "mean_token_accuracy": 0.9749999940395355, "num_tokens": 2280720.0, "step": 4950 }, { "epoch": 1.736086804340217, "grad_norm": 10.878531455993652, "learning_rate": 4.1000000000000004e-07, "loss": 0.0366, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 2285258.0, "step": 4960 }, { "epoch": 1.7395869793489673, "grad_norm": 0.015575112774968147, "learning_rate": 3.1e-07, "loss": 0.013, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 2289807.0, "step": 4970 }, { "epoch": 1.7430871543577178, "grad_norm": 6.684645652770996, "learning_rate": 2.1e-07, "loss": 0.048, "mean_token_accuracy": 0.9849999964237213, "num_tokens": 2294427.0, "step": 4980 }, { "epoch": 1.7465873293664682, "grad_norm": 0.2887522280216217, "learning_rate": 1.1e-07, "loss": 0.0268, "mean_token_accuracy": 0.9949999988079071, "num_tokens": 2299106.0, "step": 4990 }, { "epoch": 1.7500875043752186, "grad_norm": 3.4086289405822754, "learning_rate": 1e-08, "loss": 0.0635, "step": 5000 }, { "epoch": 1.7500875043752186, "eval_accuracy": 0.4516740366392925, "eval_f1": 0.3847962989063844, "eval_loss": 0.04607350006699562, "eval_mean_token_accuracy": 0.9878607566910561, "eval_num_tokens": 2303714.0, "eval_precision": 0.4551334955315969, "eval_recall": 0.42526555766959756, "eval_runtime": 245.308, "eval_samples_per_second": 6.453, "eval_steps_per_second": 0.807, "step": 5000 } ], "logging_steps": 10, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.477983256915968e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }