| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9994972347913524, | |
| "eval_steps": 750, | |
| "global_step": 1491, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0033517680576504107, | |
| "grad_norm": 14.694869995117188, | |
| "learning_rate": 6.666666666666667e-06, | |
| "loss": 53.6406, | |
| "mean_token_accuracy": 0.5338318642228842, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.006703536115300821, | |
| "grad_norm": 14.033230781555176, | |
| "learning_rate": 1.3333333333333333e-05, | |
| "loss": 52.3838, | |
| "mean_token_accuracy": 0.5248840853571892, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.010055304172951232, | |
| "grad_norm": 6.804769039154053, | |
| "learning_rate": 2e-05, | |
| "loss": 47.9105, | |
| "mean_token_accuracy": 0.5399681400507689, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.013407072230601643, | |
| "grad_norm": 7.750083923339844, | |
| "learning_rate": 2.6666666666666667e-05, | |
| "loss": 41.8861, | |
| "mean_token_accuracy": 0.55653104968369, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.01675884028825205, | |
| "grad_norm": 6.184543132781982, | |
| "learning_rate": 3.3333333333333335e-05, | |
| "loss": 37.33, | |
| "mean_token_accuracy": 0.5655230440199375, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.020110608345902465, | |
| "grad_norm": 4.537179946899414, | |
| "learning_rate": 4e-05, | |
| "loss": 32.7503, | |
| "mean_token_accuracy": 0.587661711126566, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.023462376403552875, | |
| "grad_norm": 3.6645753383636475, | |
| "learning_rate": 4.666666666666667e-05, | |
| "loss": 29.1892, | |
| "mean_token_accuracy": 0.6075583577156067, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.026814144461203285, | |
| "grad_norm": 3.7526533603668213, | |
| "learning_rate": 5.333333333333333e-05, | |
| "loss": 26.3524, | |
| "mean_token_accuracy": 0.6198613092303276, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.030165912518853696, | |
| "grad_norm": 3.0561397075653076, | |
| "learning_rate": 6e-05, | |
| "loss": 24.1513, | |
| "mean_token_accuracy": 0.6353930421173573, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.0335176805765041, | |
| "grad_norm": 2.857618808746338, | |
| "learning_rate": 6.666666666666667e-05, | |
| "loss": 23.5029, | |
| "mean_token_accuracy": 0.6437373287975788, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.03686944863415452, | |
| "grad_norm": 2.7901978492736816, | |
| "learning_rate": 7.333333333333333e-05, | |
| "loss": 22.9387, | |
| "mean_token_accuracy": 0.646886795759201, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.04022121669180493, | |
| "grad_norm": 2.8266501426696777, | |
| "learning_rate": 8e-05, | |
| "loss": 22.0359, | |
| "mean_token_accuracy": 0.6525138475000858, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.04357298474945534, | |
| "grad_norm": 2.5010733604431152, | |
| "learning_rate": 8.666666666666667e-05, | |
| "loss": 21.5158, | |
| "mean_token_accuracy": 0.6548139773309231, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.04692475280710575, | |
| "grad_norm": 2.5834386348724365, | |
| "learning_rate": 9.333333333333334e-05, | |
| "loss": 21.5409, | |
| "mean_token_accuracy": 0.6478891499340534, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.05027652086475616, | |
| "grad_norm": 2.6927576065063477, | |
| "learning_rate": 0.0001, | |
| "loss": 20.1017, | |
| "mean_token_accuracy": 0.6757474772632122, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.05362828892240657, | |
| "grad_norm": 2.0276572704315186, | |
| "learning_rate": 9.964689265536724e-05, | |
| "loss": 19.9912, | |
| "mean_token_accuracy": 0.6763999305665493, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.05698005698005698, | |
| "grad_norm": 2.4628567695617676, | |
| "learning_rate": 9.929378531073446e-05, | |
| "loss": 19.9089, | |
| "mean_token_accuracy": 0.672279854118824, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.06033182503770739, | |
| "grad_norm": 2.258838415145874, | |
| "learning_rate": 9.89406779661017e-05, | |
| "loss": 19.7132, | |
| "mean_token_accuracy": 0.6713059276342392, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.0636835930953578, | |
| "grad_norm": 2.447565793991089, | |
| "learning_rate": 9.858757062146892e-05, | |
| "loss": 18.7631, | |
| "mean_token_accuracy": 0.6825208596885204, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.0670353611530082, | |
| "grad_norm": 2.1105902194976807, | |
| "learning_rate": 9.823446327683616e-05, | |
| "loss": 19.4631, | |
| "mean_token_accuracy": 0.6674435302615166, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.07038712921065862, | |
| "grad_norm": 2.309248447418213, | |
| "learning_rate": 9.78813559322034e-05, | |
| "loss": 19.0249, | |
| "mean_token_accuracy": 0.6734571024775505, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.07373889726830904, | |
| "grad_norm": 2.101681709289551, | |
| "learning_rate": 9.752824858757063e-05, | |
| "loss": 18.593, | |
| "mean_token_accuracy": 0.6875097192823887, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.07709066532595944, | |
| "grad_norm": 2.157726526260376, | |
| "learning_rate": 9.717514124293787e-05, | |
| "loss": 18.5973, | |
| "mean_token_accuracy": 0.6829216606914997, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.08044243338360986, | |
| "grad_norm": 2.0711209774017334, | |
| "learning_rate": 9.682203389830509e-05, | |
| "loss": 19.1541, | |
| "mean_token_accuracy": 0.6785640828311443, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.08379420144126026, | |
| "grad_norm": 2.015594959259033, | |
| "learning_rate": 9.646892655367233e-05, | |
| "loss": 18.9493, | |
| "mean_token_accuracy": 0.6861244946718216, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.08714596949891068, | |
| "grad_norm": 2.1295998096466064, | |
| "learning_rate": 9.611581920903955e-05, | |
| "loss": 18.5125, | |
| "mean_token_accuracy": 0.6793887488543987, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.09049773755656108, | |
| "grad_norm": 2.2496395111083984, | |
| "learning_rate": 9.576271186440679e-05, | |
| "loss": 18.4019, | |
| "mean_token_accuracy": 0.6890006221830844, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.0938495056142115, | |
| "grad_norm": 2.1168577671051025, | |
| "learning_rate": 9.540960451977402e-05, | |
| "loss": 18.7305, | |
| "mean_token_accuracy": 0.6841622419655323, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.0972012736718619, | |
| "grad_norm": 1.8554915189743042, | |
| "learning_rate": 9.505649717514125e-05, | |
| "loss": 18.6606, | |
| "mean_token_accuracy": 0.6859239712357521, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.10055304172951232, | |
| "grad_norm": 1.9698066711425781, | |
| "learning_rate": 9.470338983050848e-05, | |
| "loss": 19.1065, | |
| "mean_token_accuracy": 0.6759489566087723, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.10390480978716272, | |
| "grad_norm": 2.2483623027801514, | |
| "learning_rate": 9.43502824858757e-05, | |
| "loss": 18.8041, | |
| "mean_token_accuracy": 0.68142851293087, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.10725657784481314, | |
| "grad_norm": 1.8570690155029297, | |
| "learning_rate": 9.399717514124294e-05, | |
| "loss": 18.8862, | |
| "mean_token_accuracy": 0.6791303649544715, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.11060834590246355, | |
| "grad_norm": 2.143021583557129, | |
| "learning_rate": 9.364406779661016e-05, | |
| "loss": 18.7605, | |
| "mean_token_accuracy": 0.681893227249384, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.11396011396011396, | |
| "grad_norm": 1.8951307535171509, | |
| "learning_rate": 9.32909604519774e-05, | |
| "loss": 18.3005, | |
| "mean_token_accuracy": 0.6897541806101799, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.11731188201776437, | |
| "grad_norm": 1.971745252609253, | |
| "learning_rate": 9.293785310734464e-05, | |
| "loss": 18.8995, | |
| "mean_token_accuracy": 0.6820204116404056, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.12066365007541478, | |
| "grad_norm": 1.910328984260559, | |
| "learning_rate": 9.258474576271187e-05, | |
| "loss": 18.8808, | |
| "mean_token_accuracy": 0.6812884464859963, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.12401541813306519, | |
| "grad_norm": 1.730974555015564, | |
| "learning_rate": 9.223163841807911e-05, | |
| "loss": 18.0871, | |
| "mean_token_accuracy": 0.6907590143382549, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.1273671861907156, | |
| "grad_norm": 2.125452995300293, | |
| "learning_rate": 9.187853107344633e-05, | |
| "loss": 18.1569, | |
| "mean_token_accuracy": 0.689236406236887, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.13071895424836602, | |
| "grad_norm": 2.0234949588775635, | |
| "learning_rate": 9.152542372881357e-05, | |
| "loss": 18.3342, | |
| "mean_token_accuracy": 0.6902932204306126, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.1340707223060164, | |
| "grad_norm": 1.9802364110946655, | |
| "learning_rate": 9.11723163841808e-05, | |
| "loss": 18.7942, | |
| "mean_token_accuracy": 0.6788501650094986, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.13742249036366683, | |
| "grad_norm": 1.8897534608840942, | |
| "learning_rate": 9.081920903954803e-05, | |
| "loss": 18.4679, | |
| "mean_token_accuracy": 0.6900524459779263, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.14077425842131724, | |
| "grad_norm": 1.9040635824203491, | |
| "learning_rate": 9.046610169491526e-05, | |
| "loss": 18.0058, | |
| "mean_token_accuracy": 0.690093420445919, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.14412602647896766, | |
| "grad_norm": 2.0558955669403076, | |
| "learning_rate": 9.011299435028249e-05, | |
| "loss": 17.5489, | |
| "mean_token_accuracy": 0.7006829999387264, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.14747779453661808, | |
| "grad_norm": 1.7952055931091309, | |
| "learning_rate": 8.975988700564972e-05, | |
| "loss": 18.2907, | |
| "mean_token_accuracy": 0.6876891441643238, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.15082956259426847, | |
| "grad_norm": 1.8588192462921143, | |
| "learning_rate": 8.940677966101694e-05, | |
| "loss": 18.4005, | |
| "mean_token_accuracy": 0.6897859051823616, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.15418133065191889, | |
| "grad_norm": 1.9269477128982544, | |
| "learning_rate": 8.905367231638418e-05, | |
| "loss": 18.2096, | |
| "mean_token_accuracy": 0.6909494370222091, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.1575330987095693, | |
| "grad_norm": 1.8693301677703857, | |
| "learning_rate": 8.870056497175142e-05, | |
| "loss": 18.394, | |
| "mean_token_accuracy": 0.6836515329778194, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.16088486676721972, | |
| "grad_norm": 1.787061333656311, | |
| "learning_rate": 8.834745762711864e-05, | |
| "loss": 18.1503, | |
| "mean_token_accuracy": 0.6907145738601684, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.1642366348248701, | |
| "grad_norm": 1.8895225524902344, | |
| "learning_rate": 8.799435028248588e-05, | |
| "loss": 18.3026, | |
| "mean_token_accuracy": 0.6878940775990486, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.16758840288252053, | |
| "grad_norm": 1.835693120956421, | |
| "learning_rate": 8.764124293785311e-05, | |
| "loss": 17.9347, | |
| "mean_token_accuracy": 0.6917316012084485, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.17094017094017094, | |
| "grad_norm": 1.7408661842346191, | |
| "learning_rate": 8.728813559322035e-05, | |
| "loss": 18.0051, | |
| "mean_token_accuracy": 0.689583633840084, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.17429193899782136, | |
| "grad_norm": 1.9096996784210205, | |
| "learning_rate": 8.693502824858759e-05, | |
| "loss": 17.6064, | |
| "mean_token_accuracy": 0.6965925216674804, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.17764370705547175, | |
| "grad_norm": 1.9822146892547607, | |
| "learning_rate": 8.658192090395481e-05, | |
| "loss": 17.6301, | |
| "mean_token_accuracy": 0.7005406267940998, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.18099547511312217, | |
| "grad_norm": 1.8383901119232178, | |
| "learning_rate": 8.622881355932204e-05, | |
| "loss": 17.9114, | |
| "mean_token_accuracy": 0.6876685306429863, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.18434724317077258, | |
| "grad_norm": 1.7920355796813965, | |
| "learning_rate": 8.587570621468927e-05, | |
| "loss": 18.1271, | |
| "mean_token_accuracy": 0.689356567710638, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.187699011228423, | |
| "grad_norm": 1.6455663442611694, | |
| "learning_rate": 8.55225988700565e-05, | |
| "loss": 17.787, | |
| "mean_token_accuracy": 0.6919776491820813, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.1910507792860734, | |
| "grad_norm": 1.9442647695541382, | |
| "learning_rate": 8.516949152542373e-05, | |
| "loss": 17.6019, | |
| "mean_token_accuracy": 0.6980393722653389, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.1944025473437238, | |
| "grad_norm": 2.294377565383911, | |
| "learning_rate": 8.481638418079096e-05, | |
| "loss": 17.8778, | |
| "mean_token_accuracy": 0.6954585202038288, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.19775431540137423, | |
| "grad_norm": 1.8009259700775146, | |
| "learning_rate": 8.44632768361582e-05, | |
| "loss": 17.5257, | |
| "mean_token_accuracy": 0.6998075112700463, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.20110608345902464, | |
| "grad_norm": 2.015516757965088, | |
| "learning_rate": 8.411016949152542e-05, | |
| "loss": 17.7554, | |
| "mean_token_accuracy": 0.6968327619135379, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.20445785151667506, | |
| "grad_norm": 1.5640082359313965, | |
| "learning_rate": 8.375706214689266e-05, | |
| "loss": 17.3438, | |
| "mean_token_accuracy": 0.69996168166399, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.20780961957432545, | |
| "grad_norm": 1.9527899026870728, | |
| "learning_rate": 8.340395480225988e-05, | |
| "loss": 17.6883, | |
| "mean_token_accuracy": 0.6988407798111439, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.21116138763197587, | |
| "grad_norm": 1.8222606182098389, | |
| "learning_rate": 8.305084745762712e-05, | |
| "loss": 17.0646, | |
| "mean_token_accuracy": 0.7061679445207119, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.21451315568962628, | |
| "grad_norm": 1.8560868501663208, | |
| "learning_rate": 8.269774011299435e-05, | |
| "loss": 17.8875, | |
| "mean_token_accuracy": 0.6941629223525524, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.2178649237472767, | |
| "grad_norm": 1.7588037252426147, | |
| "learning_rate": 8.234463276836159e-05, | |
| "loss": 17.6412, | |
| "mean_token_accuracy": 0.6954927705228329, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.2212166918049271, | |
| "grad_norm": 1.738242268562317, | |
| "learning_rate": 8.199152542372883e-05, | |
| "loss": 17.8251, | |
| "mean_token_accuracy": 0.6898994512856007, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.2245684598625775, | |
| "grad_norm": 1.8485089540481567, | |
| "learning_rate": 8.163841807909605e-05, | |
| "loss": 17.3078, | |
| "mean_token_accuracy": 0.7000270999968052, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.22792022792022792, | |
| "grad_norm": 1.8579105138778687, | |
| "learning_rate": 8.128531073446328e-05, | |
| "loss": 17.3078, | |
| "mean_token_accuracy": 0.6995702408254146, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.23127199597787834, | |
| "grad_norm": 1.7994352579116821, | |
| "learning_rate": 8.093220338983051e-05, | |
| "loss": 17.7557, | |
| "mean_token_accuracy": 0.6928035505115986, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.23462376403552873, | |
| "grad_norm": 1.9240634441375732, | |
| "learning_rate": 8.057909604519774e-05, | |
| "loss": 17.4329, | |
| "mean_token_accuracy": 0.6960855178534985, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.23797553209317915, | |
| "grad_norm": 1.6718952655792236, | |
| "learning_rate": 8.022598870056498e-05, | |
| "loss": 17.5951, | |
| "mean_token_accuracy": 0.6947735913097859, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.24132730015082957, | |
| "grad_norm": 1.6835826635360718, | |
| "learning_rate": 7.98728813559322e-05, | |
| "loss": 18.1085, | |
| "mean_token_accuracy": 0.6882089108228684, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.24467906820847998, | |
| "grad_norm": 1.7387073040008545, | |
| "learning_rate": 7.951977401129944e-05, | |
| "loss": 17.799, | |
| "mean_token_accuracy": 0.6932998545467853, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.24803083626613037, | |
| "grad_norm": 2.0071725845336914, | |
| "learning_rate": 7.916666666666666e-05, | |
| "loss": 17.4076, | |
| "mean_token_accuracy": 0.6961173862218857, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.2513826043237808, | |
| "grad_norm": 2.326915740966797, | |
| "learning_rate": 7.88135593220339e-05, | |
| "loss": 17.3121, | |
| "mean_token_accuracy": 0.7005321949720382, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.2547343723814312, | |
| "grad_norm": 2.1876060962677, | |
| "learning_rate": 7.846045197740113e-05, | |
| "loss": 17.9069, | |
| "mean_token_accuracy": 0.6906426399946213, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.2580861404390816, | |
| "grad_norm": 1.849671483039856, | |
| "learning_rate": 7.810734463276837e-05, | |
| "loss": 17.483, | |
| "mean_token_accuracy": 0.7000573620200157, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.26143790849673204, | |
| "grad_norm": 1.6676862239837646, | |
| "learning_rate": 7.775423728813561e-05, | |
| "loss": 16.8936, | |
| "mean_token_accuracy": 0.7045633904635906, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.26478967655438246, | |
| "grad_norm": 1.6702505350112915, | |
| "learning_rate": 7.740112994350283e-05, | |
| "loss": 17.904, | |
| "mean_token_accuracy": 0.6874841086566448, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.2681414446120328, | |
| "grad_norm": 1.7280704975128174, | |
| "learning_rate": 7.704802259887007e-05, | |
| "loss": 17.4515, | |
| "mean_token_accuracy": 0.7018027983605861, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.27149321266968324, | |
| "grad_norm": 1.8801991939544678, | |
| "learning_rate": 7.669491525423729e-05, | |
| "loss": 17.43, | |
| "mean_token_accuracy": 0.7009049601852894, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.27484498072733365, | |
| "grad_norm": 1.9758073091506958, | |
| "learning_rate": 7.634180790960453e-05, | |
| "loss": 17.5984, | |
| "mean_token_accuracy": 0.6948069363832474, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.27819674878498407, | |
| "grad_norm": 1.5747147798538208, | |
| "learning_rate": 7.598870056497176e-05, | |
| "loss": 18.3079, | |
| "mean_token_accuracy": 0.6853139907121658, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.2815485168426345, | |
| "grad_norm": 1.6292234659194946, | |
| "learning_rate": 7.563559322033898e-05, | |
| "loss": 17.4527, | |
| "mean_token_accuracy": 0.697540608048439, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.2849002849002849, | |
| "grad_norm": 1.6185086965560913, | |
| "learning_rate": 7.528248587570622e-05, | |
| "loss": 17.4193, | |
| "mean_token_accuracy": 0.7012022204697133, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.2882520529579353, | |
| "grad_norm": 1.8361762762069702, | |
| "learning_rate": 7.492937853107344e-05, | |
| "loss": 17.4544, | |
| "mean_token_accuracy": 0.698820473998785, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.29160382101558574, | |
| "grad_norm": 1.7740592956542969, | |
| "learning_rate": 7.457627118644068e-05, | |
| "loss": 18.0507, | |
| "mean_token_accuracy": 0.6881603226065636, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.29495558907323616, | |
| "grad_norm": 1.8252911567687988, | |
| "learning_rate": 7.42231638418079e-05, | |
| "loss": 17.155, | |
| "mean_token_accuracy": 0.7065504610538482, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.2983073571308865, | |
| "grad_norm": 1.8424382209777832, | |
| "learning_rate": 7.387005649717514e-05, | |
| "loss": 17.3055, | |
| "mean_token_accuracy": 0.6978819817304611, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.30165912518853694, | |
| "grad_norm": 1.7494243383407593, | |
| "learning_rate": 7.351694915254238e-05, | |
| "loss": 16.8365, | |
| "mean_token_accuracy": 0.7099504336714745, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.30501089324618735, | |
| "grad_norm": 1.936540961265564, | |
| "learning_rate": 7.316384180790961e-05, | |
| "loss": 18.2753, | |
| "mean_token_accuracy": 0.6913827233016491, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.30836266130383777, | |
| "grad_norm": 1.810272216796875, | |
| "learning_rate": 7.281073446327685e-05, | |
| "loss": 17.0536, | |
| "mean_token_accuracy": 0.6986232809722424, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.3117144293614882, | |
| "grad_norm": 1.6832094192504883, | |
| "learning_rate": 7.245762711864407e-05, | |
| "loss": 17.2231, | |
| "mean_token_accuracy": 0.702030860632658, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.3150661974191386, | |
| "grad_norm": 1.8872151374816895, | |
| "learning_rate": 7.21045197740113e-05, | |
| "loss": 17.5502, | |
| "mean_token_accuracy": 0.6932449921965599, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.318417965476789, | |
| "grad_norm": 1.788021445274353, | |
| "learning_rate": 7.175141242937854e-05, | |
| "loss": 16.8596, | |
| "mean_token_accuracy": 0.7096694305539131, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.32176973353443944, | |
| "grad_norm": 1.8025559186935425, | |
| "learning_rate": 7.139830508474577e-05, | |
| "loss": 16.662, | |
| "mean_token_accuracy": 0.7063573338091373, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.3251215015920898, | |
| "grad_norm": 2.274674654006958, | |
| "learning_rate": 7.1045197740113e-05, | |
| "loss": 17.5965, | |
| "mean_token_accuracy": 0.6934389650821686, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.3284732696497402, | |
| "grad_norm": 1.6426053047180176, | |
| "learning_rate": 7.069209039548022e-05, | |
| "loss": 17.0914, | |
| "mean_token_accuracy": 0.7049042917788029, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.33182503770739064, | |
| "grad_norm": 1.6252586841583252, | |
| "learning_rate": 7.033898305084746e-05, | |
| "loss": 17.6078, | |
| "mean_token_accuracy": 0.6924709647893905, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.33517680576504105, | |
| "grad_norm": 1.7185930013656616, | |
| "learning_rate": 6.998587570621468e-05, | |
| "loss": 17.314, | |
| "mean_token_accuracy": 0.7039985358715057, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.33852857382269147, | |
| "grad_norm": 1.7891852855682373, | |
| "learning_rate": 6.963276836158192e-05, | |
| "loss": 17.2188, | |
| "mean_token_accuracy": 0.6977060906589031, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.3418803418803419, | |
| "grad_norm": 1.9103929996490479, | |
| "learning_rate": 6.927966101694916e-05, | |
| "loss": 17.4467, | |
| "mean_token_accuracy": 0.6982413403689861, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.3452321099379923, | |
| "grad_norm": 1.8996375799179077, | |
| "learning_rate": 6.892655367231638e-05, | |
| "loss": 16.9608, | |
| "mean_token_accuracy": 0.7054095402359962, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.3485838779956427, | |
| "grad_norm": 2.0335419178009033, | |
| "learning_rate": 6.857344632768362e-05, | |
| "loss": 17.3361, | |
| "mean_token_accuracy": 0.7016568422317505, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.35193564605329314, | |
| "grad_norm": 1.9008755683898926, | |
| "learning_rate": 6.822033898305085e-05, | |
| "loss": 16.9694, | |
| "mean_token_accuracy": 0.7059390284121037, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.3552874141109435, | |
| "grad_norm": 1.8340988159179688, | |
| "learning_rate": 6.786723163841809e-05, | |
| "loss": 17.3528, | |
| "mean_token_accuracy": 0.7033507622778415, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.3586391821685939, | |
| "grad_norm": 1.6903594732284546, | |
| "learning_rate": 6.751412429378532e-05, | |
| "loss": 17.3021, | |
| "mean_token_accuracy": 0.7001501135528088, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.36199095022624433, | |
| "grad_norm": 1.8101950883865356, | |
| "learning_rate": 6.716101694915255e-05, | |
| "loss": 17.938, | |
| "mean_token_accuracy": 0.6908830553293228, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.36534271828389475, | |
| "grad_norm": 1.6470075845718384, | |
| "learning_rate": 6.680790960451978e-05, | |
| "loss": 17.6612, | |
| "mean_token_accuracy": 0.6923478744924069, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.36869448634154517, | |
| "grad_norm": 2.1860337257385254, | |
| "learning_rate": 6.6454802259887e-05, | |
| "loss": 17.5684, | |
| "mean_token_accuracy": 0.6983748801052571, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.3720462543991956, | |
| "grad_norm": 1.717653512954712, | |
| "learning_rate": 6.610169491525424e-05, | |
| "loss": 17.1166, | |
| "mean_token_accuracy": 0.7025655619800091, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.375398022456846, | |
| "grad_norm": 1.9525723457336426, | |
| "learning_rate": 6.574858757062147e-05, | |
| "loss": 17.2908, | |
| "mean_token_accuracy": 0.6997996769845486, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.3787497905144964, | |
| "grad_norm": 1.6053602695465088, | |
| "learning_rate": 6.53954802259887e-05, | |
| "loss": 17.3894, | |
| "mean_token_accuracy": 0.698741364479065, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.3821015585721468, | |
| "grad_norm": 1.7356934547424316, | |
| "learning_rate": 6.504237288135594e-05, | |
| "loss": 17.1546, | |
| "mean_token_accuracy": 0.7013543620705605, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.3854533266297972, | |
| "grad_norm": 1.7188559770584106, | |
| "learning_rate": 6.468926553672316e-05, | |
| "loss": 17.7637, | |
| "mean_token_accuracy": 0.6936320647597313, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.3888050946874476, | |
| "grad_norm": 1.8413478136062622, | |
| "learning_rate": 6.43361581920904e-05, | |
| "loss": 17.8498, | |
| "mean_token_accuracy": 0.695782047510147, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.39215686274509803, | |
| "grad_norm": 1.5715190172195435, | |
| "learning_rate": 6.398305084745762e-05, | |
| "loss": 17.4304, | |
| "mean_token_accuracy": 0.6989135831594467, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.39550863080274845, | |
| "grad_norm": 1.8729442358016968, | |
| "learning_rate": 6.362994350282486e-05, | |
| "loss": 16.9125, | |
| "mean_token_accuracy": 0.708356649428606, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.39886039886039887, | |
| "grad_norm": 2.099592685699463, | |
| "learning_rate": 6.327683615819209e-05, | |
| "loss": 17.542, | |
| "mean_token_accuracy": 0.6888726130127907, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.4022121669180493, | |
| "grad_norm": 1.6204314231872559, | |
| "learning_rate": 6.292372881355933e-05, | |
| "loss": 16.9305, | |
| "mean_token_accuracy": 0.7038852870464325, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.4055639349756997, | |
| "grad_norm": 2.12034010887146, | |
| "learning_rate": 6.257062146892656e-05, | |
| "loss": 17.0389, | |
| "mean_token_accuracy": 0.704576326906681, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.4089157030333501, | |
| "grad_norm": 1.6821502447128296, | |
| "learning_rate": 6.221751412429379e-05, | |
| "loss": 16.788, | |
| "mean_token_accuracy": 0.7000284940004349, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.4122674710910005, | |
| "grad_norm": 1.8137435913085938, | |
| "learning_rate": 6.186440677966102e-05, | |
| "loss": 17.5926, | |
| "mean_token_accuracy": 0.6961537927389145, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.4156192391486509, | |
| "grad_norm": 1.6652235984802246, | |
| "learning_rate": 6.151129943502825e-05, | |
| "loss": 17.3539, | |
| "mean_token_accuracy": 0.7028377398848533, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.4189710072063013, | |
| "grad_norm": 1.766480803489685, | |
| "learning_rate": 6.115819209039548e-05, | |
| "loss": 17.529, | |
| "mean_token_accuracy": 0.6905739739537239, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.42232277526395173, | |
| "grad_norm": 1.6319854259490967, | |
| "learning_rate": 6.080508474576272e-05, | |
| "loss": 16.9847, | |
| "mean_token_accuracy": 0.7060947254300117, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.42567454332160215, | |
| "grad_norm": 2.1006696224212646, | |
| "learning_rate": 6.045197740112994e-05, | |
| "loss": 16.9317, | |
| "mean_token_accuracy": 0.7015593230724335, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.42902631137925257, | |
| "grad_norm": 1.7353427410125732, | |
| "learning_rate": 6.009887005649718e-05, | |
| "loss": 17.4744, | |
| "mean_token_accuracy": 0.7001501567661762, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.432378079436903, | |
| "grad_norm": 1.9449700117111206, | |
| "learning_rate": 5.974576271186441e-05, | |
| "loss": 16.8705, | |
| "mean_token_accuracy": 0.7026407413184643, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.4357298474945534, | |
| "grad_norm": 1.6030067205429077, | |
| "learning_rate": 5.9392655367231644e-05, | |
| "loss": 16.8924, | |
| "mean_token_accuracy": 0.702277285605669, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.43908161555220376, | |
| "grad_norm": 1.5722424983978271, | |
| "learning_rate": 5.903954802259888e-05, | |
| "loss": 17.364, | |
| "mean_token_accuracy": 0.6959278948605061, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.4424333836098542, | |
| "grad_norm": 1.8168216943740845, | |
| "learning_rate": 5.86864406779661e-05, | |
| "loss": 16.704, | |
| "mean_token_accuracy": 0.7045813865959645, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.4457851516675046, | |
| "grad_norm": 1.905402660369873, | |
| "learning_rate": 5.833333333333334e-05, | |
| "loss": 16.8896, | |
| "mean_token_accuracy": 0.7026248089969158, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.449136919725155, | |
| "grad_norm": 1.7437454462051392, | |
| "learning_rate": 5.798022598870056e-05, | |
| "loss": 17.0496, | |
| "mean_token_accuracy": 0.702862861007452, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.45248868778280543, | |
| "grad_norm": 1.7496871948242188, | |
| "learning_rate": 5.76271186440678e-05, | |
| "loss": 16.7024, | |
| "mean_token_accuracy": 0.7073140636086463, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.45584045584045585, | |
| "grad_norm": 1.6521803140640259, | |
| "learning_rate": 5.727401129943503e-05, | |
| "loss": 17.4437, | |
| "mean_token_accuracy": 0.6910906590521335, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.45919222389810627, | |
| "grad_norm": 1.7904677391052246, | |
| "learning_rate": 5.6920903954802264e-05, | |
| "loss": 17.4803, | |
| "mean_token_accuracy": 0.6987466789782047, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.4625439919557567, | |
| "grad_norm": 2.4545388221740723, | |
| "learning_rate": 5.65677966101695e-05, | |
| "loss": 17.2987, | |
| "mean_token_accuracy": 0.699196208268404, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.46589576001340705, | |
| "grad_norm": 1.6428866386413574, | |
| "learning_rate": 5.6214689265536723e-05, | |
| "loss": 16.7636, | |
| "mean_token_accuracy": 0.7029999569058418, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.46924752807105746, | |
| "grad_norm": 1.9685977697372437, | |
| "learning_rate": 5.586158192090396e-05, | |
| "loss": 17.3887, | |
| "mean_token_accuracy": 0.6938736639916897, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.4725992961287079, | |
| "grad_norm": 1.5567928552627563, | |
| "learning_rate": 5.550847457627118e-05, | |
| "loss": 17.1879, | |
| "mean_token_accuracy": 0.7024729043245316, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.4759510641863583, | |
| "grad_norm": 1.6846567392349243, | |
| "learning_rate": 5.515536723163842e-05, | |
| "loss": 16.8679, | |
| "mean_token_accuracy": 0.7025640495121479, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.4793028322440087, | |
| "grad_norm": 1.6596832275390625, | |
| "learning_rate": 5.480225988700565e-05, | |
| "loss": 16.7137, | |
| "mean_token_accuracy": 0.7031160019338131, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.48265460030165913, | |
| "grad_norm": 2.04453444480896, | |
| "learning_rate": 5.4449152542372885e-05, | |
| "loss": 17.0646, | |
| "mean_token_accuracy": 0.7018779084086418, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.48600636835930955, | |
| "grad_norm": 1.7244528532028198, | |
| "learning_rate": 5.409604519774012e-05, | |
| "loss": 17.1897, | |
| "mean_token_accuracy": 0.6981223806738853, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.48935813641695997, | |
| "grad_norm": 1.6929802894592285, | |
| "learning_rate": 5.3742937853107344e-05, | |
| "loss": 17.2678, | |
| "mean_token_accuracy": 0.6996262572705746, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.4927099044746104, | |
| "grad_norm": 1.7945303916931152, | |
| "learning_rate": 5.338983050847458e-05, | |
| "loss": 17.1465, | |
| "mean_token_accuracy": 0.7002299666404724, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.49606167253226074, | |
| "grad_norm": 1.5936013460159302, | |
| "learning_rate": 5.30367231638418e-05, | |
| "loss": 17.0265, | |
| "mean_token_accuracy": 0.6998031720519066, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.49941344058991116, | |
| "grad_norm": 1.553004264831543, | |
| "learning_rate": 5.268361581920904e-05, | |
| "loss": 16.7301, | |
| "mean_token_accuracy": 0.7022854961454869, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 0.5027652086475616, | |
| "grad_norm": 1.7667690515518188, | |
| "learning_rate": 5.2330508474576275e-05, | |
| "loss": 16.8576, | |
| "mean_token_accuracy": 0.7085686258971691, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.5027652086475616, | |
| "eval_loss": 1.0600364208221436, | |
| "eval_mean_token_accuracy": 0.7049777010093035, | |
| "eval_runtime": 1736.5707, | |
| "eval_samples_per_second": 1.392, | |
| "eval_steps_per_second": 0.174, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.506116976705212, | |
| "grad_norm": 1.4901829957962036, | |
| "learning_rate": 5.1977401129943505e-05, | |
| "loss": 17.0004, | |
| "mean_token_accuracy": 0.6990960523486137, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 0.5094687447628624, | |
| "grad_norm": 1.8451662063598633, | |
| "learning_rate": 5.162429378531074e-05, | |
| "loss": 17.2012, | |
| "mean_token_accuracy": 0.7007680244743824, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.5128205128205128, | |
| "grad_norm": 1.6952011585235596, | |
| "learning_rate": 5.1271186440677964e-05, | |
| "loss": 17.612, | |
| "mean_token_accuracy": 0.6927438467741013, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 0.5161722808781632, | |
| "grad_norm": 1.7307817935943604, | |
| "learning_rate": 5.09180790960452e-05, | |
| "loss": 16.8776, | |
| "mean_token_accuracy": 0.706513649225235, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.5195240489358136, | |
| "grad_norm": 1.6692585945129395, | |
| "learning_rate": 5.056497175141243e-05, | |
| "loss": 17.0364, | |
| "mean_token_accuracy": 0.704279126226902, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.5228758169934641, | |
| "grad_norm": 1.6963402032852173, | |
| "learning_rate": 5.0211864406779666e-05, | |
| "loss": 16.8957, | |
| "mean_token_accuracy": 0.7085353158414364, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.5262275850511144, | |
| "grad_norm": 1.678458571434021, | |
| "learning_rate": 4.9858757062146896e-05, | |
| "loss": 17.7932, | |
| "mean_token_accuracy": 0.6964584030210972, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 0.5295793531087649, | |
| "grad_norm": 1.7449827194213867, | |
| "learning_rate": 4.9505649717514125e-05, | |
| "loss": 16.8765, | |
| "mean_token_accuracy": 0.7036922007799149, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.5329311211664153, | |
| "grad_norm": 1.7107524871826172, | |
| "learning_rate": 4.915254237288136e-05, | |
| "loss": 17.243, | |
| "mean_token_accuracy": 0.6997682720422744, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 0.5362828892240656, | |
| "grad_norm": 1.6416223049163818, | |
| "learning_rate": 4.879943502824859e-05, | |
| "loss": 16.7253, | |
| "mean_token_accuracy": 0.7050332672894001, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.5396346572817161, | |
| "grad_norm": 1.867213249206543, | |
| "learning_rate": 4.844632768361582e-05, | |
| "loss": 16.8566, | |
| "mean_token_accuracy": 0.7032786093652248, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 0.5429864253393665, | |
| "grad_norm": 1.6539360284805298, | |
| "learning_rate": 4.809322033898305e-05, | |
| "loss": 16.6993, | |
| "mean_token_accuracy": 0.7117977932095527, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.546338193397017, | |
| "grad_norm": 1.752715826034546, | |
| "learning_rate": 4.7740112994350286e-05, | |
| "loss": 17.5809, | |
| "mean_token_accuracy": 0.6992670528590679, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 0.5496899614546673, | |
| "grad_norm": 1.806174397468567, | |
| "learning_rate": 4.7387005649717516e-05, | |
| "loss": 17.1588, | |
| "mean_token_accuracy": 0.6960965767502785, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.5530417295123178, | |
| "grad_norm": 1.719764232635498, | |
| "learning_rate": 4.703389830508475e-05, | |
| "loss": 16.8685, | |
| "mean_token_accuracy": 0.7025568410754204, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.5563934975699681, | |
| "grad_norm": 1.7800629138946533, | |
| "learning_rate": 4.668079096045198e-05, | |
| "loss": 16.8872, | |
| "mean_token_accuracy": 0.6994628652930259, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.5597452656276186, | |
| "grad_norm": 1.7011103630065918, | |
| "learning_rate": 4.632768361581921e-05, | |
| "loss": 17.2342, | |
| "mean_token_accuracy": 0.7006913289427757, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 0.563097033685269, | |
| "grad_norm": 1.6887695789337158, | |
| "learning_rate": 4.597457627118644e-05, | |
| "loss": 16.7385, | |
| "mean_token_accuracy": 0.7045929700136184, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.5664488017429193, | |
| "grad_norm": 1.9496142864227295, | |
| "learning_rate": 4.562146892655367e-05, | |
| "loss": 16.8387, | |
| "mean_token_accuracy": 0.7083131410181522, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 0.5698005698005698, | |
| "grad_norm": 1.7757388353347778, | |
| "learning_rate": 4.5268361581920906e-05, | |
| "loss": 17.3856, | |
| "mean_token_accuracy": 0.6994826771318913, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.5731523378582202, | |
| "grad_norm": 1.7115302085876465, | |
| "learning_rate": 4.491525423728814e-05, | |
| "loss": 16.5993, | |
| "mean_token_accuracy": 0.7093915119767189, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 0.5765041059158706, | |
| "grad_norm": 1.7968231439590454, | |
| "learning_rate": 4.456214689265537e-05, | |
| "loss": 16.8983, | |
| "mean_token_accuracy": 0.7087731070816516, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.579855873973521, | |
| "grad_norm": 1.6066899299621582, | |
| "learning_rate": 4.42090395480226e-05, | |
| "loss": 16.7126, | |
| "mean_token_accuracy": 0.7053335346281528, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 0.5832076420311715, | |
| "grad_norm": 1.6380205154418945, | |
| "learning_rate": 4.385593220338983e-05, | |
| "loss": 17.0037, | |
| "mean_token_accuracy": 0.7038719221949578, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.5865594100888218, | |
| "grad_norm": 1.8956695795059204, | |
| "learning_rate": 4.350282485875706e-05, | |
| "loss": 16.9679, | |
| "mean_token_accuracy": 0.6983371920883655, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.5899111781464723, | |
| "grad_norm": 1.625135064125061, | |
| "learning_rate": 4.314971751412429e-05, | |
| "loss": 17.0642, | |
| "mean_token_accuracy": 0.7067640118300915, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.5932629462041227, | |
| "grad_norm": 1.6344581842422485, | |
| "learning_rate": 4.279661016949153e-05, | |
| "loss": 16.3079, | |
| "mean_token_accuracy": 0.7225491903722286, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 0.596614714261773, | |
| "grad_norm": 1.7680976390838623, | |
| "learning_rate": 4.244350282485876e-05, | |
| "loss": 16.7187, | |
| "mean_token_accuracy": 0.7041032016277313, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.5999664823194235, | |
| "grad_norm": 1.8056613206863403, | |
| "learning_rate": 4.209039548022599e-05, | |
| "loss": 17.3536, | |
| "mean_token_accuracy": 0.6975419208407402, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 0.6033182503770739, | |
| "grad_norm": 1.8398966789245605, | |
| "learning_rate": 4.173728813559322e-05, | |
| "loss": 16.6245, | |
| "mean_token_accuracy": 0.7088275127112865, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.6066700184347243, | |
| "grad_norm": 1.8332566022872925, | |
| "learning_rate": 4.138418079096045e-05, | |
| "loss": 17.0128, | |
| "mean_token_accuracy": 0.7018843114376068, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 0.6100217864923747, | |
| "grad_norm": 1.6582337617874146, | |
| "learning_rate": 4.103107344632768e-05, | |
| "loss": 16.8948, | |
| "mean_token_accuracy": 0.7051651798188686, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.6133735545500252, | |
| "grad_norm": 1.7373839616775513, | |
| "learning_rate": 4.067796610169492e-05, | |
| "loss": 16.9138, | |
| "mean_token_accuracy": 0.7022108249366283, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 0.6167253226076755, | |
| "grad_norm": 1.6373577117919922, | |
| "learning_rate": 4.0324858757062154e-05, | |
| "loss": 17.0573, | |
| "mean_token_accuracy": 0.7042267486453057, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.620077090665326, | |
| "grad_norm": 1.581024408340454, | |
| "learning_rate": 3.997175141242938e-05, | |
| "loss": 16.6234, | |
| "mean_token_accuracy": 0.7054463028907776, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.6234288587229764, | |
| "grad_norm": 1.6900616884231567, | |
| "learning_rate": 3.961864406779661e-05, | |
| "loss": 17.0468, | |
| "mean_token_accuracy": 0.7014504976570606, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.6267806267806267, | |
| "grad_norm": 1.6560430526733398, | |
| "learning_rate": 3.926553672316384e-05, | |
| "loss": 16.909, | |
| "mean_token_accuracy": 0.7064756542444229, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 0.6301323948382772, | |
| "grad_norm": 1.8687000274658203, | |
| "learning_rate": 3.891242937853107e-05, | |
| "loss": 17.0047, | |
| "mean_token_accuracy": 0.7055176287889481, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.6334841628959276, | |
| "grad_norm": 1.777716040611267, | |
| "learning_rate": 3.855932203389831e-05, | |
| "loss": 16.556, | |
| "mean_token_accuracy": 0.7047871246933937, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 0.636835930953578, | |
| "grad_norm": 1.6830016374588013, | |
| "learning_rate": 3.820621468926554e-05, | |
| "loss": 16.5832, | |
| "mean_token_accuracy": 0.7049862682819367, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.6401876990112284, | |
| "grad_norm": 1.5959638357162476, | |
| "learning_rate": 3.7853107344632774e-05, | |
| "loss": 16.8336, | |
| "mean_token_accuracy": 0.7072055459022522, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 0.6435394670688789, | |
| "grad_norm": 1.82794189453125, | |
| "learning_rate": 3.7500000000000003e-05, | |
| "loss": 16.6644, | |
| "mean_token_accuracy": 0.7058505766093731, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.6468912351265292, | |
| "grad_norm": 1.6554478406906128, | |
| "learning_rate": 3.714689265536723e-05, | |
| "loss": 16.2796, | |
| "mean_token_accuracy": 0.7101977132260799, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 0.6502430031841796, | |
| "grad_norm": 1.8698370456695557, | |
| "learning_rate": 3.679378531073446e-05, | |
| "loss": 16.1934, | |
| "mean_token_accuracy": 0.7142874717712402, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.6535947712418301, | |
| "grad_norm": 1.8040566444396973, | |
| "learning_rate": 3.644067796610169e-05, | |
| "loss": 16.5345, | |
| "mean_token_accuracy": 0.7125143676996231, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.6569465392994804, | |
| "grad_norm": 1.6644558906555176, | |
| "learning_rate": 3.608757062146893e-05, | |
| "loss": 16.508, | |
| "mean_token_accuracy": 0.7078846462070942, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.6602983073571309, | |
| "grad_norm": 1.7228506803512573, | |
| "learning_rate": 3.573446327683616e-05, | |
| "loss": 16.8474, | |
| "mean_token_accuracy": 0.7084795109927654, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 0.6636500754147813, | |
| "grad_norm": 1.486241102218628, | |
| "learning_rate": 3.5381355932203394e-05, | |
| "loss": 17.1453, | |
| "mean_token_accuracy": 0.6975291892886162, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.6670018434724317, | |
| "grad_norm": 1.7130765914916992, | |
| "learning_rate": 3.5028248587570624e-05, | |
| "loss": 16.458, | |
| "mean_token_accuracy": 0.7106956362724304, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 0.6703536115300821, | |
| "grad_norm": 1.863926649093628, | |
| "learning_rate": 3.467514124293785e-05, | |
| "loss": 17.3095, | |
| "mean_token_accuracy": 0.6962033234536648, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.6737053795877326, | |
| "grad_norm": 1.6535072326660156, | |
| "learning_rate": 3.432203389830508e-05, | |
| "loss": 16.6846, | |
| "mean_token_accuracy": 0.7084034703671932, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 0.6770571476453829, | |
| "grad_norm": 1.7278594970703125, | |
| "learning_rate": 3.396892655367232e-05, | |
| "loss": 16.9805, | |
| "mean_token_accuracy": 0.7026786416769027, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.6804089157030333, | |
| "grad_norm": 1.9055004119873047, | |
| "learning_rate": 3.361581920903955e-05, | |
| "loss": 17.2562, | |
| "mean_token_accuracy": 0.6977267302572727, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 0.6837606837606838, | |
| "grad_norm": 1.6398614645004272, | |
| "learning_rate": 3.326271186440678e-05, | |
| "loss": 17.3378, | |
| "mean_token_accuracy": 0.6958214737474918, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.6871124518183341, | |
| "grad_norm": 1.926950454711914, | |
| "learning_rate": 3.2909604519774014e-05, | |
| "loss": 16.6536, | |
| "mean_token_accuracy": 0.7083842910826206, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 0.6904642198759846, | |
| "grad_norm": 1.8061659336090088, | |
| "learning_rate": 3.2556497175141244e-05, | |
| "loss": 16.643, | |
| "mean_token_accuracy": 0.7093963578343392, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.693815987933635, | |
| "grad_norm": 1.6816084384918213, | |
| "learning_rate": 3.2203389830508473e-05, | |
| "loss": 16.9696, | |
| "mean_token_accuracy": 0.7000316813588142, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 0.6971677559912854, | |
| "grad_norm": 1.630842685699463, | |
| "learning_rate": 3.185028248587571e-05, | |
| "loss": 16.587, | |
| "mean_token_accuracy": 0.7107978977262974, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.7005195240489358, | |
| "grad_norm": 1.755123257637024, | |
| "learning_rate": 3.149717514124294e-05, | |
| "loss": 17.0736, | |
| "mean_token_accuracy": 0.7017260067164898, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 0.7038712921065863, | |
| "grad_norm": 1.4850029945373535, | |
| "learning_rate": 3.114406779661017e-05, | |
| "loss": 16.3165, | |
| "mean_token_accuracy": 0.7119720429182053, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.7072230601642366, | |
| "grad_norm": 1.916961908340454, | |
| "learning_rate": 3.0790960451977405e-05, | |
| "loss": 17.0237, | |
| "mean_token_accuracy": 0.6976533338427544, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 0.710574828221887, | |
| "grad_norm": 1.5003294944763184, | |
| "learning_rate": 3.043785310734463e-05, | |
| "loss": 16.8504, | |
| "mean_token_accuracy": 0.7056308597326278, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.7139265962795375, | |
| "grad_norm": 1.9166836738586426, | |
| "learning_rate": 3.0084745762711864e-05, | |
| "loss": 16.8231, | |
| "mean_token_accuracy": 0.7023352533578873, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 0.7172783643371878, | |
| "grad_norm": 1.7789411544799805, | |
| "learning_rate": 2.97316384180791e-05, | |
| "loss": 17.3132, | |
| "mean_token_accuracy": 0.6994914725422859, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.7206301323948383, | |
| "grad_norm": 1.7289875745773315, | |
| "learning_rate": 2.937853107344633e-05, | |
| "loss": 17.3902, | |
| "mean_token_accuracy": 0.69447166249156, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 0.7239819004524887, | |
| "grad_norm": 1.4835467338562012, | |
| "learning_rate": 2.902542372881356e-05, | |
| "loss": 16.751, | |
| "mean_token_accuracy": 0.7052346661686897, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.7273336685101391, | |
| "grad_norm": 1.5802119970321655, | |
| "learning_rate": 2.8672316384180792e-05, | |
| "loss": 16.6574, | |
| "mean_token_accuracy": 0.7059398606419564, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 0.7306854365677895, | |
| "grad_norm": 1.8420851230621338, | |
| "learning_rate": 2.8319209039548022e-05, | |
| "loss": 16.9315, | |
| "mean_token_accuracy": 0.7063411138951778, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.7340372046254399, | |
| "grad_norm": 1.7593777179718018, | |
| "learning_rate": 2.7966101694915255e-05, | |
| "loss": 16.8653, | |
| "mean_token_accuracy": 0.7089171193540096, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 0.7373889726830903, | |
| "grad_norm": 1.681443452835083, | |
| "learning_rate": 2.7612994350282488e-05, | |
| "loss": 16.9878, | |
| "mean_token_accuracy": 0.7057393230497837, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.7407407407407407, | |
| "grad_norm": 1.6064281463623047, | |
| "learning_rate": 2.725988700564972e-05, | |
| "loss": 16.6153, | |
| "mean_token_accuracy": 0.7038764618337154, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 0.7440925087983912, | |
| "grad_norm": 1.5632483959197998, | |
| "learning_rate": 2.690677966101695e-05, | |
| "loss": 16.0927, | |
| "mean_token_accuracy": 0.7171440742909908, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.7474442768560415, | |
| "grad_norm": 1.8588156700134277, | |
| "learning_rate": 2.6553672316384183e-05, | |
| "loss": 16.5765, | |
| "mean_token_accuracy": 0.7098327249288559, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 0.750796044913692, | |
| "grad_norm": 1.5576221942901611, | |
| "learning_rate": 2.6200564971751413e-05, | |
| "loss": 16.6568, | |
| "mean_token_accuracy": 0.7029327027499676, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.7541478129713424, | |
| "grad_norm": 1.645244836807251, | |
| "learning_rate": 2.5847457627118642e-05, | |
| "loss": 16.7294, | |
| "mean_token_accuracy": 0.7060277953743934, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 0.7574995810289928, | |
| "grad_norm": 1.4038984775543213, | |
| "learning_rate": 2.549435028248588e-05, | |
| "loss": 16.5925, | |
| "mean_token_accuracy": 0.7068064086139202, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.7608513490866432, | |
| "grad_norm": 1.7987641096115112, | |
| "learning_rate": 2.514124293785311e-05, | |
| "loss": 16.6834, | |
| "mean_token_accuracy": 0.7070130936801433, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 0.7642031171442936, | |
| "grad_norm": 1.5423444509506226, | |
| "learning_rate": 2.478813559322034e-05, | |
| "loss": 16.4551, | |
| "mean_token_accuracy": 0.7121224895119667, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.767554885201944, | |
| "grad_norm": 1.7546942234039307, | |
| "learning_rate": 2.443502824858757e-05, | |
| "loss": 16.9741, | |
| "mean_token_accuracy": 0.7010989025235176, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 0.7709066532595944, | |
| "grad_norm": 1.8481935262680054, | |
| "learning_rate": 2.4081920903954803e-05, | |
| "loss": 16.6323, | |
| "mean_token_accuracy": 0.7058765202760696, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.7742584213172449, | |
| "grad_norm": 1.6855909824371338, | |
| "learning_rate": 2.3728813559322036e-05, | |
| "loss": 16.6844, | |
| "mean_token_accuracy": 0.7119428858160972, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 0.7776101893748952, | |
| "grad_norm": 1.9828130006790161, | |
| "learning_rate": 2.3375706214689266e-05, | |
| "loss": 16.866, | |
| "mean_token_accuracy": 0.7036800056695938, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.7809619574325457, | |
| "grad_norm": 1.5005120038986206, | |
| "learning_rate": 2.30225988700565e-05, | |
| "loss": 16.3539, | |
| "mean_token_accuracy": 0.711839384585619, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 0.7843137254901961, | |
| "grad_norm": 2.262735366821289, | |
| "learning_rate": 2.266949152542373e-05, | |
| "loss": 16.4102, | |
| "mean_token_accuracy": 0.7110463745892048, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.7876654935478465, | |
| "grad_norm": 1.6699568033218384, | |
| "learning_rate": 2.231638418079096e-05, | |
| "loss": 17.1027, | |
| "mean_token_accuracy": 0.7031991191208362, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 0.7910172616054969, | |
| "grad_norm": 1.6248890161514282, | |
| "learning_rate": 2.196327683615819e-05, | |
| "loss": 16.3399, | |
| "mean_token_accuracy": 0.7143234215676785, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.7943690296631473, | |
| "grad_norm": 1.7570775747299194, | |
| "learning_rate": 2.1610169491525427e-05, | |
| "loss": 16.2255, | |
| "mean_token_accuracy": 0.7123358778655529, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 0.7977207977207977, | |
| "grad_norm": 1.9391677379608154, | |
| "learning_rate": 2.1257062146892657e-05, | |
| "loss": 16.3472, | |
| "mean_token_accuracy": 0.711616413295269, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.8010725657784481, | |
| "grad_norm": 1.8997981548309326, | |
| "learning_rate": 2.0903954802259886e-05, | |
| "loss": 16.5601, | |
| "mean_token_accuracy": 0.7071553356945515, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 0.8044243338360986, | |
| "grad_norm": 1.6094359159469604, | |
| "learning_rate": 2.055084745762712e-05, | |
| "loss": 16.622, | |
| "mean_token_accuracy": 0.7043877936899662, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.8077761018937489, | |
| "grad_norm": 1.7940973043441772, | |
| "learning_rate": 2.0197740112994352e-05, | |
| "loss": 16.6535, | |
| "mean_token_accuracy": 0.705554535984993, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 0.8111278699513994, | |
| "grad_norm": 1.6890041828155518, | |
| "learning_rate": 1.984463276836158e-05, | |
| "loss": 17.2328, | |
| "mean_token_accuracy": 0.6988375537097454, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.8144796380090498, | |
| "grad_norm": 1.5568735599517822, | |
| "learning_rate": 1.9491525423728814e-05, | |
| "loss": 16.9753, | |
| "mean_token_accuracy": 0.7015632651746273, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 0.8178314060667002, | |
| "grad_norm": 1.7157835960388184, | |
| "learning_rate": 1.9138418079096047e-05, | |
| "loss": 16.3668, | |
| "mean_token_accuracy": 0.7098449252545833, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.8211831741243506, | |
| "grad_norm": 1.7175644636154175, | |
| "learning_rate": 1.8785310734463277e-05, | |
| "loss": 16.8061, | |
| "mean_token_accuracy": 0.7032932281494141, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 0.824534942182001, | |
| "grad_norm": 1.7225829362869263, | |
| "learning_rate": 1.843220338983051e-05, | |
| "loss": 16.5716, | |
| "mean_token_accuracy": 0.7074852548539639, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.8278867102396514, | |
| "grad_norm": 1.8654727935791016, | |
| "learning_rate": 1.8079096045197743e-05, | |
| "loss": 16.8172, | |
| "mean_token_accuracy": 0.7035241700708866, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 0.8312384782973018, | |
| "grad_norm": 1.9604694843292236, | |
| "learning_rate": 1.7725988700564972e-05, | |
| "loss": 16.2992, | |
| "mean_token_accuracy": 0.714275274425745, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.8345902463549523, | |
| "grad_norm": 1.7569185495376587, | |
| "learning_rate": 1.7372881355932205e-05, | |
| "loss": 16.6269, | |
| "mean_token_accuracy": 0.7052666112780571, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 0.8379420144126026, | |
| "grad_norm": 1.6537069082260132, | |
| "learning_rate": 1.7019774011299435e-05, | |
| "loss": 16.5978, | |
| "mean_token_accuracy": 0.708269502967596, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.8412937824702531, | |
| "grad_norm": 1.8623359203338623, | |
| "learning_rate": 1.6666666666666667e-05, | |
| "loss": 16.1831, | |
| "mean_token_accuracy": 0.7164609245955944, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 0.8446455505279035, | |
| "grad_norm": 1.7004101276397705, | |
| "learning_rate": 1.63135593220339e-05, | |
| "loss": 16.9611, | |
| "mean_token_accuracy": 0.7057129152119159, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.8479973185855538, | |
| "grad_norm": 1.8294973373413086, | |
| "learning_rate": 1.596045197740113e-05, | |
| "loss": 16.8036, | |
| "mean_token_accuracy": 0.7046464517712593, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 0.8513490866432043, | |
| "grad_norm": 1.7992702722549438, | |
| "learning_rate": 1.5607344632768363e-05, | |
| "loss": 16.139, | |
| "mean_token_accuracy": 0.7126708298921585, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.8547008547008547, | |
| "grad_norm": 2.033846855163574, | |
| "learning_rate": 1.5254237288135596e-05, | |
| "loss": 16.49, | |
| "mean_token_accuracy": 0.707030464708805, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 0.8580526227585051, | |
| "grad_norm": 1.690617561340332, | |
| "learning_rate": 1.4901129943502825e-05, | |
| "loss": 16.7829, | |
| "mean_token_accuracy": 0.7026272863149643, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.8614043908161555, | |
| "grad_norm": 1.7161706686019897, | |
| "learning_rate": 1.4548022598870056e-05, | |
| "loss": 16.4907, | |
| "mean_token_accuracy": 0.7054763376712799, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 0.864756158873806, | |
| "grad_norm": 1.5910500288009644, | |
| "learning_rate": 1.419491525423729e-05, | |
| "loss": 16.3073, | |
| "mean_token_accuracy": 0.7165283918380737, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.8681079269314563, | |
| "grad_norm": 1.5939749479293823, | |
| "learning_rate": 1.384180790960452e-05, | |
| "loss": 16.6524, | |
| "mean_token_accuracy": 0.705347529053688, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 0.8714596949891068, | |
| "grad_norm": 1.7478996515274048, | |
| "learning_rate": 1.3488700564971752e-05, | |
| "loss": 17.1832, | |
| "mean_token_accuracy": 0.6956523738801479, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.8748114630467572, | |
| "grad_norm": 1.6442205905914307, | |
| "learning_rate": 1.3135593220338985e-05, | |
| "loss": 16.3978, | |
| "mean_token_accuracy": 0.7132278561592102, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 0.8781632311044075, | |
| "grad_norm": 1.7201565504074097, | |
| "learning_rate": 1.2782485875706216e-05, | |
| "loss": 16.3159, | |
| "mean_token_accuracy": 0.711051919311285, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.881514999162058, | |
| "grad_norm": 1.829209327697754, | |
| "learning_rate": 1.2429378531073447e-05, | |
| "loss": 16.7987, | |
| "mean_token_accuracy": 0.7058401651680469, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 0.8848667672197084, | |
| "grad_norm": 1.4660886526107788, | |
| "learning_rate": 1.2076271186440678e-05, | |
| "loss": 16.7297, | |
| "mean_token_accuracy": 0.7092804253101349, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.8882185352773588, | |
| "grad_norm": 1.4927663803100586, | |
| "learning_rate": 1.172316384180791e-05, | |
| "loss": 15.9333, | |
| "mean_token_accuracy": 0.7158772744238376, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 0.8915703033350092, | |
| "grad_norm": 1.6522186994552612, | |
| "learning_rate": 1.137005649717514e-05, | |
| "loss": 16.4156, | |
| "mean_token_accuracy": 0.7134528748691082, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.8949220713926597, | |
| "grad_norm": 1.7809523344039917, | |
| "learning_rate": 1.1016949152542374e-05, | |
| "loss": 16.2625, | |
| "mean_token_accuracy": 0.7148336976766586, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 0.89827383945031, | |
| "grad_norm": 1.8860619068145752, | |
| "learning_rate": 1.0663841807909605e-05, | |
| "loss": 16.6187, | |
| "mean_token_accuracy": 0.7087382405996323, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.9016256075079605, | |
| "grad_norm": 1.854195475578308, | |
| "learning_rate": 1.0310734463276836e-05, | |
| "loss": 16.5843, | |
| "mean_token_accuracy": 0.7144103929400444, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 0.9049773755656109, | |
| "grad_norm": 1.7052239179611206, | |
| "learning_rate": 9.957627118644067e-06, | |
| "loss": 16.3345, | |
| "mean_token_accuracy": 0.7125584341585636, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.9083291436232612, | |
| "grad_norm": 1.5887420177459717, | |
| "learning_rate": 9.6045197740113e-06, | |
| "loss": 16.2409, | |
| "mean_token_accuracy": 0.7080107174813748, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 0.9116809116809117, | |
| "grad_norm": 1.6052732467651367, | |
| "learning_rate": 9.251412429378532e-06, | |
| "loss": 16.2373, | |
| "mean_token_accuracy": 0.7137157171964645, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.9150326797385621, | |
| "grad_norm": 1.7612617015838623, | |
| "learning_rate": 8.898305084745763e-06, | |
| "loss": 16.0292, | |
| "mean_token_accuracy": 0.7181592255830764, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 0.9183844477962125, | |
| "grad_norm": 1.8271749019622803, | |
| "learning_rate": 8.545197740112996e-06, | |
| "loss": 16.8757, | |
| "mean_token_accuracy": 0.701992305368185, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.9217362158538629, | |
| "grad_norm": 1.6350926160812378, | |
| "learning_rate": 8.192090395480225e-06, | |
| "loss": 16.6061, | |
| "mean_token_accuracy": 0.7089238859713077, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 0.9250879839115134, | |
| "grad_norm": 1.7321621179580688, | |
| "learning_rate": 7.838983050847458e-06, | |
| "loss": 16.2532, | |
| "mean_token_accuracy": 0.7115737572312355, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.9284397519691637, | |
| "grad_norm": 1.8958040475845337, | |
| "learning_rate": 7.48587570621469e-06, | |
| "loss": 16.5068, | |
| "mean_token_accuracy": 0.7108790181577206, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 0.9317915200268141, | |
| "grad_norm": 1.629992127418518, | |
| "learning_rate": 7.1327683615819206e-06, | |
| "loss": 16.2367, | |
| "mean_token_accuracy": 0.7134776934981346, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.9351432880844646, | |
| "grad_norm": 1.904123067855835, | |
| "learning_rate": 6.779661016949153e-06, | |
| "loss": 16.3444, | |
| "mean_token_accuracy": 0.7045241884887219, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 0.9384950561421149, | |
| "grad_norm": 1.6319600343704224, | |
| "learning_rate": 6.426553672316385e-06, | |
| "loss": 16.3, | |
| "mean_token_accuracy": 0.7118948072195053, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.9418468241997654, | |
| "grad_norm": 1.6921709775924683, | |
| "learning_rate": 6.073446327683617e-06, | |
| "loss": 16.5816, | |
| "mean_token_accuracy": 0.7079687170684338, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 0.9451985922574158, | |
| "grad_norm": 1.636551856994629, | |
| "learning_rate": 5.720338983050848e-06, | |
| "loss": 16.785, | |
| "mean_token_accuracy": 0.7054948009550571, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.9485503603150662, | |
| "grad_norm": 1.6171858310699463, | |
| "learning_rate": 5.367231638418079e-06, | |
| "loss": 16.6877, | |
| "mean_token_accuracy": 0.7033485405147075, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 0.9519021283727166, | |
| "grad_norm": 1.6833641529083252, | |
| "learning_rate": 5.014124293785311e-06, | |
| "loss": 16.5803, | |
| "mean_token_accuracy": 0.706027402728796, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.9552538964303671, | |
| "grad_norm": 2.0238494873046875, | |
| "learning_rate": 4.6610169491525425e-06, | |
| "loss": 16.4305, | |
| "mean_token_accuracy": 0.7110757566988468, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 0.9586056644880174, | |
| "grad_norm": 1.5262683629989624, | |
| "learning_rate": 4.307909604519774e-06, | |
| "loss": 16.105, | |
| "mean_token_accuracy": 0.7173994883894921, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.9619574325456678, | |
| "grad_norm": 1.6822128295898438, | |
| "learning_rate": 3.954802259887006e-06, | |
| "loss": 17.0064, | |
| "mean_token_accuracy": 0.7033144362270832, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 0.9653092006033183, | |
| "grad_norm": 2.1382946968078613, | |
| "learning_rate": 3.6016949152542374e-06, | |
| "loss": 16.6567, | |
| "mean_token_accuracy": 0.7085098147392273, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.9686609686609686, | |
| "grad_norm": 1.6137080192565918, | |
| "learning_rate": 3.248587570621469e-06, | |
| "loss": 16.4193, | |
| "mean_token_accuracy": 0.7077061600983143, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 0.9720127367186191, | |
| "grad_norm": 1.6318018436431885, | |
| "learning_rate": 2.8954802259887007e-06, | |
| "loss": 16.5904, | |
| "mean_token_accuracy": 0.7037704810500145, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.9753645047762695, | |
| "grad_norm": 1.6723519563674927, | |
| "learning_rate": 2.5423728813559323e-06, | |
| "loss": 16.351, | |
| "mean_token_accuracy": 0.715372896194458, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 0.9787162728339199, | |
| "grad_norm": 2.6915719509124756, | |
| "learning_rate": 2.189265536723164e-06, | |
| "loss": 16.5627, | |
| "mean_token_accuracy": 0.706637478619814, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.9820680408915703, | |
| "grad_norm": 1.9349390268325806, | |
| "learning_rate": 1.8361581920903956e-06, | |
| "loss": 16.7821, | |
| "mean_token_accuracy": 0.7010103747248649, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 0.9854198089492208, | |
| "grad_norm": 1.6685172319412231, | |
| "learning_rate": 1.4830508474576273e-06, | |
| "loss": 16.7016, | |
| "mean_token_accuracy": 0.7086931586265564, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.9887715770068711, | |
| "grad_norm": 1.7148998975753784, | |
| "learning_rate": 1.129943502824859e-06, | |
| "loss": 16.4809, | |
| "mean_token_accuracy": 0.7131018862128258, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 0.9921233450645215, | |
| "grad_norm": 1.8873836994171143, | |
| "learning_rate": 7.768361581920904e-07, | |
| "loss": 16.5183, | |
| "mean_token_accuracy": 0.7111847102642059, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.995475113122172, | |
| "grad_norm": 1.8390552997589111, | |
| "learning_rate": 4.2372881355932204e-07, | |
| "loss": 16.1742, | |
| "mean_token_accuracy": 0.7128683432936669, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 0.9988268811798223, | |
| "grad_norm": 1.8799461126327515, | |
| "learning_rate": 7.062146892655368e-08, | |
| "loss": 17.1633, | |
| "mean_token_accuracy": 0.6963419988751411, | |
| "step": 1490 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1491, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 750, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.5012213304045076e+19, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |