| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9995732696082615, | |
| "eval_steps": 100, | |
| "global_step": 732, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0013655372535631989, | |
| "grad_norm": 12.694538866354412, | |
| "learning_rate": 2.702702702702703e-07, | |
| "loss": 1.349, | |
| "mean_token_accuracy": 0.6731206055166955, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.0027310745071263977, | |
| "grad_norm": 14.28682858099926, | |
| "learning_rate": 5.405405405405406e-07, | |
| "loss": 1.3695, | |
| "mean_token_accuracy": 0.6685968866221867, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.004096611760689596, | |
| "grad_norm": 12.153556394325475, | |
| "learning_rate": 8.108108108108109e-07, | |
| "loss": 1.2534, | |
| "mean_token_accuracy": 0.6932581184973134, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.0054621490142527955, | |
| "grad_norm": 10.014906326374183, | |
| "learning_rate": 1.0810810810810812e-06, | |
| "loss": 1.1814, | |
| "mean_token_accuracy": 0.7059116849514391, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.006827686267815994, | |
| "grad_norm": 13.154435035689092, | |
| "learning_rate": 1.3513513513513515e-06, | |
| "loss": 1.3152, | |
| "mean_token_accuracy": 0.6801386295402803, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.008193223521379193, | |
| "grad_norm": 12.656442197272979, | |
| "learning_rate": 1.6216216216216219e-06, | |
| "loss": 1.3519, | |
| "mean_token_accuracy": 0.6725742496834655, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.009558760774942391, | |
| "grad_norm": 9.154457220968185, | |
| "learning_rate": 1.8918918918918922e-06, | |
| "loss": 1.3201, | |
| "mean_token_accuracy": 0.6776771426562855, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.010924298028505591, | |
| "grad_norm": 12.926796361133281, | |
| "learning_rate": 2.1621621621621623e-06, | |
| "loss": 1.3668, | |
| "mean_token_accuracy": 0.6739682822033727, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.01228983528206879, | |
| "grad_norm": 13.702550349182166, | |
| "learning_rate": 2.432432432432433e-06, | |
| "loss": 1.3398, | |
| "mean_token_accuracy": 0.675676714000192, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.013655372535631987, | |
| "grad_norm": 12.100638571387348, | |
| "learning_rate": 2.702702702702703e-06, | |
| "loss": 1.237, | |
| "mean_token_accuracy": 0.6897704789682599, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.015020909789195187, | |
| "grad_norm": 9.02772882068927, | |
| "learning_rate": 2.9729729729729736e-06, | |
| "loss": 1.237, | |
| "mean_token_accuracy": 0.6967972223171133, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.016386447042758386, | |
| "grad_norm": 7.751826273673437, | |
| "learning_rate": 3.2432432432432437e-06, | |
| "loss": 1.2499, | |
| "mean_token_accuracy": 0.6873925007889158, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.017751984296321584, | |
| "grad_norm": 14.605567306288991, | |
| "learning_rate": 3.513513513513514e-06, | |
| "loss": 1.1656, | |
| "mean_token_accuracy": 0.702033566686862, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.019117521549884782, | |
| "grad_norm": 16.69580523349115, | |
| "learning_rate": 3.7837837837837844e-06, | |
| "loss": 1.1057, | |
| "mean_token_accuracy": 0.7166825564328566, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.02048305880344798, | |
| "grad_norm": 4.277348708917889, | |
| "learning_rate": 4.0540540540540545e-06, | |
| "loss": 1.1453, | |
| "mean_token_accuracy": 0.7064730765093167, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.021848596057011182, | |
| "grad_norm": 5.501675262398263, | |
| "learning_rate": 4.324324324324325e-06, | |
| "loss": 1.0609, | |
| "mean_token_accuracy": 0.7263837797527011, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.02321413331057438, | |
| "grad_norm": 4.594618515621041, | |
| "learning_rate": 4.594594594594596e-06, | |
| "loss": 1.1575, | |
| "mean_token_accuracy": 0.6986264355169661, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.02457967056413758, | |
| "grad_norm": 4.370254196578077, | |
| "learning_rate": 4.864864864864866e-06, | |
| "loss": 0.9763, | |
| "mean_token_accuracy": 0.7371238560229174, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.025945207817700777, | |
| "grad_norm": 12.420426104801459, | |
| "learning_rate": 5.135135135135135e-06, | |
| "loss": 0.9947, | |
| "mean_token_accuracy": 0.7336301337158547, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.027310745071263975, | |
| "grad_norm": 7.306932902857305, | |
| "learning_rate": 5.405405405405406e-06, | |
| "loss": 1.0089, | |
| "mean_token_accuracy": 0.7298717119207753, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.028676282324827173, | |
| "grad_norm": 3.792544000354483, | |
| "learning_rate": 5.675675675675676e-06, | |
| "loss": 0.9992, | |
| "mean_token_accuracy": 0.7355598065427686, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.030041819578390375, | |
| "grad_norm": 5.599819351248164, | |
| "learning_rate": 5.945945945945947e-06, | |
| "loss": 0.938, | |
| "mean_token_accuracy": 0.7446242908841856, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.03140735683195357, | |
| "grad_norm": 3.2303878160644106, | |
| "learning_rate": 6.2162162162162164e-06, | |
| "loss": 0.842, | |
| "mean_token_accuracy": 0.7625513938625904, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.03277289408551677, | |
| "grad_norm": 7.560111146519516, | |
| "learning_rate": 6.486486486486487e-06, | |
| "loss": 0.8541, | |
| "mean_token_accuracy": 0.7603489427240392, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.03413843133907997, | |
| "grad_norm": 8.994000664594445, | |
| "learning_rate": 6.7567567567567575e-06, | |
| "loss": 0.8033, | |
| "mean_token_accuracy": 0.7710385708849239, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.03550396859264317, | |
| "grad_norm": 11.174112139759627, | |
| "learning_rate": 7.027027027027028e-06, | |
| "loss": 0.7972, | |
| "mean_token_accuracy": 0.7709743153956785, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.036869505846206366, | |
| "grad_norm": 4.4474620188546465, | |
| "learning_rate": 7.297297297297298e-06, | |
| "loss": 0.7779, | |
| "mean_token_accuracy": 0.7762167832548105, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.038235043099769564, | |
| "grad_norm": 6.420798005839511, | |
| "learning_rate": 7.567567567567569e-06, | |
| "loss": 0.7364, | |
| "mean_token_accuracy": 0.7872430296163124, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.03960058035333276, | |
| "grad_norm": 3.505076403243893, | |
| "learning_rate": 7.837837837837838e-06, | |
| "loss": 0.7366, | |
| "mean_token_accuracy": 0.7846892672832243, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.04096611760689596, | |
| "grad_norm": 4.257419834855069, | |
| "learning_rate": 8.108108108108109e-06, | |
| "loss": 0.7276, | |
| "mean_token_accuracy": 0.7832414543493696, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.04233165486045916, | |
| "grad_norm": 1.8596248429959361, | |
| "learning_rate": 8.378378378378378e-06, | |
| "loss": 0.6917, | |
| "mean_token_accuracy": 0.7949227280414939, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.043697192114022364, | |
| "grad_norm": 5.998640085410845, | |
| "learning_rate": 8.64864864864865e-06, | |
| "loss": 0.74, | |
| "mean_token_accuracy": 0.7826185949965834, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.04506272936758556, | |
| "grad_norm": 1.6136645601879687, | |
| "learning_rate": 8.91891891891892e-06, | |
| "loss": 0.682, | |
| "mean_token_accuracy": 0.7971229334178439, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.04642826662114876, | |
| "grad_norm": 4.813218130166523, | |
| "learning_rate": 9.189189189189191e-06, | |
| "loss": 0.6818, | |
| "mean_token_accuracy": 0.7956781396448597, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.04779380387471196, | |
| "grad_norm": 1.6648957199273453, | |
| "learning_rate": 9.45945945945946e-06, | |
| "loss": 0.6536, | |
| "mean_token_accuracy": 0.8039919273243677, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.04915934112827516, | |
| "grad_norm": 2.727767929908723, | |
| "learning_rate": 9.729729729729732e-06, | |
| "loss": 0.5991, | |
| "mean_token_accuracy": 0.8157655755998052, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.050524878381838355, | |
| "grad_norm": 3.307683813511232, | |
| "learning_rate": 1e-05, | |
| "loss": 0.6039, | |
| "mean_token_accuracy": 0.8131568470367118, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.05189041563540155, | |
| "grad_norm": 1.6551722305811238, | |
| "learning_rate": 1.027027027027027e-05, | |
| "loss": 0.6005, | |
| "mean_token_accuracy": 0.8140848694651097, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.05325595288896475, | |
| "grad_norm": 1.447375367061217, | |
| "learning_rate": 1.0540540540540541e-05, | |
| "loss": 0.6002, | |
| "mean_token_accuracy": 0.8150523180034766, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.05462149014252795, | |
| "grad_norm": 1.072024342448014, | |
| "learning_rate": 1.0810810810810812e-05, | |
| "loss": 0.6008, | |
| "mean_token_accuracy": 0.8159713680250943, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.05598702739609115, | |
| "grad_norm": 0.7603023981378255, | |
| "learning_rate": 1.1081081081081081e-05, | |
| "loss": 0.5885, | |
| "mean_token_accuracy": 0.8169645259893636, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.057352564649654346, | |
| "grad_norm": 0.9581646192295539, | |
| "learning_rate": 1.1351351351351352e-05, | |
| "loss": 0.561, | |
| "mean_token_accuracy": 0.8244735432096343, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.058718101903217544, | |
| "grad_norm": 1.2036888425863506, | |
| "learning_rate": 1.1621621621621622e-05, | |
| "loss": 0.5912, | |
| "mean_token_accuracy": 0.8190549345072387, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.06008363915678075, | |
| "grad_norm": 0.7718687528534887, | |
| "learning_rate": 1.1891891891891894e-05, | |
| "loss": 0.5749, | |
| "mean_token_accuracy": 0.819959293524251, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.06144917641034395, | |
| "grad_norm": 1.682073620529749, | |
| "learning_rate": 1.2162162162162164e-05, | |
| "loss": 0.5584, | |
| "mean_token_accuracy": 0.825639299275228, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.06281471366390715, | |
| "grad_norm": 0.7001166412928403, | |
| "learning_rate": 1.2432432432432433e-05, | |
| "loss": 0.5581, | |
| "mean_token_accuracy": 0.8246738540448362, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.06418025091747034, | |
| "grad_norm": 1.0537050693773666, | |
| "learning_rate": 1.2702702702702702e-05, | |
| "loss": 0.5551, | |
| "mean_token_accuracy": 0.8259298934704246, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.06554578817103354, | |
| "grad_norm": 0.7920322967117208, | |
| "learning_rate": 1.2972972972972975e-05, | |
| "loss": 0.5507, | |
| "mean_token_accuracy": 0.8265237150428488, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.06691132542459674, | |
| "grad_norm": 0.562181333813763, | |
| "learning_rate": 1.3243243243243244e-05, | |
| "loss": 0.5688, | |
| "mean_token_accuracy": 0.8204618404616847, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.06827686267815994, | |
| "grad_norm": 0.5763859640174397, | |
| "learning_rate": 1.3513513513513515e-05, | |
| "loss": 0.5127, | |
| "mean_token_accuracy": 0.8361940201766735, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.06964239993172314, | |
| "grad_norm": 0.8441858338744881, | |
| "learning_rate": 1.3783783783783784e-05, | |
| "loss": 0.5605, | |
| "mean_token_accuracy": 0.8224741152107983, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.07100793718528634, | |
| "grad_norm": 0.623060068151014, | |
| "learning_rate": 1.4054054054054055e-05, | |
| "loss": 0.5154, | |
| "mean_token_accuracy": 0.8358351656376076, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.07237347443884953, | |
| "grad_norm": 0.487364049768217, | |
| "learning_rate": 1.4324324324324326e-05, | |
| "loss": 0.5508, | |
| "mean_token_accuracy": 0.824997923142241, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.07373901169241273, | |
| "grad_norm": 0.4710640541449941, | |
| "learning_rate": 1.4594594594594596e-05, | |
| "loss": 0.5121, | |
| "mean_token_accuracy": 0.8361596705359315, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.07510454894597593, | |
| "grad_norm": 0.5307868639543387, | |
| "learning_rate": 1.4864864864864865e-05, | |
| "loss": 0.5388, | |
| "mean_token_accuracy": 0.8287927367224052, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.07647008619953913, | |
| "grad_norm": 0.502790636865642, | |
| "learning_rate": 1.5135135135135138e-05, | |
| "loss": 0.5002, | |
| "mean_token_accuracy": 0.8393840192332627, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.07783562345310233, | |
| "grad_norm": 0.43346690723829867, | |
| "learning_rate": 1.540540540540541e-05, | |
| "loss": 0.5125, | |
| "mean_token_accuracy": 0.8363432033977718, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.07920116070666552, | |
| "grad_norm": 0.4993605510803232, | |
| "learning_rate": 1.5675675675675676e-05, | |
| "loss": 0.5188, | |
| "mean_token_accuracy": 0.8324141327632871, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.08056669796022872, | |
| "grad_norm": 0.4277849657264807, | |
| "learning_rate": 1.5945945945945947e-05, | |
| "loss": 0.5099, | |
| "mean_token_accuracy": 0.8360334578638953, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.08193223521379192, | |
| "grad_norm": 0.4591464651505517, | |
| "learning_rate": 1.6216216216216218e-05, | |
| "loss": 0.5046, | |
| "mean_token_accuracy": 0.8377397858533894, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.08329777246735512, | |
| "grad_norm": 0.4510027457654597, | |
| "learning_rate": 1.648648648648649e-05, | |
| "loss": 0.511, | |
| "mean_token_accuracy": 0.8336544170905339, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.08466330972091832, | |
| "grad_norm": 0.4050262343002075, | |
| "learning_rate": 1.6756756756756757e-05, | |
| "loss": 0.5051, | |
| "mean_token_accuracy": 0.8365473990294894, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.08602884697448153, | |
| "grad_norm": 0.44615719951666044, | |
| "learning_rate": 1.7027027027027028e-05, | |
| "loss": 0.5265, | |
| "mean_token_accuracy": 0.8290703723078953, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.08739438422804473, | |
| "grad_norm": 0.38700566099417927, | |
| "learning_rate": 1.72972972972973e-05, | |
| "loss": 0.4888, | |
| "mean_token_accuracy": 0.8415804018657317, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.08875992148160793, | |
| "grad_norm": 0.45966773746839323, | |
| "learning_rate": 1.756756756756757e-05, | |
| "loss": 0.5034, | |
| "mean_token_accuracy": 0.8367421080881654, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.09012545873517112, | |
| "grad_norm": 0.3462174672127141, | |
| "learning_rate": 1.783783783783784e-05, | |
| "loss": 0.5056, | |
| "mean_token_accuracy": 0.8362969110791204, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.09149099598873432, | |
| "grad_norm": 0.5378368089497744, | |
| "learning_rate": 1.8108108108108108e-05, | |
| "loss": 0.4845, | |
| "mean_token_accuracy": 0.841530221609978, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.09285653324229752, | |
| "grad_norm": 0.3927908242857095, | |
| "learning_rate": 1.8378378378378383e-05, | |
| "loss": 0.5017, | |
| "mean_token_accuracy": 0.8354575522429637, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.09422207049586072, | |
| "grad_norm": 0.48195021375325325, | |
| "learning_rate": 1.864864864864865e-05, | |
| "loss": 0.4794, | |
| "mean_token_accuracy": 0.8440971672690992, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.09558760774942392, | |
| "grad_norm": 0.42743054642794015, | |
| "learning_rate": 1.891891891891892e-05, | |
| "loss": 0.5054, | |
| "mean_token_accuracy": 0.8351946171032623, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.09695314500298712, | |
| "grad_norm": 0.5073725399698784, | |
| "learning_rate": 1.918918918918919e-05, | |
| "loss": 0.4872, | |
| "mean_token_accuracy": 0.8416211405529895, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.09831868225655031, | |
| "grad_norm": 0.32802476428146243, | |
| "learning_rate": 1.9459459459459463e-05, | |
| "loss": 0.4873, | |
| "mean_token_accuracy": 0.8410772826528182, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.09968421951011351, | |
| "grad_norm": 0.44056399284971365, | |
| "learning_rate": 1.972972972972973e-05, | |
| "loss": 0.4698, | |
| "mean_token_accuracy": 0.8444900562879512, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.10104975676367671, | |
| "grad_norm": 0.46074182462916863, | |
| "learning_rate": 2e-05, | |
| "loss": 0.5052, | |
| "mean_token_accuracy": 0.8356767677438455, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.10241529401723991, | |
| "grad_norm": 0.43060352923187173, | |
| "learning_rate": 1.999988602302209e-05, | |
| "loss": 0.5034, | |
| "mean_token_accuracy": 0.8357036382011872, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.1037808312708031, | |
| "grad_norm": 0.5392177840776831, | |
| "learning_rate": 1.9999544094686517e-05, | |
| "loss": 0.464, | |
| "mean_token_accuracy": 0.8469077059106851, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.1051463685243663, | |
| "grad_norm": 0.4383769298059935, | |
| "learning_rate": 1.999897422278767e-05, | |
| "loss": 0.475, | |
| "mean_token_accuracy": 0.8436229064865175, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.1065119057779295, | |
| "grad_norm": 0.4904266229445523, | |
| "learning_rate": 1.9998176420316002e-05, | |
| "loss": 0.4853, | |
| "mean_token_accuracy": 0.8408816529254115, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.1078774430314927, | |
| "grad_norm": 0.36607390424605935, | |
| "learning_rate": 1.9997150705457738e-05, | |
| "loss": 0.4531, | |
| "mean_token_accuracy": 0.8487558375701372, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.1092429802850559, | |
| "grad_norm": 0.4551168695964319, | |
| "learning_rate": 1.9995897101594454e-05, | |
| "loss": 0.4876, | |
| "mean_token_accuracy": 0.8396327622489336, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.1106085175386191, | |
| "grad_norm": 0.42442283856925495, | |
| "learning_rate": 1.9994415637302545e-05, | |
| "loss": 0.4633, | |
| "mean_token_accuracy": 0.8483948181491869, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.1119740547921823, | |
| "grad_norm": 0.6393625908948564, | |
| "learning_rate": 1.999270634635258e-05, | |
| "loss": 0.4606, | |
| "mean_token_accuracy": 0.8486694050596694, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.1133395920457455, | |
| "grad_norm": 0.3938759824609981, | |
| "learning_rate": 1.9990769267708517e-05, | |
| "loss": 0.4703, | |
| "mean_token_accuracy": 0.843921182173024, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.11470512929930869, | |
| "grad_norm": 0.46323080288240137, | |
| "learning_rate": 1.998860444552683e-05, | |
| "loss": 0.4579, | |
| "mean_token_accuracy": 0.8487062802081777, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.11607066655287189, | |
| "grad_norm": 0.4100887075069859, | |
| "learning_rate": 1.99862119291555e-05, | |
| "loss": 0.4753, | |
| "mean_token_accuracy": 0.8426625139694884, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.11743620380643509, | |
| "grad_norm": 0.460266156523643, | |
| "learning_rate": 1.9983591773132885e-05, | |
| "loss": 0.4634, | |
| "mean_token_accuracy": 0.8479961635199413, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.11880174105999829, | |
| "grad_norm": 0.4093999187610122, | |
| "learning_rate": 1.998074403718647e-05, | |
| "loss": 0.481, | |
| "mean_token_accuracy": 0.8413903297828744, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.1201672783135615, | |
| "grad_norm": 0.4529979351460269, | |
| "learning_rate": 1.9977668786231536e-05, | |
| "loss": 0.4646, | |
| "mean_token_accuracy": 0.8439730016702047, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.1215328155671247, | |
| "grad_norm": 0.31940153601185445, | |
| "learning_rate": 1.997436609036963e-05, | |
| "loss": 0.4929, | |
| "mean_token_accuracy": 0.8389086531135729, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.1228983528206879, | |
| "grad_norm": 0.36879956082602044, | |
| "learning_rate": 1.997083602488702e-05, | |
| "loss": 0.4753, | |
| "mean_token_accuracy": 0.8442029592930739, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.1242638900742511, | |
| "grad_norm": 0.31625144322456566, | |
| "learning_rate": 1.9967078670252947e-05, | |
| "loss": 0.4809, | |
| "mean_token_accuracy": 0.8414532739667411, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.1256294273278143, | |
| "grad_norm": 0.37572795515302265, | |
| "learning_rate": 1.9963094112117786e-05, | |
| "loss": 0.4731, | |
| "mean_token_accuracy": 0.8458833864797164, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.1269949645813775, | |
| "grad_norm": 0.32729110107644405, | |
| "learning_rate": 1.995888244131113e-05, | |
| "loss": 0.4729, | |
| "mean_token_accuracy": 0.844289710669155, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.1283605018349407, | |
| "grad_norm": 0.4508003392275939, | |
| "learning_rate": 1.9954443753839666e-05, | |
| "loss": 0.4844, | |
| "mean_token_accuracy": 0.8410096036799376, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.12972603908850389, | |
| "grad_norm": 0.34367729445854445, | |
| "learning_rate": 1.994977815088504e-05, | |
| "loss": 0.4536, | |
| "mean_token_accuracy": 0.8496317958214457, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.13109157634206708, | |
| "grad_norm": 0.39368016078712464, | |
| "learning_rate": 1.994488573880152e-05, | |
| "loss": 0.4427, | |
| "mean_token_accuracy": 0.8527215734701595, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.13245711359563028, | |
| "grad_norm": 0.39400450065152637, | |
| "learning_rate": 1.9939766629113568e-05, | |
| "loss": 0.4718, | |
| "mean_token_accuracy": 0.8439487535684244, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.13382265084919348, | |
| "grad_norm": 0.3644413048898344, | |
| "learning_rate": 1.9934420938513313e-05, | |
| "loss": 0.4451, | |
| "mean_token_accuracy": 0.8515555292296904, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.13518818810275668, | |
| "grad_norm": 0.33820519416057315, | |
| "learning_rate": 1.9928848788857887e-05, | |
| "loss": 0.4687, | |
| "mean_token_accuracy": 0.8435826255747995, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.13655372535631988, | |
| "grad_norm": 0.35474640844985367, | |
| "learning_rate": 1.9923050307166655e-05, | |
| "loss": 0.4694, | |
| "mean_token_accuracy": 0.8443660349778408, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.13791926260988308, | |
| "grad_norm": 0.3787756690185754, | |
| "learning_rate": 1.9917025625618295e-05, | |
| "loss": 0.443, | |
| "mean_token_accuracy": 0.8515359704739787, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.13928479986344627, | |
| "grad_norm": 0.3362568869450033, | |
| "learning_rate": 1.9910774881547803e-05, | |
| "loss": 0.4697, | |
| "mean_token_accuracy": 0.8443949120016512, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.14065033711700947, | |
| "grad_norm": 0.33478608195399623, | |
| "learning_rate": 1.9904298217443366e-05, | |
| "loss": 0.4655, | |
| "mean_token_accuracy": 0.8454621512591025, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.14201587437057267, | |
| "grad_norm": 0.3157906568874303, | |
| "learning_rate": 1.9897595780943104e-05, | |
| "loss": 0.4549, | |
| "mean_token_accuracy": 0.8479587393969411, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.14338141162413587, | |
| "grad_norm": 0.33967710856761196, | |
| "learning_rate": 1.989066772483171e-05, | |
| "loss": 0.4584, | |
| "mean_token_accuracy": 0.8502480645030208, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.14474694887769907, | |
| "grad_norm": 0.3625458211243482, | |
| "learning_rate": 1.988351420703696e-05, | |
| "loss": 0.4449, | |
| "mean_token_accuracy": 0.8503154268653385, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.14611248613126226, | |
| "grad_norm": 0.33343733476340787, | |
| "learning_rate": 1.9876135390626123e-05, | |
| "loss": 0.4612, | |
| "mean_token_accuracy": 0.8469237180257693, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.14747802338482546, | |
| "grad_norm": 0.3402896753249496, | |
| "learning_rate": 1.986853144380224e-05, | |
| "loss": 0.4737, | |
| "mean_token_accuracy": 0.8415288812333515, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.14884356063838866, | |
| "grad_norm": 0.30321741460606055, | |
| "learning_rate": 1.9860702539900288e-05, | |
| "loss": 0.4604, | |
| "mean_token_accuracy": 0.8479057193176431, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.15020909789195186, | |
| "grad_norm": 0.34336551349752786, | |
| "learning_rate": 1.9852648857383224e-05, | |
| "loss": 0.4588, | |
| "mean_token_accuracy": 0.8476062469011421, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.15157463514551506, | |
| "grad_norm": 0.3096450131783167, | |
| "learning_rate": 1.984437057983793e-05, | |
| "loss": 0.4388, | |
| "mean_token_accuracy": 0.8534628120170084, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.15294017239907826, | |
| "grad_norm": 0.36874309540738714, | |
| "learning_rate": 1.9835867895971015e-05, | |
| "loss": 0.4403, | |
| "mean_token_accuracy": 0.8526718507556617, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.15430570965264145, | |
| "grad_norm": 0.3605796560605681, | |
| "learning_rate": 1.982714099960452e-05, | |
| "loss": 0.4622, | |
| "mean_token_accuracy": 0.846226495136954, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.15567124690620465, | |
| "grad_norm": 0.4627441704849731, | |
| "learning_rate": 1.981819008967151e-05, | |
| "loss": 0.4858, | |
| "mean_token_accuracy": 0.8394183495047153, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.15703678415976785, | |
| "grad_norm": 0.34714642141003565, | |
| "learning_rate": 1.9809015370211505e-05, | |
| "loss": 0.4667, | |
| "mean_token_accuracy": 0.8454976903304268, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.15840232141333105, | |
| "grad_norm": 0.34086378030044884, | |
| "learning_rate": 1.979961705036587e-05, | |
| "loss": 0.4483, | |
| "mean_token_accuracy": 0.8508435940254968, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.15976785866689425, | |
| "grad_norm": 0.4140107037865621, | |
| "learning_rate": 1.9789995344373027e-05, | |
| "loss": 0.4449, | |
| "mean_token_accuracy": 0.8518490808801514, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.16113339592045745, | |
| "grad_norm": 0.34331795050708586, | |
| "learning_rate": 1.978015047156356e-05, | |
| "loss": 0.4604, | |
| "mean_token_accuracy": 0.8451241258921095, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.16249893317402064, | |
| "grad_norm": 0.3380747151761631, | |
| "learning_rate": 1.977008265635525e-05, | |
| "loss": 0.4343, | |
| "mean_token_accuracy": 0.8551851359088176, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.16386447042758384, | |
| "grad_norm": 0.39246892074574696, | |
| "learning_rate": 1.9759792128247922e-05, | |
| "loss": 0.4566, | |
| "mean_token_accuracy": 0.8475478494544696, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.16523000768114704, | |
| "grad_norm": 0.3077478512701279, | |
| "learning_rate": 1.9749279121818235e-05, | |
| "loss": 0.4446, | |
| "mean_token_accuracy": 0.8506487627564587, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.16659554493471024, | |
| "grad_norm": 0.3128694161488127, | |
| "learning_rate": 1.9738543876714335e-05, | |
| "loss": 0.4566, | |
| "mean_token_accuracy": 0.8477825790742587, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.16796108218827344, | |
| "grad_norm": 0.33718369881101784, | |
| "learning_rate": 1.9727586637650373e-05, | |
| "loss": 0.4381, | |
| "mean_token_accuracy": 0.8531846568153524, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.16932661944183663, | |
| "grad_norm": 0.31111696829985575, | |
| "learning_rate": 1.9716407654400954e-05, | |
| "loss": 0.4715, | |
| "mean_token_accuracy": 0.8433811011221168, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.17069215669539983, | |
| "grad_norm": 0.3250655447526398, | |
| "learning_rate": 1.9705007181795416e-05, | |
| "loss": 0.4594, | |
| "mean_token_accuracy": 0.847703186503929, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.17205769394896306, | |
| "grad_norm": 0.31423640348811155, | |
| "learning_rate": 1.9693385479712047e-05, | |
| "loss": 0.442, | |
| "mean_token_accuracy": 0.8514727417072737, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.17342323120252626, | |
| "grad_norm": 0.35736412097514153, | |
| "learning_rate": 1.9681542813072147e-05, | |
| "loss": 0.4776, | |
| "mean_token_accuracy": 0.8411854317148191, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.17478876845608946, | |
| "grad_norm": 0.2902859082859034, | |
| "learning_rate": 1.9669479451833976e-05, | |
| "loss": 0.4388, | |
| "mean_token_accuracy": 0.8539477536048056, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.17615430570965265, | |
| "grad_norm": 0.31967638765399986, | |
| "learning_rate": 1.9657195670986638e-05, | |
| "loss": 0.4563, | |
| "mean_token_accuracy": 0.8475490129873101, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.17751984296321585, | |
| "grad_norm": 0.3019630526325269, | |
| "learning_rate": 1.964469175054377e-05, | |
| "loss": 0.4508, | |
| "mean_token_accuracy": 0.8500409132192751, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.17888538021677905, | |
| "grad_norm": 0.28471378493835015, | |
| "learning_rate": 1.963196797553718e-05, | |
| "loss": 0.4592, | |
| "mean_token_accuracy": 0.8465730185761348, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.18025091747034225, | |
| "grad_norm": 0.2976461319068471, | |
| "learning_rate": 1.961902463601036e-05, | |
| "loss": 0.4483, | |
| "mean_token_accuracy": 0.8504603311676011, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.18161645472390545, | |
| "grad_norm": 0.3420853043630677, | |
| "learning_rate": 1.9605862027011858e-05, | |
| "loss": 0.446, | |
| "mean_token_accuracy": 0.8522397868056077, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.18298199197746864, | |
| "grad_norm": 0.30068683893569925, | |
| "learning_rate": 1.959248044858854e-05, | |
| "loss": 0.4538, | |
| "mean_token_accuracy": 0.8479476016652441, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.18434752923103184, | |
| "grad_norm": 0.337968764863526, | |
| "learning_rate": 1.9578880205778793e-05, | |
| "loss": 0.432, | |
| "mean_token_accuracy": 0.8544472005484545, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.18571306648459504, | |
| "grad_norm": 0.3187881918567848, | |
| "learning_rate": 1.9565061608605526e-05, | |
| "loss": 0.4533, | |
| "mean_token_accuracy": 0.8479640485502516, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.18707860373815824, | |
| "grad_norm": 0.35269232923831895, | |
| "learning_rate": 1.9551024972069127e-05, | |
| "loss": 0.4509, | |
| "mean_token_accuracy": 0.8508397906934503, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.18844414099172144, | |
| "grad_norm": 0.3058471460955213, | |
| "learning_rate": 1.9536770616140277e-05, | |
| "loss": 0.4514, | |
| "mean_token_accuracy": 0.8494591928231272, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.18980967824528464, | |
| "grad_norm": 0.2923803706742491, | |
| "learning_rate": 1.9522298865752662e-05, | |
| "loss": 0.4495, | |
| "mean_token_accuracy": 0.8497101586020829, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.19117521549884783, | |
| "grad_norm": 0.35715418556836526, | |
| "learning_rate": 1.950761005079556e-05, | |
| "loss": 0.4779, | |
| "mean_token_accuracy": 0.8411692267142432, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.19254075275241103, | |
| "grad_norm": 0.27357483270279737, | |
| "learning_rate": 1.949270450610631e-05, | |
| "loss": 0.4116, | |
| "mean_token_accuracy": 0.8609656246183783, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.19390629000597423, | |
| "grad_norm": 0.3713092433646525, | |
| "learning_rate": 1.9477582571462706e-05, | |
| "loss": 0.4513, | |
| "mean_token_accuracy": 0.8494811710899883, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.19527182725953743, | |
| "grad_norm": 0.34866780781386153, | |
| "learning_rate": 1.9462244591575222e-05, | |
| "loss": 0.4447, | |
| "mean_token_accuracy": 0.8526829842758383, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.19663736451310063, | |
| "grad_norm": 0.2949930109269014, | |
| "learning_rate": 1.944669091607919e-05, | |
| "loss": 0.4273, | |
| "mean_token_accuracy": 0.8562389055851354, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.19800290176666382, | |
| "grad_norm": 0.374752731020472, | |
| "learning_rate": 1.9430921899526786e-05, | |
| "loss": 0.4478, | |
| "mean_token_accuracy": 0.8499277623582386, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.19936843902022702, | |
| "grad_norm": 0.3651538698048502, | |
| "learning_rate": 1.941493790137898e-05, | |
| "loss": 0.4442, | |
| "mean_token_accuracy": 0.8526649985454055, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.20073397627379022, | |
| "grad_norm": 0.36636578170953166, | |
| "learning_rate": 1.9398739285997342e-05, | |
| "loss": 0.4485, | |
| "mean_token_accuracy": 0.8504179631544849, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.20209951352735342, | |
| "grad_norm": 0.40761326956885474, | |
| "learning_rate": 1.9382326422635705e-05, | |
| "loss": 0.4642, | |
| "mean_token_accuracy": 0.8452167499746224, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.20346505078091662, | |
| "grad_norm": 0.31422691233021155, | |
| "learning_rate": 1.936569968543179e-05, | |
| "loss": 0.4368, | |
| "mean_token_accuracy": 0.8547842065414144, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.20483058803447982, | |
| "grad_norm": 0.35446348871617744, | |
| "learning_rate": 1.934885945339865e-05, | |
| "loss": 0.4402, | |
| "mean_token_accuracy": 0.8503691031564117, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.20619612528804301, | |
| "grad_norm": 0.32285449024202295, | |
| "learning_rate": 1.9331806110416027e-05, | |
| "loss": 0.4531, | |
| "mean_token_accuracy": 0.8497348879856765, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.2075616625416062, | |
| "grad_norm": 0.2974246522701837, | |
| "learning_rate": 1.9314540045221628e-05, | |
| "loss": 0.4292, | |
| "mean_token_accuracy": 0.8566265295867389, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.2089271997951694, | |
| "grad_norm": 0.34647196924654977, | |
| "learning_rate": 1.9297061651402237e-05, | |
| "loss": 0.4326, | |
| "mean_token_accuracy": 0.8539450355140382, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.2102927370487326, | |
| "grad_norm": 0.386258719161046, | |
| "learning_rate": 1.927937132738476e-05, | |
| "loss": 0.4421, | |
| "mean_token_accuracy": 0.8532191960011701, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.2116582743022958, | |
| "grad_norm": 0.34454258298504326, | |
| "learning_rate": 1.9261469476427122e-05, | |
| "loss": 0.4258, | |
| "mean_token_accuracy": 0.8568923616497963, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.213023811555859, | |
| "grad_norm": 0.3541361157616604, | |
| "learning_rate": 1.92433565066091e-05, | |
| "loss": 0.4529, | |
| "mean_token_accuracy": 0.8479736960845543, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.2143893488094222, | |
| "grad_norm": 0.35191182105790164, | |
| "learning_rate": 1.922503283082301e-05, | |
| "loss": 0.4202, | |
| "mean_token_accuracy": 0.8578095481186783, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.2157548860629854, | |
| "grad_norm": 0.42204829019945844, | |
| "learning_rate": 1.920649886676429e-05, | |
| "loss": 0.4753, | |
| "mean_token_accuracy": 0.8431488494361127, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.2171204233165486, | |
| "grad_norm": 0.282563065166861, | |
| "learning_rate": 1.9187755036921976e-05, | |
| "loss": 0.4481, | |
| "mean_token_accuracy": 0.8492420629486095, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.2184859605701118, | |
| "grad_norm": 0.3294030087715004, | |
| "learning_rate": 1.916880176856909e-05, | |
| "loss": 0.4239, | |
| "mean_token_accuracy": 0.856072747237234, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.219851497823675, | |
| "grad_norm": 0.31721191426888634, | |
| "learning_rate": 1.914963949375288e-05, | |
| "loss": 0.4393, | |
| "mean_token_accuracy": 0.8534969611431503, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.2212170350772382, | |
| "grad_norm": 0.34061976253510645, | |
| "learning_rate": 1.9130268649284982e-05, | |
| "loss": 0.4495, | |
| "mean_token_accuracy": 0.8489997065725852, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.2225825723308014, | |
| "grad_norm": 0.319599698534074, | |
| "learning_rate": 1.9110689676731454e-05, | |
| "loss": 0.4524, | |
| "mean_token_accuracy": 0.8484242719179398, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.2239481095843646, | |
| "grad_norm": 0.38523458399821875, | |
| "learning_rate": 1.909090302240273e-05, | |
| "loss": 0.4459, | |
| "mean_token_accuracy": 0.8499845577594263, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.2253136468379278, | |
| "grad_norm": 0.3082646414065079, | |
| "learning_rate": 1.907090913734341e-05, | |
| "loss": 0.4228, | |
| "mean_token_accuracy": 0.8565411948866354, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.226679184091491, | |
| "grad_norm": 0.4281504609323445, | |
| "learning_rate": 1.905070847732202e-05, | |
| "loss": 0.4231, | |
| "mean_token_accuracy": 0.8567314407107315, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.22804472134505419, | |
| "grad_norm": 0.3122301354550845, | |
| "learning_rate": 1.9030301502820597e-05, | |
| "loss": 0.4406, | |
| "mean_token_accuracy": 0.8526059868064779, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.22941025859861738, | |
| "grad_norm": 0.3600671344222034, | |
| "learning_rate": 1.900968867902419e-05, | |
| "loss": 0.4423, | |
| "mean_token_accuracy": 0.8509005211327412, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.23077579585218058, | |
| "grad_norm": 0.3335012451792573, | |
| "learning_rate": 1.8988870475810284e-05, | |
| "loss": 0.4323, | |
| "mean_token_accuracy": 0.8542063476327129, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.23214133310574378, | |
| "grad_norm": 0.3416637272262734, | |
| "learning_rate": 1.896784736773805e-05, | |
| "loss": 0.4544, | |
| "mean_token_accuracy": 0.8489287380113998, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.23350687035930698, | |
| "grad_norm": 0.4300545403831784, | |
| "learning_rate": 1.894661983403755e-05, | |
| "loss": 0.441, | |
| "mean_token_accuracy": 0.8535810843521906, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.23487240761287018, | |
| "grad_norm": 0.3258898174006088, | |
| "learning_rate": 1.8925188358598815e-05, | |
| "loss": 0.4376, | |
| "mean_token_accuracy": 0.8517990481723685, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.23623794486643337, | |
| "grad_norm": 0.2988219576519335, | |
| "learning_rate": 1.8903553429960803e-05, | |
| "loss": 0.4694, | |
| "mean_token_accuracy": 0.8419916214618768, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.23760348211999657, | |
| "grad_norm": 0.32171167299283115, | |
| "learning_rate": 1.8881715541300278e-05, | |
| "loss": 0.4412, | |
| "mean_token_accuracy": 0.8526490817200958, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.23896901937355977, | |
| "grad_norm": 0.36549760624914024, | |
| "learning_rate": 1.885967519042054e-05, | |
| "loss": 0.4575, | |
| "mean_token_accuracy": 0.8467209578442905, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.240334556627123, | |
| "grad_norm": 0.3206347251456072, | |
| "learning_rate": 1.8837432879740113e-05, | |
| "loss": 0.4253, | |
| "mean_token_accuracy": 0.8562328690343533, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.2417000938806862, | |
| "grad_norm": 0.36277383760815624, | |
| "learning_rate": 1.881498911628127e-05, | |
| "loss": 0.4317, | |
| "mean_token_accuracy": 0.8544379338426664, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.2430656311342494, | |
| "grad_norm": 0.3634699354146494, | |
| "learning_rate": 1.879234441165847e-05, | |
| "loss": 0.4257, | |
| "mean_token_accuracy": 0.8562673113043472, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.2444311683878126, | |
| "grad_norm": 0.3364593897580825, | |
| "learning_rate": 1.8769499282066716e-05, | |
| "loss": 0.4288, | |
| "mean_token_accuracy": 0.8545978179267162, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.2457967056413758, | |
| "grad_norm": 0.36736450922616337, | |
| "learning_rate": 1.8746454248269777e-05, | |
| "loss": 0.4547, | |
| "mean_token_accuracy": 0.8489931015745901, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.247162242894939, | |
| "grad_norm": 0.2842529030222029, | |
| "learning_rate": 1.872320983558831e-05, | |
| "loss": 0.4357, | |
| "mean_token_accuracy": 0.8537670434938108, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.2485277801485022, | |
| "grad_norm": 0.3503501258863951, | |
| "learning_rate": 1.8699766573887902e-05, | |
| "loss": 0.4275, | |
| "mean_token_accuracy": 0.8555725755068402, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.24989331740206538, | |
| "grad_norm": 0.32115728482149525, | |
| "learning_rate": 1.867612499756697e-05, | |
| "loss": 0.4349, | |
| "mean_token_accuracy": 0.8538767592466098, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.2512588546556286, | |
| "grad_norm": 0.37284059913436857, | |
| "learning_rate": 1.8652285645544602e-05, | |
| "loss": 0.4229, | |
| "mean_token_accuracy": 0.8565206996350189, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.2526243919091918, | |
| "grad_norm": 0.27914377362605364, | |
| "learning_rate": 1.862824906124826e-05, | |
| "loss": 0.4389, | |
| "mean_token_accuracy": 0.8533956082213735, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.253989929162755, | |
| "grad_norm": 0.33047332583495903, | |
| "learning_rate": 1.8604015792601395e-05, | |
| "loss": 0.4259, | |
| "mean_token_accuracy": 0.8587885669486044, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.2553554664163182, | |
| "grad_norm": 0.3098565583156071, | |
| "learning_rate": 1.8579586392010943e-05, | |
| "loss": 0.4725, | |
| "mean_token_accuracy": 0.841463056808317, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.2567210036698814, | |
| "grad_norm": 0.32188400036801423, | |
| "learning_rate": 1.855496141635476e-05, | |
| "loss": 0.4488, | |
| "mean_token_accuracy": 0.8494999156308464, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.2580865409234446, | |
| "grad_norm": 0.3130726973042191, | |
| "learning_rate": 1.8530141426968905e-05, | |
| "loss": 0.4278, | |
| "mean_token_accuracy": 0.8566668881226269, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.25945207817700777, | |
| "grad_norm": 0.3957651044446383, | |
| "learning_rate": 1.850512698963485e-05, | |
| "loss": 0.4261, | |
| "mean_token_accuracy": 0.8556154103093908, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.26081761543057097, | |
| "grad_norm": 0.3031069025669304, | |
| "learning_rate": 1.8479918674566602e-05, | |
| "loss": 0.4329, | |
| "mean_token_accuracy": 0.8541440562465102, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.26218315268413417, | |
| "grad_norm": 0.3643754085496684, | |
| "learning_rate": 1.8454517056397663e-05, | |
| "loss": 0.4324, | |
| "mean_token_accuracy": 0.855023699771463, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.26354868993769737, | |
| "grad_norm": 0.3003852707955295, | |
| "learning_rate": 1.842892271416797e-05, | |
| "loss": 0.4261, | |
| "mean_token_accuracy": 0.8553599004543545, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.26491422719126057, | |
| "grad_norm": 0.34484824975763084, | |
| "learning_rate": 1.8403136231310686e-05, | |
| "loss": 0.4252, | |
| "mean_token_accuracy": 0.8566094556118384, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.26627976444482376, | |
| "grad_norm": 0.3248230314864982, | |
| "learning_rate": 1.8377158195638877e-05, | |
| "loss": 0.4411, | |
| "mean_token_accuracy": 0.850191582672607, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.26764530169838696, | |
| "grad_norm": 0.3425796212560859, | |
| "learning_rate": 1.8350989199332156e-05, | |
| "loss": 0.4467, | |
| "mean_token_accuracy": 0.8491048527905652, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.26901083895195016, | |
| "grad_norm": 0.34768568039417325, | |
| "learning_rate": 1.8324629838923132e-05, | |
| "loss": 0.4421, | |
| "mean_token_accuracy": 0.8511677206305359, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.27037637620551336, | |
| "grad_norm": 0.34080656327066544, | |
| "learning_rate": 1.829808071528386e-05, | |
| "loss": 0.4281, | |
| "mean_token_accuracy": 0.8569207458025948, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.27174191345907656, | |
| "grad_norm": 0.3367911094729273, | |
| "learning_rate": 1.8271342433612114e-05, | |
| "loss": 0.4395, | |
| "mean_token_accuracy": 0.8523639031650138, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.27310745071263975, | |
| "grad_norm": 0.3151512003460036, | |
| "learning_rate": 1.8244415603417603e-05, | |
| "loss": 0.4348, | |
| "mean_token_accuracy": 0.8558035079566513, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.27447298796620295, | |
| "grad_norm": 0.3220180875156976, | |
| "learning_rate": 1.8217300838508075e-05, | |
| "loss": 0.403, | |
| "mean_token_accuracy": 0.8631310614267218, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.27583852521976615, | |
| "grad_norm": 0.28929076526601605, | |
| "learning_rate": 1.8189998756975318e-05, | |
| "loss": 0.4282, | |
| "mean_token_accuracy": 0.8555874931968974, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.27720406247332935, | |
| "grad_norm": 0.36330635065555233, | |
| "learning_rate": 1.8162509981181084e-05, | |
| "loss": 0.4479, | |
| "mean_token_accuracy": 0.8496229750925639, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.27856959972689255, | |
| "grad_norm": 0.27638019683036386, | |
| "learning_rate": 1.813483513774289e-05, | |
| "loss": 0.436, | |
| "mean_token_accuracy": 0.8529176942517751, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.27993513698045575, | |
| "grad_norm": 0.35506296866931764, | |
| "learning_rate": 1.8106974857519737e-05, | |
| "loss": 0.4426, | |
| "mean_token_accuracy": 0.8514268242896852, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.28130067423401894, | |
| "grad_norm": 0.3286137389758072, | |
| "learning_rate": 1.807892977559774e-05, | |
| "loss": 0.4417, | |
| "mean_token_accuracy": 0.8510960913771437, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.28266621148758214, | |
| "grad_norm": 0.25518635308883814, | |
| "learning_rate": 1.8050700531275632e-05, | |
| "loss": 0.418, | |
| "mean_token_accuracy": 0.8594128726528723, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.28403174874114534, | |
| "grad_norm": 0.34031577918112327, | |
| "learning_rate": 1.80222877680502e-05, | |
| "loss": 0.421, | |
| "mean_token_accuracy": 0.8569013665570591, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.28539728599470854, | |
| "grad_norm": 0.39038645873424477, | |
| "learning_rate": 1.799369213360163e-05, | |
| "loss": 0.4389, | |
| "mean_token_accuracy": 0.851740201076106, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.28676282324827174, | |
| "grad_norm": 0.34488640211714505, | |
| "learning_rate": 1.7964914279778716e-05, | |
| "loss": 0.4449, | |
| "mean_token_accuracy": 0.8504230733766642, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.28812836050183493, | |
| "grad_norm": 0.2935740182091683, | |
| "learning_rate": 1.7935954862584018e-05, | |
| "loss": 0.4321, | |
| "mean_token_accuracy": 0.8557445828098919, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.28949389775539813, | |
| "grad_norm": 0.3228921123428817, | |
| "learning_rate": 1.7906814542158913e-05, | |
| "loss": 0.4305, | |
| "mean_token_accuracy": 0.8537493570872144, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.29085943500896133, | |
| "grad_norm": 0.28556153848386795, | |
| "learning_rate": 1.7877493982768527e-05, | |
| "loss": 0.444, | |
| "mean_token_accuracy": 0.8495622520508093, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.29222497226252453, | |
| "grad_norm": 0.31694287799599025, | |
| "learning_rate": 1.7847993852786612e-05, | |
| "loss": 0.4461, | |
| "mean_token_accuracy": 0.8494224622463822, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.2935905095160877, | |
| "grad_norm": 0.3092018814084449, | |
| "learning_rate": 1.78183148246803e-05, | |
| "loss": 0.4301, | |
| "mean_token_accuracy": 0.8543694459213177, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.2949560467696509, | |
| "grad_norm": 0.2902008097484682, | |
| "learning_rate": 1.7788457574994777e-05, | |
| "loss": 0.4151, | |
| "mean_token_accuracy": 0.8582394007580811, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.2963215840232141, | |
| "grad_norm": 0.3328026971189236, | |
| "learning_rate": 1.775842278433786e-05, | |
| "loss": 0.4255, | |
| "mean_token_accuracy": 0.8564833688598898, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.2976871212767773, | |
| "grad_norm": 0.27435410637394864, | |
| "learning_rate": 1.772821113736449e-05, | |
| "loss": 0.4262, | |
| "mean_token_accuracy": 0.8559848881104419, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.2990526585303405, | |
| "grad_norm": 0.3632335496079091, | |
| "learning_rate": 1.76978233227611e-05, | |
| "loss": 0.4107, | |
| "mean_token_accuracy": 0.8612579363381194, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.3004181957839037, | |
| "grad_norm": 0.3167909582527359, | |
| "learning_rate": 1.7667260033229953e-05, | |
| "loss": 0.4342, | |
| "mean_token_accuracy": 0.8534075264867609, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.3017837330374669, | |
| "grad_norm": 0.31022395154637067, | |
| "learning_rate": 1.7636521965473324e-05, | |
| "loss": 0.4295, | |
| "mean_token_accuracy": 0.8542319208452556, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.3031492702910301, | |
| "grad_norm": 0.33282321447951396, | |
| "learning_rate": 1.760560982017762e-05, | |
| "loss": 0.4529, | |
| "mean_token_accuracy": 0.8473886646532048, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.3045148075445933, | |
| "grad_norm": 0.3703839580647503, | |
| "learning_rate": 1.7574524301997425e-05, | |
| "loss": 0.4418, | |
| "mean_token_accuracy": 0.8520835295773124, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.3058803447981565, | |
| "grad_norm": 0.29499276522228773, | |
| "learning_rate": 1.7543266119539424e-05, | |
| "loss": 0.4222, | |
| "mean_token_accuracy": 0.8572304378636991, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.3072458820517197, | |
| "grad_norm": 0.36404736107381586, | |
| "learning_rate": 1.751183598534625e-05, | |
| "loss": 0.417, | |
| "mean_token_accuracy": 0.8592393267568642, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.3086114193052829, | |
| "grad_norm": 0.31900880116039887, | |
| "learning_rate": 1.7480234615880247e-05, | |
| "loss": 0.4221, | |
| "mean_token_accuracy": 0.8570558484506429, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.3099769565588461, | |
| "grad_norm": 0.39339547941721115, | |
| "learning_rate": 1.7448462731507133e-05, | |
| "loss": 0.4351, | |
| "mean_token_accuracy": 0.8529953241096725, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.3113424938124093, | |
| "grad_norm": 0.38617429480416876, | |
| "learning_rate": 1.7416521056479577e-05, | |
| "loss": 0.4352, | |
| "mean_token_accuracy": 0.8529279155103227, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.3127080310659725, | |
| "grad_norm": 0.3239118787173082, | |
| "learning_rate": 1.7384410318920698e-05, | |
| "loss": 0.4387, | |
| "mean_token_accuracy": 0.8516904522581532, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.3140735683195357, | |
| "grad_norm": 0.3142116046355029, | |
| "learning_rate": 1.7352131250807466e-05, | |
| "loss": 0.4164, | |
| "mean_token_accuracy": 0.8592193212873477, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.3154391055730989, | |
| "grad_norm": 0.3581861836859027, | |
| "learning_rate": 1.7319684587954e-05, | |
| "loss": 0.4175, | |
| "mean_token_accuracy": 0.8585542096028473, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.3168046428266621, | |
| "grad_norm": 0.3011586770676647, | |
| "learning_rate": 1.728707106999482e-05, | |
| "loss": 0.4034, | |
| "mean_token_accuracy": 0.8631680964286239, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.3181701800802253, | |
| "grad_norm": 0.3215575152723449, | |
| "learning_rate": 1.725429144036797e-05, | |
| "loss": 0.4146, | |
| "mean_token_accuracy": 0.8578385115822528, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.3195357173337885, | |
| "grad_norm": 0.34263046362886657, | |
| "learning_rate": 1.722134644629807e-05, | |
| "loss": 0.4142, | |
| "mean_token_accuracy": 0.8586846673999058, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.3209012545873517, | |
| "grad_norm": 0.33342256217650906, | |
| "learning_rate": 1.7188236838779297e-05, | |
| "loss": 0.44, | |
| "mean_token_accuracy": 0.851783206820397, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.3222667918409149, | |
| "grad_norm": 0.3442067446188413, | |
| "learning_rate": 1.7154963372558246e-05, | |
| "loss": 0.4415, | |
| "mean_token_accuracy": 0.8494488715634262, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.3236323290944781, | |
| "grad_norm": 0.39286054840274137, | |
| "learning_rate": 1.712152680611675e-05, | |
| "loss": 0.4446, | |
| "mean_token_accuracy": 0.8519838146425381, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.3249978663480413, | |
| "grad_norm": 0.3707489334417565, | |
| "learning_rate": 1.7087927901654558e-05, | |
| "loss": 0.4018, | |
| "mean_token_accuracy": 0.8628199801152938, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.3263634036016045, | |
| "grad_norm": 0.32915317985042997, | |
| "learning_rate": 1.7054167425071995e-05, | |
| "loss": 0.4253, | |
| "mean_token_accuracy": 0.8583234741171639, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.3277289408551677, | |
| "grad_norm": 0.3582433826817124, | |
| "learning_rate": 1.702024614595248e-05, | |
| "loss": 0.4245, | |
| "mean_token_accuracy": 0.8573367967440888, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.3290944781087309, | |
| "grad_norm": 0.33842853808026, | |
| "learning_rate": 1.6986164837544987e-05, | |
| "loss": 0.4179, | |
| "mean_token_accuracy": 0.8586180123428547, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.3304600153622941, | |
| "grad_norm": 0.2994720944821025, | |
| "learning_rate": 1.6951924276746425e-05, | |
| "loss": 0.4362, | |
| "mean_token_accuracy": 0.8523116286165904, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.3318255526158573, | |
| "grad_norm": 0.3387086926170212, | |
| "learning_rate": 1.6917525244083918e-05, | |
| "loss": 0.4512, | |
| "mean_token_accuracy": 0.8479398316838589, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.3331910898694205, | |
| "grad_norm": 0.28444793823353953, | |
| "learning_rate": 1.688296852369703e-05, | |
| "loss": 0.4227, | |
| "mean_token_accuracy": 0.857218350086272, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.3345566271229837, | |
| "grad_norm": 0.32234365149946503, | |
| "learning_rate": 1.6848254903319866e-05, | |
| "loss": 0.433, | |
| "mean_token_accuracy": 0.8537039316837896, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.3359221643765469, | |
| "grad_norm": 0.3091069266476509, | |
| "learning_rate": 1.6813385174263137e-05, | |
| "loss": 0.4195, | |
| "mean_token_accuracy": 0.8581039754478647, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.33728770163011007, | |
| "grad_norm": 0.3640194501208851, | |
| "learning_rate": 1.677836013139611e-05, | |
| "loss": 0.4255, | |
| "mean_token_accuracy": 0.856087771880514, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.33865323888367327, | |
| "grad_norm": 0.2842912100028283, | |
| "learning_rate": 1.6743180573128494e-05, | |
| "loss": 0.4063, | |
| "mean_token_accuracy": 0.8612077361066686, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.34001877613723647, | |
| "grad_norm": 0.33927830284172017, | |
| "learning_rate": 1.6707847301392237e-05, | |
| "loss": 0.4328, | |
| "mean_token_accuracy": 0.854018667863303, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.34138431339079967, | |
| "grad_norm": 0.3380064283112177, | |
| "learning_rate": 1.6672361121623238e-05, | |
| "loss": 0.4196, | |
| "mean_token_accuracy": 0.858289578175996, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.3427498506443629, | |
| "grad_norm": 0.2921545005723523, | |
| "learning_rate": 1.6636722842743013e-05, | |
| "loss": 0.418, | |
| "mean_token_accuracy": 0.8576834083602248, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.3441153878979261, | |
| "grad_norm": 0.30696667285073703, | |
| "learning_rate": 1.660093327714022e-05, | |
| "loss": 0.4223, | |
| "mean_token_accuracy": 0.8571891667514102, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.3454809251514893, | |
| "grad_norm": 0.2692563224404543, | |
| "learning_rate": 1.656499324065217e-05, | |
| "loss": 0.424, | |
| "mean_token_accuracy": 0.8564762224897754, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.3468464624050525, | |
| "grad_norm": 0.2902797695424939, | |
| "learning_rate": 1.6528903552546207e-05, | |
| "loss": 0.4348, | |
| "mean_token_accuracy": 0.8531339042109244, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.3482119996586157, | |
| "grad_norm": 0.3054595022905831, | |
| "learning_rate": 1.6492665035501048e-05, | |
| "loss": 0.4304, | |
| "mean_token_accuracy": 0.8550912353650719, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.3495775369121789, | |
| "grad_norm": 0.30207364986816204, | |
| "learning_rate": 1.6456278515588023e-05, | |
| "loss": 0.4185, | |
| "mean_token_accuracy": 0.8576527365281009, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.3509430741657421, | |
| "grad_norm": 0.29635598975960087, | |
| "learning_rate": 1.6419744822252255e-05, | |
| "loss": 0.4078, | |
| "mean_token_accuracy": 0.8610621124104776, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.3523086114193053, | |
| "grad_norm": 0.31915252694298046, | |
| "learning_rate": 1.638306478829373e-05, | |
| "loss": 0.4231, | |
| "mean_token_accuracy": 0.8561889856733563, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.3536741486728685, | |
| "grad_norm": 0.31923467215480067, | |
| "learning_rate": 1.634623924984833e-05, | |
| "loss": 0.4135, | |
| "mean_token_accuracy": 0.8592538002988109, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.3550396859264317, | |
| "grad_norm": 0.30692282494391426, | |
| "learning_rate": 1.6309269046368777e-05, | |
| "loss": 0.3966, | |
| "mean_token_accuracy": 0.8640884833628784, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.3564052231799949, | |
| "grad_norm": 0.27482122390015884, | |
| "learning_rate": 1.627215502060548e-05, | |
| "loss": 0.4216, | |
| "mean_token_accuracy": 0.8559256932307063, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.3577707604335581, | |
| "grad_norm": 0.34179806060032714, | |
| "learning_rate": 1.6234898018587336e-05, | |
| "loss": 0.4541, | |
| "mean_token_accuracy": 0.8491678521649152, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.3591362976871213, | |
| "grad_norm": 0.3011617462318224, | |
| "learning_rate": 1.619749888960245e-05, | |
| "loss": 0.4261, | |
| "mean_token_accuracy": 0.8554996248647118, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.3605018349406845, | |
| "grad_norm": 0.31807503774612333, | |
| "learning_rate": 1.615995848617876e-05, | |
| "loss": 0.4318, | |
| "mean_token_accuracy": 0.8559916893442254, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.3618673721942477, | |
| "grad_norm": 0.3316772571952849, | |
| "learning_rate": 1.612227766406461e-05, | |
| "loss": 0.4114, | |
| "mean_token_accuracy": 0.8605685266181821, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.3632329094478109, | |
| "grad_norm": 0.2831512502783301, | |
| "learning_rate": 1.6084457282209244e-05, | |
| "loss": 0.4182, | |
| "mean_token_accuracy": 0.8572316390854738, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.3645984467013741, | |
| "grad_norm": 0.3020965586894018, | |
| "learning_rate": 1.6046498202743232e-05, | |
| "loss": 0.4278, | |
| "mean_token_accuracy": 0.8552705974923581, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.3659639839549373, | |
| "grad_norm": 0.34579468143149206, | |
| "learning_rate": 1.6008401290958806e-05, | |
| "loss": 0.4224, | |
| "mean_token_accuracy": 0.8574742487392688, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.3673295212085005, | |
| "grad_norm": 0.3213456748377343, | |
| "learning_rate": 1.5970167415290142e-05, | |
| "loss": 0.4191, | |
| "mean_token_accuracy": 0.8592830241547558, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.3686950584620637, | |
| "grad_norm": 0.27169537891651463, | |
| "learning_rate": 1.5931797447293553e-05, | |
| "loss": 0.4257, | |
| "mean_token_accuracy": 0.8565419088799743, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.3700605957156269, | |
| "grad_norm": 0.31485256379338494, | |
| "learning_rate": 1.5893292261627644e-05, | |
| "loss": 0.4211, | |
| "mean_token_accuracy": 0.8583665894914854, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.3714261329691901, | |
| "grad_norm": 0.34197779647959986, | |
| "learning_rate": 1.5854652736033353e-05, | |
| "loss": 0.4199, | |
| "mean_token_accuracy": 0.8586362735400643, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.3727916702227533, | |
| "grad_norm": 0.3463622744135729, | |
| "learning_rate": 1.5815879751313957e-05, | |
| "loss": 0.4153, | |
| "mean_token_accuracy": 0.8590972009223808, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.3741572074763165, | |
| "grad_norm": 0.2854754568932129, | |
| "learning_rate": 1.577697419131497e-05, | |
| "loss": 0.4286, | |
| "mean_token_accuracy": 0.8549681291103441, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.3755227447298797, | |
| "grad_norm": 0.3612800362541057, | |
| "learning_rate": 1.5737936942904025e-05, | |
| "loss": 0.4164, | |
| "mean_token_accuracy": 0.8589556245901419, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.3768882819834429, | |
| "grad_norm": 0.3095104128297478, | |
| "learning_rate": 1.5698768895950644e-05, | |
| "loss": 0.4196, | |
| "mean_token_accuracy": 0.8573392516346519, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.3782538192370061, | |
| "grad_norm": 0.28011182927413847, | |
| "learning_rate": 1.5659470943305956e-05, | |
| "loss": 0.4207, | |
| "mean_token_accuracy": 0.8575836185111236, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.37961935649056927, | |
| "grad_norm": 0.38665599972326675, | |
| "learning_rate": 1.5620043980782327e-05, | |
| "loss": 0.4228, | |
| "mean_token_accuracy": 0.8577991932053891, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.38098489374413247, | |
| "grad_norm": 0.3045801906729731, | |
| "learning_rate": 1.5580488907132972e-05, | |
| "loss": 0.4127, | |
| "mean_token_accuracy": 0.859320001053601, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.38235043099769567, | |
| "grad_norm": 0.3027464188711677, | |
| "learning_rate": 1.554080662403144e-05, | |
| "loss": 0.4344, | |
| "mean_token_accuracy": 0.8530298389418411, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.38371596825125887, | |
| "grad_norm": 0.30735481784619834, | |
| "learning_rate": 1.5500998036051075e-05, | |
| "loss": 0.4226, | |
| "mean_token_accuracy": 0.8564421780464974, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.38508150550482206, | |
| "grad_norm": 0.28386427155284427, | |
| "learning_rate": 1.546106405064438e-05, | |
| "loss": 0.4447, | |
| "mean_token_accuracy": 0.8507831934070248, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.38644704275838526, | |
| "grad_norm": 0.3157137742956653, | |
| "learning_rate": 1.5421005578122356e-05, | |
| "loss": 0.4264, | |
| "mean_token_accuracy": 0.8566646495893814, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.38781258001194846, | |
| "grad_norm": 0.3109770970705045, | |
| "learning_rate": 1.5380823531633727e-05, | |
| "loss": 0.4252, | |
| "mean_token_accuracy": 0.8575001524672929, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.38917811726551166, | |
| "grad_norm": 0.3467377136844463, | |
| "learning_rate": 1.5340518827144145e-05, | |
| "loss": 0.4233, | |
| "mean_token_accuracy": 0.8582951077732086, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.39054365451907486, | |
| "grad_norm": 0.3381885844162203, | |
| "learning_rate": 1.5300092383415282e-05, | |
| "loss": 0.4289, | |
| "mean_token_accuracy": 0.8549039858412707, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.39190919177263805, | |
| "grad_norm": 0.3297678208614477, | |
| "learning_rate": 1.525954512198392e-05, | |
| "loss": 0.4096, | |
| "mean_token_accuracy": 0.8613219951830501, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.39327472902620125, | |
| "grad_norm": 0.3368037995093397, | |
| "learning_rate": 1.5218877967140921e-05, | |
| "loss": 0.4224, | |
| "mean_token_accuracy": 0.8548496161946588, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.39464026627976445, | |
| "grad_norm": 0.3212698070828666, | |
| "learning_rate": 1.517809184591017e-05, | |
| "loss": 0.4263, | |
| "mean_token_accuracy": 0.8574614580441292, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.39600580353332765, | |
| "grad_norm": 0.40537049798129476, | |
| "learning_rate": 1.5137187688027437e-05, | |
| "loss": 0.43, | |
| "mean_token_accuracy": 0.8550197749735438, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.39737134078689085, | |
| "grad_norm": 0.31835336382101076, | |
| "learning_rate": 1.5096166425919176e-05, | |
| "loss": 0.4298, | |
| "mean_token_accuracy": 0.8548674541736617, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.39873687804045405, | |
| "grad_norm": 0.34618990419302037, | |
| "learning_rate": 1.5055028994681284e-05, | |
| "loss": 0.409, | |
| "mean_token_accuracy": 0.8607840254015119, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.40010241529401724, | |
| "grad_norm": 0.350507807135204, | |
| "learning_rate": 1.5013776332057786e-05, | |
| "loss": 0.4151, | |
| "mean_token_accuracy": 0.8577964045490754, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.40146795254758044, | |
| "grad_norm": 0.30189871647045075, | |
| "learning_rate": 1.4972409378419439e-05, | |
| "loss": 0.4094, | |
| "mean_token_accuracy": 0.8607436168222747, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.40283348980114364, | |
| "grad_norm": 0.3016208540740829, | |
| "learning_rate": 1.4930929076742317e-05, | |
| "loss": 0.4151, | |
| "mean_token_accuracy": 0.8595533954555142, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.40419902705470684, | |
| "grad_norm": 0.29033881446311255, | |
| "learning_rate": 1.4889336372586305e-05, | |
| "loss": 0.3928, | |
| "mean_token_accuracy": 0.8637799769336061, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.40556456430827004, | |
| "grad_norm": 0.3668162312007593, | |
| "learning_rate": 1.4847632214073548e-05, | |
| "loss": 0.4426, | |
| "mean_token_accuracy": 0.8499288824377587, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.40693010156183324, | |
| "grad_norm": 0.32449852074777696, | |
| "learning_rate": 1.4805817551866839e-05, | |
| "loss": 0.4084, | |
| "mean_token_accuracy": 0.8605696761410078, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.40829563881539643, | |
| "grad_norm": 0.2970564472354985, | |
| "learning_rate": 1.4763893339147942e-05, | |
| "loss": 0.418, | |
| "mean_token_accuracy": 0.8573568392411037, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.40966117606895963, | |
| "grad_norm": 0.32582221983564175, | |
| "learning_rate": 1.4721860531595868e-05, | |
| "loss": 0.4155, | |
| "mean_token_accuracy": 0.8602134585041119, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.41102671332252283, | |
| "grad_norm": 0.3089086009639669, | |
| "learning_rate": 1.4679720087365097e-05, | |
| "loss": 0.4099, | |
| "mean_token_accuracy": 0.8610357533905291, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.41239225057608603, | |
| "grad_norm": 0.32593412647065384, | |
| "learning_rate": 1.4637472967063721e-05, | |
| "loss": 0.4383, | |
| "mean_token_accuracy": 0.8527080209938287, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.4137577878296492, | |
| "grad_norm": 0.3277327156387932, | |
| "learning_rate": 1.4595120133731564e-05, | |
| "loss": 0.4181, | |
| "mean_token_accuracy": 0.8569046290075678, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.4151233250832124, | |
| "grad_norm": 0.3730986070237096, | |
| "learning_rate": 1.4552662552818211e-05, | |
| "loss": 0.4433, | |
| "mean_token_accuracy": 0.8504104581423189, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.4164888623367756, | |
| "grad_norm": 0.28719200698202685, | |
| "learning_rate": 1.451010119216102e-05, | |
| "loss": 0.4369, | |
| "mean_token_accuracy": 0.8519207939028978, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.4178543995903388, | |
| "grad_norm": 0.2597878117331174, | |
| "learning_rate": 1.446743702196304e-05, | |
| "loss": 0.4187, | |
| "mean_token_accuracy": 0.8574448859814668, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.419219936843902, | |
| "grad_norm": 0.34783364098922354, | |
| "learning_rate": 1.4424671014770906e-05, | |
| "loss": 0.4359, | |
| "mean_token_accuracy": 0.8523371430850553, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.4205854740974652, | |
| "grad_norm": 0.33054277050758557, | |
| "learning_rate": 1.4381804145452672e-05, | |
| "loss": 0.4155, | |
| "mean_token_accuracy": 0.8585607772789662, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.4219510113510284, | |
| "grad_norm": 0.2495797590927163, | |
| "learning_rate": 1.4338837391175582e-05, | |
| "loss": 0.4296, | |
| "mean_token_accuracy": 0.855114414007386, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.4233165486045916, | |
| "grad_norm": 0.35728528385485553, | |
| "learning_rate": 1.4295771731383799e-05, | |
| "loss": 0.434, | |
| "mean_token_accuracy": 0.8536842164352073, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.4246820858581548, | |
| "grad_norm": 0.3288240381017802, | |
| "learning_rate": 1.4252608147776067e-05, | |
| "loss": 0.4051, | |
| "mean_token_accuracy": 0.8619649344224427, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.426047623111718, | |
| "grad_norm": 0.3883949212610513, | |
| "learning_rate": 1.4209347624283352e-05, | |
| "loss": 0.4325, | |
| "mean_token_accuracy": 0.8541025576014111, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.4274131603652812, | |
| "grad_norm": 0.3012394552571007, | |
| "learning_rate": 1.4165991147046404e-05, | |
| "loss": 0.4121, | |
| "mean_token_accuracy": 0.8598195582006549, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.4287786976188444, | |
| "grad_norm": 0.3197771206984412, | |
| "learning_rate": 1.4122539704393265e-05, | |
| "loss": 0.4237, | |
| "mean_token_accuracy": 0.8560034937668209, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.4301442348724076, | |
| "grad_norm": 0.3057046597401257, | |
| "learning_rate": 1.4078994286816768e-05, | |
| "loss": 0.4329, | |
| "mean_token_accuracy": 0.853573226005123, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.4315097721259708, | |
| "grad_norm": 0.3422003725649723, | |
| "learning_rate": 1.4035355886951924e-05, | |
| "loss": 0.4327, | |
| "mean_token_accuracy": 0.8533252072862426, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.432875309379534, | |
| "grad_norm": 0.2909937625605861, | |
| "learning_rate": 1.3991625499553325e-05, | |
| "loss": 0.4373, | |
| "mean_token_accuracy": 0.8526268972423322, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.4342408466330972, | |
| "grad_norm": 0.2810097508624511, | |
| "learning_rate": 1.3947804121472453e-05, | |
| "loss": 0.4209, | |
| "mean_token_accuracy": 0.8580495358481994, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.4356063838866604, | |
| "grad_norm": 0.29587543321646376, | |
| "learning_rate": 1.3903892751634949e-05, | |
| "loss": 0.4181, | |
| "mean_token_accuracy": 0.8580909047258651, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.4369719211402236, | |
| "grad_norm": 0.2803397206271039, | |
| "learning_rate": 1.3859892391017867e-05, | |
| "loss": 0.398, | |
| "mean_token_accuracy": 0.8636022295266738, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.4383374583937868, | |
| "grad_norm": 0.3031649571676904, | |
| "learning_rate": 1.3815804042626828e-05, | |
| "loss": 0.4316, | |
| "mean_token_accuracy": 0.8540270034472657, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.43970299564735, | |
| "grad_norm": 0.2868053671287889, | |
| "learning_rate": 1.3771628711473173e-05, | |
| "loss": 0.4458, | |
| "mean_token_accuracy": 0.8500793417904773, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.4410685329009132, | |
| "grad_norm": 0.2607596548746336, | |
| "learning_rate": 1.3727367404551055e-05, | |
| "loss": 0.4194, | |
| "mean_token_accuracy": 0.85756931958572, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.4424340701544764, | |
| "grad_norm": 0.31970960349713734, | |
| "learning_rate": 1.368302113081447e-05, | |
| "loss": 0.3988, | |
| "mean_token_accuracy": 0.8640139707624799, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.4437996074080396, | |
| "grad_norm": 0.3073106580181061, | |
| "learning_rate": 1.3638590901154276e-05, | |
| "loss": 0.4158, | |
| "mean_token_accuracy": 0.8588068782465295, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.4451651446616028, | |
| "grad_norm": 0.30286141069120004, | |
| "learning_rate": 1.3594077728375129e-05, | |
| "loss": 0.4201, | |
| "mean_token_accuracy": 0.8569773919052446, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.446530681915166, | |
| "grad_norm": 0.279965967756229, | |
| "learning_rate": 1.3549482627172412e-05, | |
| "loss": 0.432, | |
| "mean_token_accuracy": 0.8531155705252534, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.4478962191687292, | |
| "grad_norm": 0.30169596805937576, | |
| "learning_rate": 1.3504806614109098e-05, | |
| "loss": 0.4207, | |
| "mean_token_accuracy": 0.8567625209561366, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.4492617564222924, | |
| "grad_norm": 0.3459438236471543, | |
| "learning_rate": 1.3460050707592581e-05, | |
| "loss": 0.4336, | |
| "mean_token_accuracy": 0.8529489776321926, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.4506272936758556, | |
| "grad_norm": 0.30726581139747894, | |
| "learning_rate": 1.341521592785145e-05, | |
| "loss": 0.399, | |
| "mean_token_accuracy": 0.8630228365259139, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.4519928309294188, | |
| "grad_norm": 0.3053218799713375, | |
| "learning_rate": 1.3370303296912248e-05, | |
| "loss": 0.4382, | |
| "mean_token_accuracy": 0.8519843865306225, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.453358368182982, | |
| "grad_norm": 0.34629772646315893, | |
| "learning_rate": 1.332531383857616e-05, | |
| "loss": 0.4112, | |
| "mean_token_accuracy": 0.860751373812898, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.4547239054365452, | |
| "grad_norm": 0.35416501044407706, | |
| "learning_rate": 1.328024857839569e-05, | |
| "loss": 0.4179, | |
| "mean_token_accuracy": 0.857242064978725, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.45608944269010837, | |
| "grad_norm": 0.30654844603106524, | |
| "learning_rate": 1.3235108543651272e-05, | |
| "loss": 0.4152, | |
| "mean_token_accuracy": 0.8594253890394505, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.45745497994367157, | |
| "grad_norm": 0.35789751735364733, | |
| "learning_rate": 1.3189894763327851e-05, | |
| "loss": 0.4352, | |
| "mean_token_accuracy": 0.8524388902413612, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.45882051719723477, | |
| "grad_norm": 0.27296525421750206, | |
| "learning_rate": 1.3144608268091435e-05, | |
| "loss": 0.4125, | |
| "mean_token_accuracy": 0.8600694337472964, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.46018605445079797, | |
| "grad_norm": 0.35167998845661236, | |
| "learning_rate": 1.3099250090265599e-05, | |
| "loss": 0.4212, | |
| "mean_token_accuracy": 0.8573687132989843, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.46155159170436116, | |
| "grad_norm": 0.3399733274636042, | |
| "learning_rate": 1.3053821263807947e-05, | |
| "loss": 0.4133, | |
| "mean_token_accuracy": 0.8588422955112552, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.46291712895792436, | |
| "grad_norm": 0.2843523201867168, | |
| "learning_rate": 1.3008322824286554e-05, | |
| "loss": 0.4217, | |
| "mean_token_accuracy": 0.8581068469248813, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.46428266621148756, | |
| "grad_norm": 0.25579241148803916, | |
| "learning_rate": 1.2962755808856341e-05, | |
| "loss": 0.4329, | |
| "mean_token_accuracy": 0.8531418182041834, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.46564820346505076, | |
| "grad_norm": 0.2849802262084464, | |
| "learning_rate": 1.2917121256235454e-05, | |
| "loss": 0.4063, | |
| "mean_token_accuracy": 0.8619580010780344, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.46701374071861396, | |
| "grad_norm": 0.28376439896546823, | |
| "learning_rate": 1.2871420206681573e-05, | |
| "loss": 0.4126, | |
| "mean_token_accuracy": 0.8584155141530699, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.46837927797217715, | |
| "grad_norm": 0.27659846392534093, | |
| "learning_rate": 1.2825653701968199e-05, | |
| "loss": 0.4028, | |
| "mean_token_accuracy": 0.8630971886050622, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.46974481522574035, | |
| "grad_norm": 0.2680308561203238, | |
| "learning_rate": 1.2779822785360913e-05, | |
| "loss": 0.4106, | |
| "mean_token_accuracy": 0.8594322782022719, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.47111035247930355, | |
| "grad_norm": 0.31082123251549165, | |
| "learning_rate": 1.2733928501593587e-05, | |
| "loss": 0.4409, | |
| "mean_token_accuracy": 0.8504463201078798, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.47247588973286675, | |
| "grad_norm": 0.28196867432168504, | |
| "learning_rate": 1.2687971896844575e-05, | |
| "loss": 0.4143, | |
| "mean_token_accuracy": 0.8599393422041681, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.47384142698642995, | |
| "grad_norm": 0.30055986798006457, | |
| "learning_rate": 1.2641954018712863e-05, | |
| "loss": 0.427, | |
| "mean_token_accuracy": 0.8553718131914751, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.47520696423999315, | |
| "grad_norm": 0.3360073322586856, | |
| "learning_rate": 1.2595875916194188e-05, | |
| "loss": 0.4325, | |
| "mean_token_accuracy": 0.8513545891221347, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.47657250149355634, | |
| "grad_norm": 0.27942894694642584, | |
| "learning_rate": 1.2549738639657117e-05, | |
| "loss": 0.4069, | |
| "mean_token_accuracy": 0.8616844269198048, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.47793803874711954, | |
| "grad_norm": 0.25876338591566067, | |
| "learning_rate": 1.2503543240819127e-05, | |
| "loss": 0.4189, | |
| "mean_token_accuracy": 0.8584846388067386, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.4793035760006828, | |
| "grad_norm": 0.38300218448302764, | |
| "learning_rate": 1.2457290772722607e-05, | |
| "loss": 0.4207, | |
| "mean_token_accuracy": 0.8563944060284168, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.480669113254246, | |
| "grad_norm": 0.29865375211927264, | |
| "learning_rate": 1.2410982289710865e-05, | |
| "loss": 0.4274, | |
| "mean_token_accuracy": 0.8550406509222035, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.4820346505078092, | |
| "grad_norm": 0.29262760726515985, | |
| "learning_rate": 1.2364618847404088e-05, | |
| "loss": 0.4195, | |
| "mean_token_accuracy": 0.856524809953863, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.4834001877613724, | |
| "grad_norm": 0.33358761083999955, | |
| "learning_rate": 1.2318201502675285e-05, | |
| "loss": 0.4376, | |
| "mean_token_accuracy": 0.8536589150399464, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.4847657250149356, | |
| "grad_norm": 0.24446626288123202, | |
| "learning_rate": 1.227173131362619e-05, | |
| "loss": 0.4275, | |
| "mean_token_accuracy": 0.8556876461790799, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.4861312622684988, | |
| "grad_norm": 0.2998978259766614, | |
| "learning_rate": 1.2225209339563144e-05, | |
| "loss": 0.4244, | |
| "mean_token_accuracy": 0.8561228530745463, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.487496799522062, | |
| "grad_norm": 0.3206206105210003, | |
| "learning_rate": 1.2178636640972954e-05, | |
| "loss": 0.4277, | |
| "mean_token_accuracy": 0.8534380165849528, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.4888623367756252, | |
| "grad_norm": 0.3084930416049475, | |
| "learning_rate": 1.2132014279498702e-05, | |
| "loss": 0.4288, | |
| "mean_token_accuracy": 0.8533390096208622, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.4902278740291884, | |
| "grad_norm": 0.28357516576751085, | |
| "learning_rate": 1.2085343317915565e-05, | |
| "loss": 0.4243, | |
| "mean_token_accuracy": 0.8562038691159706, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.4915934112827516, | |
| "grad_norm": 0.33187103925787603, | |
| "learning_rate": 1.2038624820106572e-05, | |
| "loss": 0.4333, | |
| "mean_token_accuracy": 0.852135547957135, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.4929589485363148, | |
| "grad_norm": 0.2747099173609463, | |
| "learning_rate": 1.1991859851038362e-05, | |
| "loss": 0.4007, | |
| "mean_token_accuracy": 0.8638977286823286, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.494324485789878, | |
| "grad_norm": 0.3635110977798377, | |
| "learning_rate": 1.1945049476736905e-05, | |
| "loss": 0.3958, | |
| "mean_token_accuracy": 0.8647084604295199, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.4956900230434412, | |
| "grad_norm": 0.28529172664511565, | |
| "learning_rate": 1.1898194764263198e-05, | |
| "loss": 0.4233, | |
| "mean_token_accuracy": 0.8569803575165137, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.4970555602970044, | |
| "grad_norm": 0.27537692377864037, | |
| "learning_rate": 1.1851296781688952e-05, | |
| "loss": 0.4073, | |
| "mean_token_accuracy": 0.8600902266025654, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.49842109755056757, | |
| "grad_norm": 0.32325185164148623, | |
| "learning_rate": 1.1804356598072223e-05, | |
| "loss": 0.4176, | |
| "mean_token_accuracy": 0.8587095805096491, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.49978663480413077, | |
| "grad_norm": 0.2940543368854904, | |
| "learning_rate": 1.1757375283433077e-05, | |
| "loss": 0.421, | |
| "mean_token_accuracy": 0.8564938642066868, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.5011521720576939, | |
| "grad_norm": 0.28550857526720214, | |
| "learning_rate": 1.1710353908729157e-05, | |
| "loss": 0.4032, | |
| "mean_token_accuracy": 0.8619838649329472, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.5025177093112572, | |
| "grad_norm": 0.29699607198851946, | |
| "learning_rate": 1.1663293545831302e-05, | |
| "loss": 0.4146, | |
| "mean_token_accuracy": 0.8589313902193905, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.5038832465648203, | |
| "grad_norm": 0.25709714555087276, | |
| "learning_rate": 1.1616195267499102e-05, | |
| "loss": 0.4298, | |
| "mean_token_accuracy": 0.8524904329384897, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.5052487838183836, | |
| "grad_norm": 0.2623470396493795, | |
| "learning_rate": 1.1569060147356441e-05, | |
| "loss": 0.4004, | |
| "mean_token_accuracy": 0.8624617116263626, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.5066143210719467, | |
| "grad_norm": 0.27349918771238607, | |
| "learning_rate": 1.1521889259867032e-05, | |
| "loss": 0.3875, | |
| "mean_token_accuracy": 0.8656588419393155, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.50797985832551, | |
| "grad_norm": 0.2659088919583038, | |
| "learning_rate": 1.1474683680309913e-05, | |
| "loss": 0.422, | |
| "mean_token_accuracy": 0.8569432102568713, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.5093453955790731, | |
| "grad_norm": 0.24245036453653618, | |
| "learning_rate": 1.1427444484754942e-05, | |
| "loss": 0.4345, | |
| "mean_token_accuracy": 0.8533666506270393, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.5107109328326364, | |
| "grad_norm": 0.3003745935393995, | |
| "learning_rate": 1.138017275003827e-05, | |
| "loss": 0.4141, | |
| "mean_token_accuracy": 0.859278335907566, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.5120764700861995, | |
| "grad_norm": 0.26886003162184047, | |
| "learning_rate": 1.133286955373779e-05, | |
| "loss": 0.4396, | |
| "mean_token_accuracy": 0.8502673324909059, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.5134420073397628, | |
| "grad_norm": 0.2390413269524559, | |
| "learning_rate": 1.1285535974148576e-05, | |
| "loss": 0.3997, | |
| "mean_token_accuracy": 0.8639690222498632, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.5148075445933259, | |
| "grad_norm": 0.2544175262225982, | |
| "learning_rate": 1.1238173090258292e-05, | |
| "loss": 0.3998, | |
| "mean_token_accuracy": 0.8636630321983972, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.5161730818468891, | |
| "grad_norm": 0.23579375374714043, | |
| "learning_rate": 1.1190781981722622e-05, | |
| "loss": 0.4213, | |
| "mean_token_accuracy": 0.8574136326780977, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.5175386191004523, | |
| "grad_norm": 0.3009505232094782, | |
| "learning_rate": 1.1143363728840626e-05, | |
| "loss": 0.4048, | |
| "mean_token_accuracy": 0.8630919807826921, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.5189041563540155, | |
| "grad_norm": 0.3008130036826914, | |
| "learning_rate": 1.1095919412530136e-05, | |
| "loss": 0.4162, | |
| "mean_token_accuracy": 0.8576130359385502, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.5202696936075787, | |
| "grad_norm": 0.2952343133789482, | |
| "learning_rate": 1.1048450114303111e-05, | |
| "loss": 0.4412, | |
| "mean_token_accuracy": 0.850824829194507, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.5216352308611419, | |
| "grad_norm": 0.258661138214629, | |
| "learning_rate": 1.1000956916240985e-05, | |
| "loss": 0.4086, | |
| "mean_token_accuracy": 0.8620580382687907, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.5230007681147051, | |
| "grad_norm": 0.25494116388793103, | |
| "learning_rate": 1.0953440900969993e-05, | |
| "loss": 0.4207, | |
| "mean_token_accuracy": 0.8565843185391879, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.5243663053682683, | |
| "grad_norm": 0.27361519707795223, | |
| "learning_rate": 1.09059031516365e-05, | |
| "loss": 0.4102, | |
| "mean_token_accuracy": 0.8609546120002327, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.5257318426218315, | |
| "grad_norm": 0.2576222350047768, | |
| "learning_rate": 1.0858344751882304e-05, | |
| "loss": 0.4069, | |
| "mean_token_accuracy": 0.860766222892137, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.5270973798753947, | |
| "grad_norm": 0.23779312285209903, | |
| "learning_rate": 1.0810766785819947e-05, | |
| "loss": 0.4228, | |
| "mean_token_accuracy": 0.8588812441258473, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.5284629171289579, | |
| "grad_norm": 0.28934124430046615, | |
| "learning_rate": 1.0763170338007978e-05, | |
| "loss": 0.4107, | |
| "mean_token_accuracy": 0.8604884551869458, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.5298284543825211, | |
| "grad_norm": 0.2466549449805162, | |
| "learning_rate": 1.0715556493426263e-05, | |
| "loss": 0.4263, | |
| "mean_token_accuracy": 0.8557152597091608, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.5311939916360843, | |
| "grad_norm": 0.27325712567200283, | |
| "learning_rate": 1.0667926337451217e-05, | |
| "loss": 0.4149, | |
| "mean_token_accuracy": 0.8584272502154298, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.5325595288896475, | |
| "grad_norm": 0.25477351099504564, | |
| "learning_rate": 1.0620280955831088e-05, | |
| "loss": 0.4138, | |
| "mean_token_accuracy": 0.8593247446112272, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.5339250661432107, | |
| "grad_norm": 0.25978114932856317, | |
| "learning_rate": 1.0572621434661201e-05, | |
| "loss": 0.4189, | |
| "mean_token_accuracy": 0.8573288866024139, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.5352906033967739, | |
| "grad_norm": 0.26074134187169073, | |
| "learning_rate": 1.0524948860359194e-05, | |
| "loss": 0.428, | |
| "mean_token_accuracy": 0.8560287289485933, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.5366561406503371, | |
| "grad_norm": 0.2606859248765302, | |
| "learning_rate": 1.0477264319640253e-05, | |
| "loss": 0.4117, | |
| "mean_token_accuracy": 0.8600141371640406, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.5380216779039003, | |
| "grad_norm": 0.2260622200890888, | |
| "learning_rate": 1.0429568899492349e-05, | |
| "loss": 0.3984, | |
| "mean_token_accuracy": 0.8646881070534751, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.5393872151574635, | |
| "grad_norm": 0.2270692517916632, | |
| "learning_rate": 1.038186368715145e-05, | |
| "loss": 0.3935, | |
| "mean_token_accuracy": 0.8645081124206024, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.5407527524110267, | |
| "grad_norm": 0.25880192379351896, | |
| "learning_rate": 1.0334149770076747e-05, | |
| "loss": 0.4238, | |
| "mean_token_accuracy": 0.8556846129868956, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.5421182896645899, | |
| "grad_norm": 0.28051700201569574, | |
| "learning_rate": 1.0286428235925849e-05, | |
| "loss": 0.4347, | |
| "mean_token_accuracy": 0.8522094587514498, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.5434838269181531, | |
| "grad_norm": 0.25125772452792333, | |
| "learning_rate": 1.0238700172530009e-05, | |
| "loss": 0.4049, | |
| "mean_token_accuracy": 0.8625896021400488, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.5448493641717163, | |
| "grad_norm": 0.268673980738851, | |
| "learning_rate": 1.019096666786931e-05, | |
| "loss": 0.4203, | |
| "mean_token_accuracy": 0.8571522385547277, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.5462149014252795, | |
| "grad_norm": 0.2890895729277909, | |
| "learning_rate": 1.0143228810047877e-05, | |
| "loss": 0.4131, | |
| "mean_token_accuracy": 0.8595778477918344, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.5475804386788428, | |
| "grad_norm": 0.2542593184479374, | |
| "learning_rate": 1.0095487687269055e-05, | |
| "loss": 0.4331, | |
| "mean_token_accuracy": 0.8549465126076115, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 0.5489459759324059, | |
| "grad_norm": 0.2899474295442422, | |
| "learning_rate": 1.0047744387810632e-05, | |
| "loss": 0.4303, | |
| "mean_token_accuracy": 0.8538087426441767, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.5503115131859692, | |
| "grad_norm": 0.2869992392687126, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4253, | |
| "mean_token_accuracy": 0.8565849096399576, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 0.5516770504395323, | |
| "grad_norm": 0.2804072799948344, | |
| "learning_rate": 9.95225561218937e-06, | |
| "loss": 0.4108, | |
| "mean_token_accuracy": 0.8595849788206829, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.5530425876930956, | |
| "grad_norm": 0.2643969180366311, | |
| "learning_rate": 9.904512312730948e-06, | |
| "loss": 0.4137, | |
| "mean_token_accuracy": 0.8603534103336365, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.5544081249466587, | |
| "grad_norm": 0.27062716368889733, | |
| "learning_rate": 9.856771189952127e-06, | |
| "loss": 0.4015, | |
| "mean_token_accuracy": 0.8625163495869607, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.555773662200222, | |
| "grad_norm": 0.2929202457665895, | |
| "learning_rate": 9.809033332130694e-06, | |
| "loss": 0.4245, | |
| "mean_token_accuracy": 0.8562819218107298, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 0.5571391994537851, | |
| "grad_norm": 0.2541385240616914, | |
| "learning_rate": 9.761299827469993e-06, | |
| "loss": 0.4152, | |
| "mean_token_accuracy": 0.8585790240492156, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.5585047367073483, | |
| "grad_norm": 0.24317703441372845, | |
| "learning_rate": 9.713571764074153e-06, | |
| "loss": 0.4156, | |
| "mean_token_accuracy": 0.8592119050098765, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 0.5598702739609115, | |
| "grad_norm": 0.2671919366662445, | |
| "learning_rate": 9.665850229923258e-06, | |
| "loss": 0.4166, | |
| "mean_token_accuracy": 0.8583581070445756, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.5612358112144747, | |
| "grad_norm": 0.2366025343403895, | |
| "learning_rate": 9.618136312848552e-06, | |
| "loss": 0.4167, | |
| "mean_token_accuracy": 0.8573873504387994, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.5626013484680379, | |
| "grad_norm": 0.2531493471961078, | |
| "learning_rate": 9.570431100507653e-06, | |
| "loss": 0.3986, | |
| "mean_token_accuracy": 0.8631825906862007, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.5639668857216011, | |
| "grad_norm": 0.2604649082143078, | |
| "learning_rate": 9.522735680359752e-06, | |
| "loss": 0.419, | |
| "mean_token_accuracy": 0.8566154458082337, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.5653324229751643, | |
| "grad_norm": 0.24389282767116163, | |
| "learning_rate": 9.47505113964081e-06, | |
| "loss": 0.4239, | |
| "mean_token_accuracy": 0.8564176977638592, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.5666979602287275, | |
| "grad_norm": 0.2838315749775247, | |
| "learning_rate": 9.4273785653388e-06, | |
| "loss": 0.4044, | |
| "mean_token_accuracy": 0.8619406759583335, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.5680634974822907, | |
| "grad_norm": 0.24792685107778176, | |
| "learning_rate": 9.379719044168914e-06, | |
| "loss": 0.4245, | |
| "mean_token_accuracy": 0.8563493408114575, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.5694290347358539, | |
| "grad_norm": 0.252380204387106, | |
| "learning_rate": 9.332073662548785e-06, | |
| "loss": 0.4192, | |
| "mean_token_accuracy": 0.8552596982254129, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 0.5707945719894171, | |
| "grad_norm": 0.2633944099843147, | |
| "learning_rate": 9.28444350657374e-06, | |
| "loss": 0.3925, | |
| "mean_token_accuracy": 0.8656064042939838, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 0.5721601092429803, | |
| "grad_norm": 0.31236207476517847, | |
| "learning_rate": 9.236829661992024e-06, | |
| "loss": 0.4362, | |
| "mean_token_accuracy": 0.8508478194007478, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 0.5735256464965435, | |
| "grad_norm": 0.25934793423863595, | |
| "learning_rate": 9.189233214180057e-06, | |
| "loss": 0.4029, | |
| "mean_token_accuracy": 0.8626796404616222, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.5748911837501067, | |
| "grad_norm": 0.2602440450848244, | |
| "learning_rate": 9.1416552481177e-06, | |
| "loss": 0.4011, | |
| "mean_token_accuracy": 0.8620878476008459, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 0.5762567210036699, | |
| "grad_norm": 0.3026174301084034, | |
| "learning_rate": 9.094096848363503e-06, | |
| "loss": 0.4132, | |
| "mean_token_accuracy": 0.8589640787061215, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 0.5776222582572331, | |
| "grad_norm": 0.23000512937699333, | |
| "learning_rate": 9.046559099030012e-06, | |
| "loss": 0.411, | |
| "mean_token_accuracy": 0.8593097755828117, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 0.5789877955107963, | |
| "grad_norm": 0.28198364889742744, | |
| "learning_rate": 8.999043083759016e-06, | |
| "loss": 0.426, | |
| "mean_token_accuracy": 0.8541148022282637, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 0.5803533327643595, | |
| "grad_norm": 0.261123335266971, | |
| "learning_rate": 8.951549885696889e-06, | |
| "loss": 0.4008, | |
| "mean_token_accuracy": 0.863803099113423, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.5817188700179227, | |
| "grad_norm": 0.2646118229304491, | |
| "learning_rate": 8.904080587469869e-06, | |
| "loss": 0.3884, | |
| "mean_token_accuracy": 0.8656690716827113, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 0.5830844072714859, | |
| "grad_norm": 0.23877427883247976, | |
| "learning_rate": 8.856636271159378e-06, | |
| "loss": 0.4252, | |
| "mean_token_accuracy": 0.8549220531025736, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 0.5844499445250491, | |
| "grad_norm": 0.27389213494804404, | |
| "learning_rate": 8.80921801827738e-06, | |
| "loss": 0.4139, | |
| "mean_token_accuracy": 0.8583442992618325, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 0.5858154817786123, | |
| "grad_norm": 0.3009404820928479, | |
| "learning_rate": 8.76182690974171e-06, | |
| "loss": 0.4325, | |
| "mean_token_accuracy": 0.854122146316857, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 0.5871810190321755, | |
| "grad_norm": 0.2723790212614728, | |
| "learning_rate": 8.714464025851428e-06, | |
| "loss": 0.4058, | |
| "mean_token_accuracy": 0.8629916382485093, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.5885465562857387, | |
| "grad_norm": 0.25219631499358963, | |
| "learning_rate": 8.667130446262214e-06, | |
| "loss": 0.4198, | |
| "mean_token_accuracy": 0.856574897278309, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 0.5899120935393019, | |
| "grad_norm": 0.2772798105097925, | |
| "learning_rate": 8.619827249961732e-06, | |
| "loss": 0.4047, | |
| "mean_token_accuracy": 0.8614651709337668, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 0.5912776307928651, | |
| "grad_norm": 0.2561843385637697, | |
| "learning_rate": 8.57255551524506e-06, | |
| "loss": 0.4058, | |
| "mean_token_accuracy": 0.8620459681672171, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 0.5926431680464282, | |
| "grad_norm": 0.24701843749006636, | |
| "learning_rate": 8.525316319690092e-06, | |
| "loss": 0.389, | |
| "mean_token_accuracy": 0.8674125095943455, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 0.5940087052999915, | |
| "grad_norm": 0.2916250794706418, | |
| "learning_rate": 8.478110740132971e-06, | |
| "loss": 0.4222, | |
| "mean_token_accuracy": 0.8566364817285297, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.5953742425535546, | |
| "grad_norm": 0.2806760751496244, | |
| "learning_rate": 8.430939852643559e-06, | |
| "loss": 0.4169, | |
| "mean_token_accuracy": 0.8584203185837236, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 0.5967397798071179, | |
| "grad_norm": 0.27137099893978883, | |
| "learning_rate": 8.383804732500901e-06, | |
| "loss": 0.4103, | |
| "mean_token_accuracy": 0.8591519595394866, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 0.598105317060681, | |
| "grad_norm": 0.29739700836059735, | |
| "learning_rate": 8.336706454168701e-06, | |
| "loss": 0.4091, | |
| "mean_token_accuracy": 0.8611807240702429, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 0.5994708543142443, | |
| "grad_norm": 0.2571864812668034, | |
| "learning_rate": 8.289646091270848e-06, | |
| "loss": 0.4015, | |
| "mean_token_accuracy": 0.8637355095867093, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 0.6008363915678074, | |
| "grad_norm": 0.23339459771292811, | |
| "learning_rate": 8.242624716566928e-06, | |
| "loss": 0.3887, | |
| "mean_token_accuracy": 0.8657256008172974, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.6022019288213707, | |
| "grad_norm": 0.2992953835452048, | |
| "learning_rate": 8.195643401927777e-06, | |
| "loss": 0.3916, | |
| "mean_token_accuracy": 0.8653041052199545, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 0.6035674660749338, | |
| "grad_norm": 0.2585326431716542, | |
| "learning_rate": 8.148703218311053e-06, | |
| "loss": 0.4155, | |
| "mean_token_accuracy": 0.8596556131686985, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 0.6049330033284971, | |
| "grad_norm": 0.2503365003487542, | |
| "learning_rate": 8.101805235736804e-06, | |
| "loss": 0.4265, | |
| "mean_token_accuracy": 0.8549398314920128, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 0.6062985405820602, | |
| "grad_norm": 0.2820273966033343, | |
| "learning_rate": 8.054950523263097e-06, | |
| "loss": 0.4136, | |
| "mean_token_accuracy": 0.8588294366875941, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 0.6076640778356235, | |
| "grad_norm": 0.25413507701062166, | |
| "learning_rate": 8.008140148961642e-06, | |
| "loss": 0.4044, | |
| "mean_token_accuracy": 0.8625195591348364, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.6090296150891866, | |
| "grad_norm": 0.23238499595613277, | |
| "learning_rate": 7.96137517989343e-06, | |
| "loss": 0.4246, | |
| "mean_token_accuracy": 0.8561338059337695, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 0.6103951523427499, | |
| "grad_norm": 0.23798509908211196, | |
| "learning_rate": 7.914656682084436e-06, | |
| "loss": 0.4169, | |
| "mean_token_accuracy": 0.8577032958934101, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 0.611760689596313, | |
| "grad_norm": 0.25296096161140275, | |
| "learning_rate": 7.867985720501301e-06, | |
| "loss": 0.399, | |
| "mean_token_accuracy": 0.863544434056401, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.6131262268498763, | |
| "grad_norm": 0.22798661470070672, | |
| "learning_rate": 7.821363359027047e-06, | |
| "loss": 0.4028, | |
| "mean_token_accuracy": 0.8615431059914728, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 0.6144917641034394, | |
| "grad_norm": 0.2405890996462141, | |
| "learning_rate": 7.774790660436857e-06, | |
| "loss": 0.4279, | |
| "mean_token_accuracy": 0.8547220263658766, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.6158573013570027, | |
| "grad_norm": 0.29357731632515965, | |
| "learning_rate": 7.728268686373814e-06, | |
| "loss": 0.4077, | |
| "mean_token_accuracy": 0.8615052330629221, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 0.6172228386105658, | |
| "grad_norm": 0.28038212690179937, | |
| "learning_rate": 7.681798497324717e-06, | |
| "loss": 0.4379, | |
| "mean_token_accuracy": 0.8519818024977281, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 0.6185883758641291, | |
| "grad_norm": 0.25548937081717665, | |
| "learning_rate": 7.635381152595916e-06, | |
| "loss": 0.4175, | |
| "mean_token_accuracy": 0.857195676423713, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 0.6199539131176922, | |
| "grad_norm": 0.25398512091441905, | |
| "learning_rate": 7.5890177102891395e-06, | |
| "loss": 0.4039, | |
| "mean_token_accuracy": 0.8617371650977873, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 0.6213194503712555, | |
| "grad_norm": 0.26543560160540447, | |
| "learning_rate": 7.542709227277396e-06, | |
| "loss": 0.4164, | |
| "mean_token_accuracy": 0.857181019142036, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.6226849876248186, | |
| "grad_norm": 0.2459878250772752, | |
| "learning_rate": 7.496456759180876e-06, | |
| "loss": 0.4228, | |
| "mean_token_accuracy": 0.8573935855915926, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 0.6240505248783819, | |
| "grad_norm": 0.23786580710281324, | |
| "learning_rate": 7.4502613603428875e-06, | |
| "loss": 0.4088, | |
| "mean_token_accuracy": 0.8607801487917315, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 0.625416062131945, | |
| "grad_norm": 0.29589767964465397, | |
| "learning_rate": 7.404124083805819e-06, | |
| "loss": 0.4093, | |
| "mean_token_accuracy": 0.8612231610118724, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 0.6267815993855083, | |
| "grad_norm": 0.2887590659400021, | |
| "learning_rate": 7.358045981287141e-06, | |
| "loss": 0.4209, | |
| "mean_token_accuracy": 0.8560159432518372, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 0.6281471366390714, | |
| "grad_norm": 0.21799038062254464, | |
| "learning_rate": 7.312028103155426e-06, | |
| "loss": 0.3799, | |
| "mean_token_accuracy": 0.8704455922016139, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.6295126738926347, | |
| "grad_norm": 0.24324879715514983, | |
| "learning_rate": 7.266071498406417e-06, | |
| "loss": 0.4076, | |
| "mean_token_accuracy": 0.8610840750554581, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 0.6308782111461978, | |
| "grad_norm": 0.25428302492825916, | |
| "learning_rate": 7.220177214639088e-06, | |
| "loss": 0.4109, | |
| "mean_token_accuracy": 0.8587935610875843, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 0.632243748399761, | |
| "grad_norm": 0.27202575353290637, | |
| "learning_rate": 7.1743462980318045e-06, | |
| "loss": 0.4164, | |
| "mean_token_accuracy": 0.8589937783595583, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 0.6336092856533242, | |
| "grad_norm": 0.23695566303733537, | |
| "learning_rate": 7.128579793318429e-06, | |
| "loss": 0.4024, | |
| "mean_token_accuracy": 0.8627954415233714, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 0.6349748229068874, | |
| "grad_norm": 0.25513688392749734, | |
| "learning_rate": 7.0828787437645455e-06, | |
| "loss": 0.4093, | |
| "mean_token_accuracy": 0.8600665306531674, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.6363403601604506, | |
| "grad_norm": 0.26912983946640745, | |
| "learning_rate": 7.037244191143662e-06, | |
| "loss": 0.3995, | |
| "mean_token_accuracy": 0.8619904346664145, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 0.6377058974140138, | |
| "grad_norm": 0.2525033514112838, | |
| "learning_rate": 6.991677175713449e-06, | |
| "loss": 0.4223, | |
| "mean_token_accuracy": 0.8571298722103126, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 0.639071434667577, | |
| "grad_norm": 0.4126261428152561, | |
| "learning_rate": 6.946178736192053e-06, | |
| "loss": 0.4422, | |
| "mean_token_accuracy": 0.8495778270375115, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 0.6404369719211402, | |
| "grad_norm": 0.25938255418921724, | |
| "learning_rate": 6.900749909734406e-06, | |
| "loss": 0.433, | |
| "mean_token_accuracy": 0.8528369297456597, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 0.6418025091747034, | |
| "grad_norm": 0.2505229138650988, | |
| "learning_rate": 6.8553917319085676e-06, | |
| "loss": 0.4148, | |
| "mean_token_accuracy": 0.8588444157084706, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.6431680464282666, | |
| "grad_norm": 0.21310658415353784, | |
| "learning_rate": 6.810105236672155e-06, | |
| "loss": 0.3849, | |
| "mean_token_accuracy": 0.8671909034888359, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 0.6445335836818298, | |
| "grad_norm": 0.23204325202817497, | |
| "learning_rate": 6.76489145634873e-06, | |
| "loss": 0.4074, | |
| "mean_token_accuracy": 0.8617341122845665, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 0.645899120935393, | |
| "grad_norm": 0.27377537704030214, | |
| "learning_rate": 6.719751421604309e-06, | |
| "loss": 0.4052, | |
| "mean_token_accuracy": 0.8605801437822177, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 0.6472646581889562, | |
| "grad_norm": 0.26667506542190905, | |
| "learning_rate": 6.6746861614238425e-06, | |
| "loss": 0.4172, | |
| "mean_token_accuracy": 0.8587473642719939, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 0.6486301954425194, | |
| "grad_norm": 0.2359219117164129, | |
| "learning_rate": 6.629696703087755e-06, | |
| "loss": 0.3979, | |
| "mean_token_accuracy": 0.8643947766486794, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.6499957326960826, | |
| "grad_norm": 0.2405700433141458, | |
| "learning_rate": 6.584784072148554e-06, | |
| "loss": 0.4117, | |
| "mean_token_accuracy": 0.8583265490823113, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 0.6513612699496458, | |
| "grad_norm": 0.23752686356493705, | |
| "learning_rate": 6.5399492924074215e-06, | |
| "loss": 0.4079, | |
| "mean_token_accuracy": 0.8599794679393602, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 0.652726807203209, | |
| "grad_norm": 0.260987271653807, | |
| "learning_rate": 6.495193385890901e-06, | |
| "loss": 0.3978, | |
| "mean_token_accuracy": 0.863796731863899, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 0.6540923444567722, | |
| "grad_norm": 0.2389148131609978, | |
| "learning_rate": 6.450517372827591e-06, | |
| "loss": 0.4048, | |
| "mean_token_accuracy": 0.8625655846917668, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 0.6554578817103354, | |
| "grad_norm": 0.23754616663334344, | |
| "learning_rate": 6.405922271624874e-06, | |
| "loss": 0.3894, | |
| "mean_token_accuracy": 0.8650371535045855, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.6568234189638986, | |
| "grad_norm": 0.24392056114704547, | |
| "learning_rate": 6.3614090988457255e-06, | |
| "loss": 0.4078, | |
| "mean_token_accuracy": 0.860617293319343, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 0.6581889562174618, | |
| "grad_norm": 0.26180580733734304, | |
| "learning_rate": 6.3169788691855326e-06, | |
| "loss": 0.4226, | |
| "mean_token_accuracy": 0.8568173015759246, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 0.659554493471025, | |
| "grad_norm": 0.24631836835717166, | |
| "learning_rate": 6.2726325954489474e-06, | |
| "loss": 0.4214, | |
| "mean_token_accuracy": 0.8552957608827673, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 0.6609200307245882, | |
| "grad_norm": 0.2805589427499748, | |
| "learning_rate": 6.22837128852683e-06, | |
| "loss": 0.4209, | |
| "mean_token_accuracy": 0.857800855823934, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 0.6622855679781514, | |
| "grad_norm": 0.232371020464702, | |
| "learning_rate": 6.184195957373176e-06, | |
| "loss": 0.4109, | |
| "mean_token_accuracy": 0.8600986670212696, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.6636511052317146, | |
| "grad_norm": 0.24474802604092138, | |
| "learning_rate": 6.140107608982137e-06, | |
| "loss": 0.3952, | |
| "mean_token_accuracy": 0.8657601107656259, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 0.6650166424852778, | |
| "grad_norm": 0.25305102083356706, | |
| "learning_rate": 6.0961072483650526e-06, | |
| "loss": 0.4037, | |
| "mean_token_accuracy": 0.8629620469736944, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 0.666382179738841, | |
| "grad_norm": 0.22975337882908747, | |
| "learning_rate": 6.052195878527551e-06, | |
| "loss": 0.4038, | |
| "mean_token_accuracy": 0.8623166696689788, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 0.6677477169924042, | |
| "grad_norm": 0.24494072522854304, | |
| "learning_rate": 6.008374500446676e-06, | |
| "loss": 0.4203, | |
| "mean_token_accuracy": 0.8575659022852821, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 0.6691132542459673, | |
| "grad_norm": 0.24564454125360483, | |
| "learning_rate": 5.964644113048079e-06, | |
| "loss": 0.4108, | |
| "mean_token_accuracy": 0.8592899012026329, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.6704787914995306, | |
| "grad_norm": 0.2970193563961585, | |
| "learning_rate": 5.921005713183236e-06, | |
| "loss": 0.414, | |
| "mean_token_accuracy": 0.8578038401374278, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 0.6718443287530937, | |
| "grad_norm": 0.22522151812319466, | |
| "learning_rate": 5.877460295606739e-06, | |
| "loss": 0.3909, | |
| "mean_token_accuracy": 0.8663197254697791, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 0.673209866006657, | |
| "grad_norm": 0.24777229494666286, | |
| "learning_rate": 5.834008852953603e-06, | |
| "loss": 0.3987, | |
| "mean_token_accuracy": 0.8633314691916142, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 0.6745754032602201, | |
| "grad_norm": 0.27546512967034037, | |
| "learning_rate": 5.790652375716653e-06, | |
| "loss": 0.3887, | |
| "mean_token_accuracy": 0.8660309352349229, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 0.6759409405137834, | |
| "grad_norm": 0.24981866455290716, | |
| "learning_rate": 5.74739185222394e-06, | |
| "loss": 0.4182, | |
| "mean_token_accuracy": 0.8580747989016917, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.6773064777673465, | |
| "grad_norm": 0.23497798723811336, | |
| "learning_rate": 5.704228268616208e-06, | |
| "loss": 0.4034, | |
| "mean_token_accuracy": 0.8624885421549002, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 0.6786720150209098, | |
| "grad_norm": 0.22626842482811318, | |
| "learning_rate": 5.66116260882442e-06, | |
| "loss": 0.3743, | |
| "mean_token_accuracy": 0.8722626987396589, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 0.6800375522744729, | |
| "grad_norm": 0.273018811101812, | |
| "learning_rate": 5.618195854547333e-06, | |
| "loss": 0.4118, | |
| "mean_token_accuracy": 0.8591764611730132, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 0.6814030895280362, | |
| "grad_norm": 0.2517060577155918, | |
| "learning_rate": 5.575328985229098e-06, | |
| "loss": 0.4307, | |
| "mean_token_accuracy": 0.8540759587427639, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 0.6827686267815993, | |
| "grad_norm": 0.23433321670856827, | |
| "learning_rate": 5.532562978036964e-06, | |
| "loss": 0.3938, | |
| "mean_token_accuracy": 0.8640612399037781, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.6841341640351626, | |
| "grad_norm": 0.2425154734570016, | |
| "learning_rate": 5.48989880783898e-06, | |
| "loss": 0.422, | |
| "mean_token_accuracy": 0.8574342653414885, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 0.6854997012887258, | |
| "grad_norm": 0.23347937555277293, | |
| "learning_rate": 5.4473374471817906e-06, | |
| "loss": 0.4191, | |
| "mean_token_accuracy": 0.8575295591207817, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 0.686865238542289, | |
| "grad_norm": 0.2545433711957671, | |
| "learning_rate": 5.404879866268438e-06, | |
| "loss": 0.4002, | |
| "mean_token_accuracy": 0.8639634173808828, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 0.6882307757958522, | |
| "grad_norm": 0.24243974966286372, | |
| "learning_rate": 5.362527032936278e-06, | |
| "loss": 0.4028, | |
| "mean_token_accuracy": 0.862244193071149, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 0.6895963130494154, | |
| "grad_norm": 0.23059100155723, | |
| "learning_rate": 5.320279912634907e-06, | |
| "loss": 0.4035, | |
| "mean_token_accuracy": 0.8626255651551141, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.6909618503029786, | |
| "grad_norm": 0.23428705595664845, | |
| "learning_rate": 5.278139468404133e-06, | |
| "loss": 0.4083, | |
| "mean_token_accuracy": 0.8603081045639313, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 0.6923273875565418, | |
| "grad_norm": 0.27274705668530197, | |
| "learning_rate": 5.236106660852058e-06, | |
| "loss": 0.4212, | |
| "mean_token_accuracy": 0.8560662849082573, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 0.693692924810105, | |
| "grad_norm": 0.2845641889872319, | |
| "learning_rate": 5.194182448133163e-06, | |
| "loss": 0.3944, | |
| "mean_token_accuracy": 0.8640756812761311, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 0.6950584620636682, | |
| "grad_norm": 0.22664565522900781, | |
| "learning_rate": 5.152367785926452e-06, | |
| "loss": 0.4094, | |
| "mean_token_accuracy": 0.8593605320038422, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 0.6964239993172314, | |
| "grad_norm": 0.22478587370218284, | |
| "learning_rate": 5.110663627413695e-06, | |
| "loss": 0.396, | |
| "mean_token_accuracy": 0.8648808615935969, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.6977895365707946, | |
| "grad_norm": 0.21591948807401068, | |
| "learning_rate": 5.069070923257685e-06, | |
| "loss": 0.399, | |
| "mean_token_accuracy": 0.8633928472317149, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 0.6991550738243578, | |
| "grad_norm": 0.26461022257675626, | |
| "learning_rate": 5.027590621580563e-06, | |
| "loss": 0.4394, | |
| "mean_token_accuracy": 0.8507808502833459, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 0.700520611077921, | |
| "grad_norm": 0.21356588082748598, | |
| "learning_rate": 4.986223667942213e-06, | |
| "loss": 0.4271, | |
| "mean_token_accuracy": 0.8533322294161114, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 0.7018861483314842, | |
| "grad_norm": 0.21403249722142711, | |
| "learning_rate": 4.944971005318716e-06, | |
| "loss": 0.3987, | |
| "mean_token_accuracy": 0.8636949003001929, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 0.7032516855850474, | |
| "grad_norm": 0.23897074188676798, | |
| "learning_rate": 4.903833574080825e-06, | |
| "loss": 0.3925, | |
| "mean_token_accuracy": 0.8641888082172495, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.7046172228386106, | |
| "grad_norm": 0.22004960179660596, | |
| "learning_rate": 4.862812311972567e-06, | |
| "loss": 0.4296, | |
| "mean_token_accuracy": 0.8554980900000537, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 0.7059827600921738, | |
| "grad_norm": 0.21948469336679155, | |
| "learning_rate": 4.82190815408983e-06, | |
| "loss": 0.4033, | |
| "mean_token_accuracy": 0.8607597379326859, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 0.707348297345737, | |
| "grad_norm": 0.23185408625161705, | |
| "learning_rate": 4.781122032859079e-06, | |
| "loss": 0.4137, | |
| "mean_token_accuracy": 0.8586254422034981, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 0.7087138345993002, | |
| "grad_norm": 0.23176435351640776, | |
| "learning_rate": 4.740454878016084e-06, | |
| "loss": 0.4109, | |
| "mean_token_accuracy": 0.8592659758915503, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 0.7100793718528634, | |
| "grad_norm": 0.24913384906741762, | |
| "learning_rate": 4.6999076165847214e-06, | |
| "loss": 0.4343, | |
| "mean_token_accuracy": 0.8513752575363313, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.7114449091064265, | |
| "grad_norm": 0.20722173613037567, | |
| "learning_rate": 4.659481172855859e-06, | |
| "loss": 0.4005, | |
| "mean_token_accuracy": 0.8633166282089912, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 0.7128104463599898, | |
| "grad_norm": 0.2181641440286907, | |
| "learning_rate": 4.619176468366274e-06, | |
| "loss": 0.3917, | |
| "mean_token_accuracy": 0.8660028930864674, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 0.714175983613553, | |
| "grad_norm": 0.25367503570810757, | |
| "learning_rate": 4.578994421877645e-06, | |
| "loss": 0.4096, | |
| "mean_token_accuracy": 0.8606353223418006, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 0.7155415208671162, | |
| "grad_norm": 0.21964433917606951, | |
| "learning_rate": 4.538935949355623e-06, | |
| "loss": 0.4115, | |
| "mean_token_accuracy": 0.8579058023067518, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 0.7169070581206793, | |
| "grad_norm": 0.23194379718009556, | |
| "learning_rate": 4.499001963948929e-06, | |
| "loss": 0.4226, | |
| "mean_token_accuracy": 0.8567179649132983, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.7182725953742426, | |
| "grad_norm": 0.23794903701170841, | |
| "learning_rate": 4.45919337596856e-06, | |
| "loss": 0.4044, | |
| "mean_token_accuracy": 0.8625243196220369, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 0.7196381326278057, | |
| "grad_norm": 0.24368825515761053, | |
| "learning_rate": 4.41951109286703e-06, | |
| "loss": 0.3946, | |
| "mean_token_accuracy": 0.8651203225887669, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 0.721003669881369, | |
| "grad_norm": 0.2753308025130241, | |
| "learning_rate": 4.379956019217675e-06, | |
| "loss": 0.415, | |
| "mean_token_accuracy": 0.8604877721998891, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 0.7223692071349321, | |
| "grad_norm": 0.21688742696840135, | |
| "learning_rate": 4.3405290566940475e-06, | |
| "loss": 0.397, | |
| "mean_token_accuracy": 0.8641772641669326, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 0.7237347443884954, | |
| "grad_norm": 0.21378940665861854, | |
| "learning_rate": 4.301231104049359e-06, | |
| "loss": 0.4174, | |
| "mean_token_accuracy": 0.8589884247304818, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.7251002816420585, | |
| "grad_norm": 0.2014478430774182, | |
| "learning_rate": 4.262063057095978e-06, | |
| "loss": 0.3961, | |
| "mean_token_accuracy": 0.864822926034779, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 0.7264658188956218, | |
| "grad_norm": 0.21961573395687345, | |
| "learning_rate": 4.2230258086850375e-06, | |
| "loss": 0.4018, | |
| "mean_token_accuracy": 0.8609590482386245, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 0.7278313561491849, | |
| "grad_norm": 0.22346096554308978, | |
| "learning_rate": 4.184120248686048e-06, | |
| "loss": 0.4296, | |
| "mean_token_accuracy": 0.8551864724213167, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 0.7291968934027482, | |
| "grad_norm": 0.2324113039004406, | |
| "learning_rate": 4.145347263966646e-06, | |
| "loss": 0.4106, | |
| "mean_token_accuracy": 0.8610600169630592, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 0.7305624306563113, | |
| "grad_norm": 0.21693513782961235, | |
| "learning_rate": 4.106707738372357e-06, | |
| "loss": 0.4047, | |
| "mean_token_accuracy": 0.8620982081151527, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.7319279679098746, | |
| "grad_norm": 0.24610715663770288, | |
| "learning_rate": 4.0682025527064486e-06, | |
| "loss": 0.4142, | |
| "mean_token_accuracy": 0.8586329074626934, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 0.7332935051634377, | |
| "grad_norm": 0.2496537037626066, | |
| "learning_rate": 4.029832584709864e-06, | |
| "loss": 0.4133, | |
| "mean_token_accuracy": 0.8591482955360918, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 0.734659042417001, | |
| "grad_norm": 0.25290330143622763, | |
| "learning_rate": 3.991598709041196e-06, | |
| "loss": 0.4185, | |
| "mean_token_accuracy": 0.8575748615139153, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 0.7360245796705641, | |
| "grad_norm": 0.23375530633596542, | |
| "learning_rate": 3.953501797256768e-06, | |
| "loss": 0.4082, | |
| "mean_token_accuracy": 0.8608288259179717, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 0.7373901169241274, | |
| "grad_norm": 0.23750273636894062, | |
| "learning_rate": 3.915542717790759e-06, | |
| "loss": 0.4087, | |
| "mean_token_accuracy": 0.8604965692097658, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.7387556541776905, | |
| "grad_norm": 0.22536670381141075, | |
| "learning_rate": 3.877722335935394e-06, | |
| "loss": 0.4274, | |
| "mean_token_accuracy": 0.8535887513090156, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 0.7401211914312538, | |
| "grad_norm": 0.2337249133077971, | |
| "learning_rate": 3.840041513821243e-06, | |
| "loss": 0.4252, | |
| "mean_token_accuracy": 0.8543320104591253, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 0.7414867286848169, | |
| "grad_norm": 0.23006457934706245, | |
| "learning_rate": 3.802501110397553e-06, | |
| "loss": 0.4154, | |
| "mean_token_accuracy": 0.8595805110377353, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 0.7428522659383802, | |
| "grad_norm": 0.2264725327560074, | |
| "learning_rate": 3.7651019814126656e-06, | |
| "loss": 0.3978, | |
| "mean_token_accuracy": 0.8640107757543986, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 0.7442178031919433, | |
| "grad_norm": 0.21662115139126908, | |
| "learning_rate": 3.727844979394526e-06, | |
| "loss": 0.4171, | |
| "mean_token_accuracy": 0.8583177721469002, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.7455833404455066, | |
| "grad_norm": 0.24171854514556812, | |
| "learning_rate": 3.6907309536312276e-06, | |
| "loss": 0.4157, | |
| "mean_token_accuracy": 0.8568488785351885, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 0.7469488776990697, | |
| "grad_norm": 0.20105071964248444, | |
| "learning_rate": 3.6537607501516716e-06, | |
| "loss": 0.3969, | |
| "mean_token_accuracy": 0.8654952584767601, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 0.748314414952633, | |
| "grad_norm": 0.22792825494280627, | |
| "learning_rate": 3.616935211706275e-06, | |
| "loss": 0.3932, | |
| "mean_token_accuracy": 0.8653115392045422, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 0.7496799522061961, | |
| "grad_norm": 0.2314969835671945, | |
| "learning_rate": 3.5802551777477477e-06, | |
| "loss": 0.3936, | |
| "mean_token_accuracy": 0.8653473275464092, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 0.7510454894597594, | |
| "grad_norm": 0.21855230450778143, | |
| "learning_rate": 3.543721484411976e-06, | |
| "loss": 0.4186, | |
| "mean_token_accuracy": 0.8572386847718454, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.7524110267133225, | |
| "grad_norm": 0.2179108335195832, | |
| "learning_rate": 3.5073349644989563e-06, | |
| "loss": 0.3948, | |
| "mean_token_accuracy": 0.8645206681557961, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 0.7537765639668857, | |
| "grad_norm": 0.2216215829360909, | |
| "learning_rate": 3.4710964474537967e-06, | |
| "loss": 0.3988, | |
| "mean_token_accuracy": 0.8638077505445968, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 0.7551421012204489, | |
| "grad_norm": 0.2693768802921377, | |
| "learning_rate": 3.435006759347835e-06, | |
| "loss": 0.4142, | |
| "mean_token_accuracy": 0.8578324828930424, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 0.7565076384740121, | |
| "grad_norm": 0.2226472320558841, | |
| "learning_rate": 3.3990667228597816e-06, | |
| "loss": 0.4028, | |
| "mean_token_accuracy": 0.8616278351864007, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 0.7578731757275753, | |
| "grad_norm": 0.23046863738461842, | |
| "learning_rate": 3.3632771572569878e-06, | |
| "loss": 0.4096, | |
| "mean_token_accuracy": 0.858772543145848, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.7592387129811385, | |
| "grad_norm": 0.21268282367594163, | |
| "learning_rate": 3.3276388783767644e-06, | |
| "loss": 0.4126, | |
| "mean_token_accuracy": 0.8590454775080085, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 0.7606042502347017, | |
| "grad_norm": 0.2258472656546357, | |
| "learning_rate": 3.292152698607768e-06, | |
| "loss": 0.3868, | |
| "mean_token_accuracy": 0.8656557706456838, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 0.7619697874882649, | |
| "grad_norm": 0.21882486076601618, | |
| "learning_rate": 3.256819426871507e-06, | |
| "loss": 0.4104, | |
| "mean_token_accuracy": 0.8588001715592681, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 0.7633353247418281, | |
| "grad_norm": 0.20092513277692103, | |
| "learning_rate": 3.221639868603893e-06, | |
| "loss": 0.4017, | |
| "mean_token_accuracy": 0.8617123529703908, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 0.7647008619953913, | |
| "grad_norm": 0.2121347752460375, | |
| "learning_rate": 3.1866148257368666e-06, | |
| "loss": 0.3975, | |
| "mean_token_accuracy": 0.8637410178015515, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.7660663992489545, | |
| "grad_norm": 0.22278871306746495, | |
| "learning_rate": 3.15174509668014e-06, | |
| "loss": 0.4025, | |
| "mean_token_accuracy": 0.8612379444311294, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 0.7674319365025177, | |
| "grad_norm": 0.23213748039939272, | |
| "learning_rate": 3.117031476302975e-06, | |
| "loss": 0.4055, | |
| "mean_token_accuracy": 0.8602274083124468, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 0.7687974737560809, | |
| "grad_norm": 0.2254901351758214, | |
| "learning_rate": 3.082474755916084e-06, | |
| "loss": 0.4075, | |
| "mean_token_accuracy": 0.8596426308589397, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 0.7701630110096441, | |
| "grad_norm": 0.2005034006194305, | |
| "learning_rate": 3.0480757232535773e-06, | |
| "loss": 0.3981, | |
| "mean_token_accuracy": 0.8630627269518893, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 0.7715285482632073, | |
| "grad_norm": 0.19345269846448648, | |
| "learning_rate": 3.0138351624550165e-06, | |
| "loss": 0.4049, | |
| "mean_token_accuracy": 0.8611821074357777, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.7728940855167705, | |
| "grad_norm": 0.20062995179443605, | |
| "learning_rate": 2.9797538540475223e-06, | |
| "loss": 0.3988, | |
| "mean_token_accuracy": 0.8634990578142336, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 0.7742596227703337, | |
| "grad_norm": 0.23164980231281976, | |
| "learning_rate": 2.945832574928006e-06, | |
| "loss": 0.4215, | |
| "mean_token_accuracy": 0.8557358468375269, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 0.7756251600238969, | |
| "grad_norm": 0.20342056199797945, | |
| "learning_rate": 2.9120720983454465e-06, | |
| "loss": 0.4121, | |
| "mean_token_accuracy": 0.8570414758839693, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 0.7769906972774601, | |
| "grad_norm": 0.22910941122839265, | |
| "learning_rate": 2.8784731938832556e-06, | |
| "loss": 0.4087, | |
| "mean_token_accuracy": 0.8596079874910577, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 0.7783562345310233, | |
| "grad_norm": 0.21416157641581723, | |
| "learning_rate": 2.845036627441755e-06, | |
| "loss": 0.3833, | |
| "mean_token_accuracy": 0.8691450521802337, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.7797217717845865, | |
| "grad_norm": 0.1977219313107081, | |
| "learning_rate": 2.8117631612207084e-06, | |
| "loss": 0.4121, | |
| "mean_token_accuracy": 0.8610763435801025, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 0.7810873090381497, | |
| "grad_norm": 0.20321588988831457, | |
| "learning_rate": 2.778653553701932e-06, | |
| "loss": 0.4057, | |
| "mean_token_accuracy": 0.8604487017005163, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 0.7824528462917129, | |
| "grad_norm": 0.21685215914126496, | |
| "learning_rate": 2.745708559632032e-06, | |
| "loss": 0.3965, | |
| "mean_token_accuracy": 0.8631075593834338, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 0.7838183835452761, | |
| "grad_norm": 0.21247170342394922, | |
| "learning_rate": 2.7129289300051788e-06, | |
| "loss": 0.4066, | |
| "mean_token_accuracy": 0.8607861090034046, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 0.7851839207988393, | |
| "grad_norm": 0.22155579546490167, | |
| "learning_rate": 2.6803154120460007e-06, | |
| "loss": 0.4072, | |
| "mean_token_accuracy": 0.8621495738744454, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.7865494580524025, | |
| "grad_norm": 0.20936523040531713, | |
| "learning_rate": 2.647868749192536e-06, | |
| "loss": 0.4187, | |
| "mean_token_accuracy": 0.8561885820547165, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 0.7879149953059656, | |
| "grad_norm": 0.2169635727623937, | |
| "learning_rate": 2.6155896810793036e-06, | |
| "loss": 0.4118, | |
| "mean_token_accuracy": 0.8589148130153295, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 0.7892805325595289, | |
| "grad_norm": 0.20681767217525526, | |
| "learning_rate": 2.5834789435204245e-06, | |
| "loss": 0.4088, | |
| "mean_token_accuracy": 0.8606342267092991, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 0.790646069813092, | |
| "grad_norm": 0.2213528976805314, | |
| "learning_rate": 2.5515372684928687e-06, | |
| "loss": 0.4222, | |
| "mean_token_accuracy": 0.8550210063940413, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 0.7920116070666553, | |
| "grad_norm": 0.22149304881479331, | |
| "learning_rate": 2.5197653841197546e-06, | |
| "loss": 0.4114, | |
| "mean_token_accuracy": 0.8591611937568427, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.7933771443202184, | |
| "grad_norm": 0.19889209166062613, | |
| "learning_rate": 2.48816401465375e-06, | |
| "loss": 0.4057, | |
| "mean_token_accuracy": 0.861537611573371, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 0.7947426815737817, | |
| "grad_norm": 0.24049345793108362, | |
| "learning_rate": 2.4567338804605756e-06, | |
| "loss": 0.4, | |
| "mean_token_accuracy": 0.86237969255477, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 0.7961082188273448, | |
| "grad_norm": 0.23885430272558048, | |
| "learning_rate": 2.425475698002577e-06, | |
| "loss": 0.4154, | |
| "mean_token_accuracy": 0.8575069291832286, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 0.7974737560809081, | |
| "grad_norm": 0.22824330369698562, | |
| "learning_rate": 2.394390179822382e-06, | |
| "loss": 0.4182, | |
| "mean_token_accuracy": 0.8576008710768586, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 0.7988392933344712, | |
| "grad_norm": 0.21974915679821652, | |
| "learning_rate": 2.3634780345266805e-06, | |
| "loss": 0.3963, | |
| "mean_token_accuracy": 0.8647796993722388, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.8002048305880345, | |
| "grad_norm": 0.2310148673403252, | |
| "learning_rate": 2.332739966770048e-06, | |
| "loss": 0.4005, | |
| "mean_token_accuracy": 0.8606528012118717, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 0.8015703678415976, | |
| "grad_norm": 0.21268357727111933, | |
| "learning_rate": 2.3021766772388986e-06, | |
| "loss": 0.4104, | |
| "mean_token_accuracy": 0.8596851429925462, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 0.8029359050951609, | |
| "grad_norm": 0.22436026449039995, | |
| "learning_rate": 2.271788862635513e-06, | |
| "loss": 0.415, | |
| "mean_token_accuracy": 0.8579554313395271, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 0.804301442348724, | |
| "grad_norm": 0.2401381634931562, | |
| "learning_rate": 2.2415772156621387e-06, | |
| "loss": 0.4281, | |
| "mean_token_accuracy": 0.8525374331069293, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 0.8056669796022873, | |
| "grad_norm": 0.23642180241330024, | |
| "learning_rate": 2.211542425005223e-06, | |
| "loss": 0.427, | |
| "mean_token_accuracy": 0.8546586111147729, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.8070325168558504, | |
| "grad_norm": 0.2115384062221628, | |
| "learning_rate": 2.1816851753197023e-06, | |
| "loss": 0.4114, | |
| "mean_token_accuracy": 0.8595850270091632, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 0.8083980541094137, | |
| "grad_norm": 0.21400197301419835, | |
| "learning_rate": 2.1520061472133903e-06, | |
| "loss": 0.4163, | |
| "mean_token_accuracy": 0.8604726451795677, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 0.8097635913629768, | |
| "grad_norm": 0.2017117241684196, | |
| "learning_rate": 2.1225060172314773e-06, | |
| "loss": 0.3995, | |
| "mean_token_accuracy": 0.8632549469473196, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 0.8111291286165401, | |
| "grad_norm": 0.19894011888129304, | |
| "learning_rate": 2.0931854578410904e-06, | |
| "loss": 0.3991, | |
| "mean_token_accuracy": 0.8642028385916231, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 0.8124946658701032, | |
| "grad_norm": 0.2298631393329404, | |
| "learning_rate": 2.064045137415982e-06, | |
| "loss": 0.4166, | |
| "mean_token_accuracy": 0.8576577016298331, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.8138602031236665, | |
| "grad_norm": 0.19264903302284703, | |
| "learning_rate": 2.0350857202212883e-06, | |
| "loss": 0.4031, | |
| "mean_token_accuracy": 0.8630792968512606, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 0.8152257403772296, | |
| "grad_norm": 0.24074756197143313, | |
| "learning_rate": 2.0063078663983716e-06, | |
| "loss": 0.3853, | |
| "mean_token_accuracy": 0.867487619049804, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 0.8165912776307929, | |
| "grad_norm": 0.22693604058548678, | |
| "learning_rate": 1.977712231949799e-06, | |
| "loss": 0.4025, | |
| "mean_token_accuracy": 0.8613917192076461, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 0.817956814884356, | |
| "grad_norm": 0.2158385478614588, | |
| "learning_rate": 1.9492994687243715e-06, | |
| "loss": 0.4148, | |
| "mean_token_accuracy": 0.8566573413371225, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 0.8193223521379193, | |
| "grad_norm": 0.2531283651455049, | |
| "learning_rate": 1.9210702244022616e-06, | |
| "loss": 0.4223, | |
| "mean_token_accuracy": 0.8552704816161152, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.8206878893914825, | |
| "grad_norm": 0.22124530534828935, | |
| "learning_rate": 1.8930251424802648e-06, | |
| "loss": 0.4173, | |
| "mean_token_accuracy": 0.857422562323455, | |
| "step": 601 | |
| }, | |
| { | |
| "epoch": 0.8220534266450457, | |
| "grad_norm": 0.22916123571012667, | |
| "learning_rate": 1.8651648622571128e-06, | |
| "loss": 0.4093, | |
| "mean_token_accuracy": 0.8592705248451191, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 0.8234189638986089, | |
| "grad_norm": 0.23653504780930373, | |
| "learning_rate": 1.8374900188189172e-06, | |
| "loss": 0.4115, | |
| "mean_token_accuracy": 0.859768333670514, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 0.8247845011521721, | |
| "grad_norm": 0.2335856356579208, | |
| "learning_rate": 1.8100012430246838e-06, | |
| "loss": 0.3975, | |
| "mean_token_accuracy": 0.8634662846156729, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 0.8261500384057353, | |
| "grad_norm": 0.25602917462585195, | |
| "learning_rate": 1.7826991614919264e-06, | |
| "loss": 0.399, | |
| "mean_token_accuracy": 0.8641089744244426, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.8275155756592985, | |
| "grad_norm": 0.2764089716972941, | |
| "learning_rate": 1.7555843965823992e-06, | |
| "loss": 0.4155, | |
| "mean_token_accuracy": 0.8591620612370429, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 0.8288811129128617, | |
| "grad_norm": 0.22584056903219576, | |
| "learning_rate": 1.728657566387888e-06, | |
| "loss": 0.3899, | |
| "mean_token_accuracy": 0.8659440792321365, | |
| "step": 607 | |
| }, | |
| { | |
| "epoch": 0.8302466501664248, | |
| "grad_norm": 0.20560426304959245, | |
| "learning_rate": 1.7019192847161425e-06, | |
| "loss": 0.4031, | |
| "mean_token_accuracy": 0.8641318011003372, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 0.8316121874199881, | |
| "grad_norm": 0.2153106481024899, | |
| "learning_rate": 1.6753701610768724e-06, | |
| "loss": 0.4017, | |
| "mean_token_accuracy": 0.8615005117867286, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 0.8329777246735512, | |
| "grad_norm": 0.21220319167956136, | |
| "learning_rate": 1.6490108006678495e-06, | |
| "loss": 0.4252, | |
| "mean_token_accuracy": 0.8564356615328043, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.8343432619271145, | |
| "grad_norm": 0.22552058407128117, | |
| "learning_rate": 1.6228418043611227e-06, | |
| "loss": 0.4168, | |
| "mean_token_accuracy": 0.8588175846725294, | |
| "step": 611 | |
| }, | |
| { | |
| "epoch": 0.8357087991806776, | |
| "grad_norm": 0.24876240121034418, | |
| "learning_rate": 1.5968637686893186e-06, | |
| "loss": 0.4094, | |
| "mean_token_accuracy": 0.8588914171490348, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 0.8370743364342409, | |
| "grad_norm": 0.21709497476785303, | |
| "learning_rate": 1.57107728583203e-06, | |
| "loss": 0.3742, | |
| "mean_token_accuracy": 0.8702544465648172, | |
| "step": 613 | |
| }, | |
| { | |
| "epoch": 0.838439873687804, | |
| "grad_norm": 0.2300739934314909, | |
| "learning_rate": 1.5454829436023411e-06, | |
| "loss": 0.4011, | |
| "mean_token_accuracy": 0.8626383013253395, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 0.8398054109413673, | |
| "grad_norm": 0.2233661831188188, | |
| "learning_rate": 1.5200813254334013e-06, | |
| "loss": 0.4013, | |
| "mean_token_accuracy": 0.8617884600134901, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.8411709481949304, | |
| "grad_norm": 0.2352272063568746, | |
| "learning_rate": 1.4948730103651498e-06, | |
| "loss": 0.3927, | |
| "mean_token_accuracy": 0.8648909153839965, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 0.8425364854484937, | |
| "grad_norm": 0.2040512264971376, | |
| "learning_rate": 1.4698585730311e-06, | |
| "loss": 0.409, | |
| "mean_token_accuracy": 0.8602341351681331, | |
| "step": 617 | |
| }, | |
| { | |
| "epoch": 0.8439020227020568, | |
| "grad_norm": 0.2318894554427145, | |
| "learning_rate": 1.445038583645243e-06, | |
| "loss": 0.4161, | |
| "mean_token_accuracy": 0.8594872435807032, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 0.8452675599556201, | |
| "grad_norm": 0.20445356255956093, | |
| "learning_rate": 1.4204136079890585e-06, | |
| "loss": 0.4031, | |
| "mean_token_accuracy": 0.8638401437613202, | |
| "step": 619 | |
| }, | |
| { | |
| "epoch": 0.8466330972091832, | |
| "grad_norm": 0.21641039619872623, | |
| "learning_rate": 1.3959842073986085e-06, | |
| "loss": 0.3941, | |
| "mean_token_accuracy": 0.8646075956314934, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.8479986344627465, | |
| "grad_norm": 0.20773725495938888, | |
| "learning_rate": 1.3717509387517393e-06, | |
| "loss": 0.4082, | |
| "mean_token_accuracy": 0.860045570440718, | |
| "step": 621 | |
| }, | |
| { | |
| "epoch": 0.8493641717163096, | |
| "grad_norm": 0.21271505922050996, | |
| "learning_rate": 1.3477143544553994e-06, | |
| "loss": 0.4082, | |
| "mean_token_accuracy": 0.8599915463398872, | |
| "step": 622 | |
| }, | |
| { | |
| "epoch": 0.8507297089698729, | |
| "grad_norm": 0.22401954459630313, | |
| "learning_rate": 1.3238750024330338e-06, | |
| "loss": 0.4192, | |
| "mean_token_accuracy": 0.8563828512549275, | |
| "step": 623 | |
| }, | |
| { | |
| "epoch": 0.852095246223436, | |
| "grad_norm": 0.22260874162883426, | |
| "learning_rate": 1.300233426112103e-06, | |
| "loss": 0.4048, | |
| "mean_token_accuracy": 0.8611301206735221, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 0.8534607834769993, | |
| "grad_norm": 0.21152827385871018, | |
| "learning_rate": 1.2767901644116943e-06, | |
| "loss": 0.4146, | |
| "mean_token_accuracy": 0.8595371254035457, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.8548263207305624, | |
| "grad_norm": 0.22819406153754265, | |
| "learning_rate": 1.2535457517302262e-06, | |
| "loss": 0.4084, | |
| "mean_token_accuracy": 0.860028563771859, | |
| "step": 626 | |
| }, | |
| { | |
| "epoch": 0.8561918579841257, | |
| "grad_norm": 0.20005533487863839, | |
| "learning_rate": 1.2305007179332851e-06, | |
| "loss": 0.4017, | |
| "mean_token_accuracy": 0.8622309813185363, | |
| "step": 627 | |
| }, | |
| { | |
| "epoch": 0.8575573952376888, | |
| "grad_norm": 0.20176762634176584, | |
| "learning_rate": 1.2076555883415342e-06, | |
| "loss": 0.4277, | |
| "mean_token_accuracy": 0.8536327333990061, | |
| "step": 628 | |
| }, | |
| { | |
| "epoch": 0.8589229324912521, | |
| "grad_norm": 0.2011241526651703, | |
| "learning_rate": 1.1850108837187336e-06, | |
| "loss": 0.4086, | |
| "mean_token_accuracy": 0.8592316098999795, | |
| "step": 629 | |
| }, | |
| { | |
| "epoch": 0.8602884697448152, | |
| "grad_norm": 0.22681886387157124, | |
| "learning_rate": 1.1625671202598875e-06, | |
| "loss": 0.4146, | |
| "mean_token_accuracy": 0.8590380670131639, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.8616540069983785, | |
| "grad_norm": 0.21912007754709245, | |
| "learning_rate": 1.1403248095794629e-06, | |
| "loss": 0.3988, | |
| "mean_token_accuracy": 0.862137042972187, | |
| "step": 631 | |
| }, | |
| { | |
| "epoch": 0.8630195442519416, | |
| "grad_norm": 0.1991013202311376, | |
| "learning_rate": 1.1182844586997266e-06, | |
| "loss": 0.4096, | |
| "mean_token_accuracy": 0.8605257073793695, | |
| "step": 632 | |
| }, | |
| { | |
| "epoch": 0.8643850815055049, | |
| "grad_norm": 0.23467672143416735, | |
| "learning_rate": 1.0964465700391979e-06, | |
| "loss": 0.4108, | |
| "mean_token_accuracy": 0.8596458375918582, | |
| "step": 633 | |
| }, | |
| { | |
| "epoch": 0.865750618759068, | |
| "grad_norm": 0.21912392965364608, | |
| "learning_rate": 1.074811641401189e-06, | |
| "loss": 0.4262, | |
| "mean_token_accuracy": 0.8559964604503356, | |
| "step": 634 | |
| }, | |
| { | |
| "epoch": 0.8671161560126313, | |
| "grad_norm": 0.20615463939848364, | |
| "learning_rate": 1.0533801659624531e-06, | |
| "loss": 0.4204, | |
| "mean_token_accuracy": 0.8560936656792137, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.8684816932661944, | |
| "grad_norm": 0.19348528692872238, | |
| "learning_rate": 1.0321526322619536e-06, | |
| "loss": 0.4037, | |
| "mean_token_accuracy": 0.8607088903270385, | |
| "step": 636 | |
| }, | |
| { | |
| "epoch": 0.8698472305197577, | |
| "grad_norm": 0.20196629259646406, | |
| "learning_rate": 1.0111295241897156e-06, | |
| "loss": 0.4049, | |
| "mean_token_accuracy": 0.8631067104794972, | |
| "step": 637 | |
| }, | |
| { | |
| "epoch": 0.8712127677733208, | |
| "grad_norm": 0.2106867654585255, | |
| "learning_rate": 9.903113209758098e-07, | |
| "loss": 0.4042, | |
| "mean_token_accuracy": 0.8603632609752478, | |
| "step": 638 | |
| }, | |
| { | |
| "epoch": 0.872578305026884, | |
| "grad_norm": 0.20073052012465886, | |
| "learning_rate": 9.696984971794066e-07, | |
| "loss": 0.3981, | |
| "mean_token_accuracy": 0.8624080819096971, | |
| "step": 639 | |
| }, | |
| { | |
| "epoch": 0.8739438422804472, | |
| "grad_norm": 0.20533835471618186, | |
| "learning_rate": 9.492915226779809e-07, | |
| "loss": 0.3915, | |
| "mean_token_accuracy": 0.8646312124436534, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.8753093795340104, | |
| "grad_norm": 0.19390504484310184, | |
| "learning_rate": 9.290908626565931e-07, | |
| "loss": 0.4064, | |
| "mean_token_accuracy": 0.8629363178699737, | |
| "step": 641 | |
| }, | |
| { | |
| "epoch": 0.8766749167875736, | |
| "grad_norm": 0.21843007849082088, | |
| "learning_rate": 9.090969775972736e-07, | |
| "loss": 0.4028, | |
| "mean_token_accuracy": 0.8616676869214925, | |
| "step": 642 | |
| }, | |
| { | |
| "epoch": 0.8780404540411368, | |
| "grad_norm": 0.2219811855088307, | |
| "learning_rate": 8.89310323268544e-07, | |
| "loss": 0.4001, | |
| "mean_token_accuracy": 0.8622636044555403, | |
| "step": 643 | |
| }, | |
| { | |
| "epoch": 0.8794059912947, | |
| "grad_norm": 0.24251626425587186, | |
| "learning_rate": 8.697313507150184e-07, | |
| "loss": 0.4033, | |
| "mean_token_accuracy": 0.8601248910938767, | |
| "step": 644 | |
| }, | |
| { | |
| "epoch": 0.8807715285482632, | |
| "grad_norm": 0.19641237184380816, | |
| "learning_rate": 8.503605062471187e-07, | |
| "loss": 0.4061, | |
| "mean_token_accuracy": 0.8596051420978411, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.8821370658018264, | |
| "grad_norm": 0.20131416744818392, | |
| "learning_rate": 8.311982314309109e-07, | |
| "loss": 0.4113, | |
| "mean_token_accuracy": 0.8598440955132611, | |
| "step": 646 | |
| }, | |
| { | |
| "epoch": 0.8835026030553896, | |
| "grad_norm": 0.21190627535161183, | |
| "learning_rate": 8.122449630780238e-07, | |
| "loss": 0.416, | |
| "mean_token_accuracy": 0.8575635125425897, | |
| "step": 647 | |
| }, | |
| { | |
| "epoch": 0.8848681403089528, | |
| "grad_norm": 0.2167630347166107, | |
| "learning_rate": 7.935011332357113e-07, | |
| "loss": 0.4242, | |
| "mean_token_accuracy": 0.8560313013882472, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 0.886233677562516, | |
| "grad_norm": 0.19473548273128163, | |
| "learning_rate": 7.749671691769911e-07, | |
| "loss": 0.3845, | |
| "mean_token_accuracy": 0.8675812913768379, | |
| "step": 649 | |
| }, | |
| { | |
| "epoch": 0.8875992148160792, | |
| "grad_norm": 0.2060396098320868, | |
| "learning_rate": 7.566434933909006e-07, | |
| "loss": 0.4075, | |
| "mean_token_accuracy": 0.8595329796977811, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.8889647520696424, | |
| "grad_norm": 0.20145474208727362, | |
| "learning_rate": 7.385305235728801e-07, | |
| "loss": 0.4082, | |
| "mean_token_accuracy": 0.8592664341512477, | |
| "step": 651 | |
| }, | |
| { | |
| "epoch": 0.8903302893232056, | |
| "grad_norm": 0.19909231314570291, | |
| "learning_rate": 7.206286726152434e-07, | |
| "loss": 0.4054, | |
| "mean_token_accuracy": 0.8594394158909082, | |
| "step": 652 | |
| }, | |
| { | |
| "epoch": 0.8916958265767688, | |
| "grad_norm": 0.1919793886619024, | |
| "learning_rate": 7.029383485977625e-07, | |
| "loss": 0.4035, | |
| "mean_token_accuracy": 0.8634067122462318, | |
| "step": 653 | |
| }, | |
| { | |
| "epoch": 0.893061363830332, | |
| "grad_norm": 0.20544119012492554, | |
| "learning_rate": 6.854599547783736e-07, | |
| "loss": 0.4058, | |
| "mean_token_accuracy": 0.8596657002546876, | |
| "step": 654 | |
| }, | |
| { | |
| "epoch": 0.8944269010838952, | |
| "grad_norm": 0.2228496860347481, | |
| "learning_rate": 6.681938895839746e-07, | |
| "loss": 0.4002, | |
| "mean_token_accuracy": 0.862631288272368, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.8957924383374584, | |
| "grad_norm": 0.19018327112122752, | |
| "learning_rate": 6.511405466013532e-07, | |
| "loss": 0.418, | |
| "mean_token_accuracy": 0.8573043089730195, | |
| "step": 656 | |
| }, | |
| { | |
| "epoch": 0.8971579755910216, | |
| "grad_norm": 0.19393215931484176, | |
| "learning_rate": 6.343003145682114e-07, | |
| "loss": 0.4026, | |
| "mean_token_accuracy": 0.8614415069733373, | |
| "step": 657 | |
| }, | |
| { | |
| "epoch": 0.8985235128445848, | |
| "grad_norm": 0.20746626572888896, | |
| "learning_rate": 6.176735773642962e-07, | |
| "loss": 0.4043, | |
| "mean_token_accuracy": 0.8609983269079549, | |
| "step": 658 | |
| }, | |
| { | |
| "epoch": 0.899889050098148, | |
| "grad_norm": 0.23075626629760573, | |
| "learning_rate": 6.012607140026605e-07, | |
| "loss": 0.4122, | |
| "mean_token_accuracy": 0.8590837704293297, | |
| "step": 659 | |
| }, | |
| { | |
| "epoch": 0.9012545873517112, | |
| "grad_norm": 0.21081958088922736, | |
| "learning_rate": 5.850620986210198e-07, | |
| "loss": 0.3963, | |
| "mean_token_accuracy": 0.8651597082259812, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.9026201246052744, | |
| "grad_norm": 0.21363684414515646, | |
| "learning_rate": 5.69078100473216e-07, | |
| "loss": 0.4012, | |
| "mean_token_accuracy": 0.8625379850354203, | |
| "step": 661 | |
| }, | |
| { | |
| "epoch": 0.9039856618588376, | |
| "grad_norm": 0.19570623175347346, | |
| "learning_rate": 5.533090839208133e-07, | |
| "loss": 0.414, | |
| "mean_token_accuracy": 0.8574432460527732, | |
| "step": 662 | |
| }, | |
| { | |
| "epoch": 0.9053511991124008, | |
| "grad_norm": 0.20136402481480803, | |
| "learning_rate": 5.377554084247772e-07, | |
| "loss": 0.4064, | |
| "mean_token_accuracy": 0.8597524152182047, | |
| "step": 663 | |
| }, | |
| { | |
| "epoch": 0.906716736365964, | |
| "grad_norm": 0.20829638794206976, | |
| "learning_rate": 5.224174285372973e-07, | |
| "loss": 0.4066, | |
| "mean_token_accuracy": 0.8612470283150278, | |
| "step": 664 | |
| }, | |
| { | |
| "epoch": 0.9080822736195272, | |
| "grad_norm": 0.20559896049133375, | |
| "learning_rate": 5.072954938936925e-07, | |
| "loss": 0.3977, | |
| "mean_token_accuracy": 0.8636562342939346, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.9094478108730903, | |
| "grad_norm": 0.24180013304406187, | |
| "learning_rate": 4.923899492044437e-07, | |
| "loss": 0.4071, | |
| "mean_token_accuracy": 0.8618068633494765, | |
| "step": 666 | |
| }, | |
| { | |
| "epoch": 0.9108133481266536, | |
| "grad_norm": 0.20544052086394768, | |
| "learning_rate": 4.777011342473392e-07, | |
| "loss": 0.4069, | |
| "mean_token_accuracy": 0.8607662403616217, | |
| "step": 667 | |
| }, | |
| { | |
| "epoch": 0.9121788853802167, | |
| "grad_norm": 0.20218844698835448, | |
| "learning_rate": 4.632293838597246e-07, | |
| "loss": 0.4086, | |
| "mean_token_accuracy": 0.8614379669227725, | |
| "step": 668 | |
| }, | |
| { | |
| "epoch": 0.91354442263378, | |
| "grad_norm": 0.19849540081460978, | |
| "learning_rate": 4.4897502793087576e-07, | |
| "loss": 0.4041, | |
| "mean_token_accuracy": 0.8642634187828342, | |
| "step": 669 | |
| }, | |
| { | |
| "epoch": 0.9149099598873431, | |
| "grad_norm": 0.20745329155441244, | |
| "learning_rate": 4.3493839139447716e-07, | |
| "loss": 0.4348, | |
| "mean_token_accuracy": 0.8541953811730532, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.9162754971409064, | |
| "grad_norm": 0.2287863594624806, | |
| "learning_rate": 4.2111979422120863e-07, | |
| "loss": 0.4053, | |
| "mean_token_accuracy": 0.8608941533244443, | |
| "step": 671 | |
| }, | |
| { | |
| "epoch": 0.9176410343944695, | |
| "grad_norm": 0.1925356796364094, | |
| "learning_rate": 4.075195514114594e-07, | |
| "loss": 0.3803, | |
| "mean_token_accuracy": 0.8682317326543381, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 0.9190065716480328, | |
| "grad_norm": 0.18548297023650745, | |
| "learning_rate": 3.941379729881456e-07, | |
| "loss": 0.3944, | |
| "mean_token_accuracy": 0.864907673080918, | |
| "step": 673 | |
| }, | |
| { | |
| "epoch": 0.9203721089015959, | |
| "grad_norm": 0.21719662700717418, | |
| "learning_rate": 3.8097536398963965e-07, | |
| "loss": 0.3976, | |
| "mean_token_accuracy": 0.8632850150504787, | |
| "step": 674 | |
| }, | |
| { | |
| "epoch": 0.9217376461551592, | |
| "grad_norm": 0.21291688480092102, | |
| "learning_rate": 3.6803202446282217e-07, | |
| "loss": 0.4331, | |
| "mean_token_accuracy": 0.8520979656086647, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.9231031834087223, | |
| "grad_norm": 0.19317640877157852, | |
| "learning_rate": 3.553082494562354e-07, | |
| "loss": 0.4044, | |
| "mean_token_accuracy": 0.860968928653511, | |
| "step": 676 | |
| }, | |
| { | |
| "epoch": 0.9244687206622856, | |
| "grad_norm": 0.20029096615777936, | |
| "learning_rate": 3.4280432901336423e-07, | |
| "loss": 0.4327, | |
| "mean_token_accuracy": 0.8523410994278862, | |
| "step": 677 | |
| }, | |
| { | |
| "epoch": 0.9258342579158487, | |
| "grad_norm": 0.19089996560484057, | |
| "learning_rate": 3.3052054816602455e-07, | |
| "loss": 0.4118, | |
| "mean_token_accuracy": 0.8587402967704104, | |
| "step": 678 | |
| }, | |
| { | |
| "epoch": 0.927199795169412, | |
| "grad_norm": 0.21562854865168446, | |
| "learning_rate": 3.1845718692785743e-07, | |
| "loss": 0.3998, | |
| "mean_token_accuracy": 0.863486058955801, | |
| "step": 679 | |
| }, | |
| { | |
| "epoch": 0.9285653324229751, | |
| "grad_norm": 0.2074181207840677, | |
| "learning_rate": 3.0661452028795335e-07, | |
| "loss": 0.418, | |
| "mean_token_accuracy": 0.8581571716176858, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.9299308696765384, | |
| "grad_norm": 0.19559493828044613, | |
| "learning_rate": 2.949928182045869e-07, | |
| "loss": 0.3859, | |
| "mean_token_accuracy": 0.8668547803361453, | |
| "step": 681 | |
| }, | |
| { | |
| "epoch": 0.9312964069301015, | |
| "grad_norm": 0.20748047485633067, | |
| "learning_rate": 2.835923455990508e-07, | |
| "loss": 0.4038, | |
| "mean_token_accuracy": 0.862158416990831, | |
| "step": 682 | |
| }, | |
| { | |
| "epoch": 0.9326619441836648, | |
| "grad_norm": 0.20156584121794527, | |
| "learning_rate": 2.7241336234962943e-07, | |
| "loss": 0.3962, | |
| "mean_token_accuracy": 0.864386738221901, | |
| "step": 683 | |
| }, | |
| { | |
| "epoch": 0.9340274814372279, | |
| "grad_norm": 0.18328492764164395, | |
| "learning_rate": 2.614561232856672e-07, | |
| "loss": 0.3818, | |
| "mean_token_accuracy": 0.8677611815631928, | |
| "step": 684 | |
| }, | |
| { | |
| "epoch": 0.9353930186907912, | |
| "grad_norm": 0.20904145120342477, | |
| "learning_rate": 2.507208781817638e-07, | |
| "loss": 0.4285, | |
| "mean_token_accuracy": 0.8544474486773861, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.9367585559443543, | |
| "grad_norm": 0.20813281413241, | |
| "learning_rate": 2.402078717520795e-07, | |
| "loss": 0.413, | |
| "mean_token_accuracy": 0.8585056450386133, | |
| "step": 686 | |
| }, | |
| { | |
| "epoch": 0.9381240931979176, | |
| "grad_norm": 0.19274067038403314, | |
| "learning_rate": 2.2991734364475214e-07, | |
| "loss": 0.4253, | |
| "mean_token_accuracy": 0.8552607489362258, | |
| "step": 687 | |
| }, | |
| { | |
| "epoch": 0.9394896304514807, | |
| "grad_norm": 0.18657512512018273, | |
| "learning_rate": 2.1984952843644104e-07, | |
| "loss": 0.4048, | |
| "mean_token_accuracy": 0.8605767691236879, | |
| "step": 688 | |
| }, | |
| { | |
| "epoch": 0.940855167705044, | |
| "grad_norm": 0.20415038653399858, | |
| "learning_rate": 2.1000465562697858e-07, | |
| "loss": 0.403, | |
| "mean_token_accuracy": 0.8614550447092127, | |
| "step": 689 | |
| }, | |
| { | |
| "epoch": 0.9422207049586071, | |
| "grad_norm": 0.18802302287270456, | |
| "learning_rate": 2.0038294963413251e-07, | |
| "loss": 0.4102, | |
| "mean_token_accuracy": 0.8589263672995899, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.9435862422121704, | |
| "grad_norm": 0.20800845771223211, | |
| "learning_rate": 1.9098462978849875e-07, | |
| "loss": 0.4061, | |
| "mean_token_accuracy": 0.8600364168613337, | |
| "step": 691 | |
| }, | |
| { | |
| "epoch": 0.9449517794657335, | |
| "grad_norm": 0.21444000519203277, | |
| "learning_rate": 1.8180991032849426e-07, | |
| "loss": 0.4256, | |
| "mean_token_accuracy": 0.8541757101430708, | |
| "step": 692 | |
| }, | |
| { | |
| "epoch": 0.9463173167192968, | |
| "grad_norm": 0.2049726644452516, | |
| "learning_rate": 1.7285900039547997e-07, | |
| "loss": 0.3977, | |
| "mean_token_accuracy": 0.8644176084772834, | |
| "step": 693 | |
| }, | |
| { | |
| "epoch": 0.9476828539728599, | |
| "grad_norm": 0.19845365768398393, | |
| "learning_rate": 1.6413210402898895e-07, | |
| "loss": 0.422, | |
| "mean_token_accuracy": 0.8564894930291981, | |
| "step": 694 | |
| }, | |
| { | |
| "epoch": 0.9490483912264231, | |
| "grad_norm": 0.20134810058434158, | |
| "learning_rate": 1.556294201620734e-07, | |
| "loss": 0.3986, | |
| "mean_token_accuracy": 0.8628961715481271, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.9504139284799863, | |
| "grad_norm": 0.20332703044962025, | |
| "learning_rate": 1.4735114261677842e-07, | |
| "loss": 0.4159, | |
| "mean_token_accuracy": 0.8602014785454245, | |
| "step": 696 | |
| }, | |
| { | |
| "epoch": 0.9517794657335495, | |
| "grad_norm": 0.20262245169618026, | |
| "learning_rate": 1.3929746009971434e-07, | |
| "loss": 0.4205, | |
| "mean_token_accuracy": 0.8558728368817288, | |
| "step": 697 | |
| }, | |
| { | |
| "epoch": 0.9531450029871127, | |
| "grad_norm": 0.1986654702159638, | |
| "learning_rate": 1.3146855619776134e-07, | |
| "loss": 0.4164, | |
| "mean_token_accuracy": 0.8583016759076048, | |
| "step": 698 | |
| }, | |
| { | |
| "epoch": 0.9545105402406759, | |
| "grad_norm": 0.20538122761572822, | |
| "learning_rate": 1.2386460937387824e-07, | |
| "loss": 0.4122, | |
| "mean_token_accuracy": 0.8596236188677384, | |
| "step": 699 | |
| }, | |
| { | |
| "epoch": 0.9558760774942391, | |
| "grad_norm": 0.18771743571795288, | |
| "learning_rate": 1.1648579296304252e-07, | |
| "loss": 0.389, | |
| "mean_token_accuracy": 0.8655003903926352, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.9572416147478023, | |
| "grad_norm": 0.18918786074101446, | |
| "learning_rate": 1.0933227516829348e-07, | |
| "loss": 0.3804, | |
| "mean_token_accuracy": 0.8682751397466016, | |
| "step": 701 | |
| }, | |
| { | |
| "epoch": 0.9586071520013656, | |
| "grad_norm": 0.21947757366560472, | |
| "learning_rate": 1.0240421905689746e-07, | |
| "loss": 0.4128, | |
| "mean_token_accuracy": 0.858528673911104, | |
| "step": 702 | |
| }, | |
| { | |
| "epoch": 0.9599726892549287, | |
| "grad_norm": 0.197515434889879, | |
| "learning_rate": 9.570178255663532e-08, | |
| "loss": 0.3926, | |
| "mean_token_accuracy": 0.8648129525692759, | |
| "step": 703 | |
| }, | |
| { | |
| "epoch": 0.961338226508492, | |
| "grad_norm": 0.1879435454356452, | |
| "learning_rate": 8.922511845219972e-08, | |
| "loss": 0.4138, | |
| "mean_token_accuracy": 0.8581702095308102, | |
| "step": 704 | |
| }, | |
| { | |
| "epoch": 0.9627037637620551, | |
| "grad_norm": 0.18890815403985045, | |
| "learning_rate": 8.297437438170797e-08, | |
| "loss": 0.3907, | |
| "mean_token_accuracy": 0.865506216595041, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.9640693010156184, | |
| "grad_norm": 0.286341814862515, | |
| "learning_rate": 7.694969283334575e-08, | |
| "loss": 0.4113, | |
| "mean_token_accuracy": 0.8591874470356847, | |
| "step": 706 | |
| }, | |
| { | |
| "epoch": 0.9654348382691815, | |
| "grad_norm": 0.20273945460392515, | |
| "learning_rate": 7.115121114211198e-08, | |
| "loss": 0.3943, | |
| "mean_token_accuracy": 0.8657841466466322, | |
| "step": 707 | |
| }, | |
| { | |
| "epoch": 0.9668003755227448, | |
| "grad_norm": 0.20652336093906176, | |
| "learning_rate": 6.557906148669025e-08, | |
| "loss": 0.4253, | |
| "mean_token_accuracy": 0.8548559094292058, | |
| "step": 708 | |
| }, | |
| { | |
| "epoch": 0.9681659127763079, | |
| "grad_norm": 0.19629348230830565, | |
| "learning_rate": 6.023337088643665e-08, | |
| "loss": 0.3905, | |
| "mean_token_accuracy": 0.866214795452116, | |
| "step": 709 | |
| }, | |
| { | |
| "epoch": 0.9695314500298712, | |
| "grad_norm": 0.18612630221965354, | |
| "learning_rate": 5.51142611984834e-08, | |
| "loss": 0.4012, | |
| "mean_token_accuracy": 0.8630792198659271, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.9708969872834343, | |
| "grad_norm": 0.20471539063414793, | |
| "learning_rate": 5.022184911495864e-08, | |
| "loss": 0.4088, | |
| "mean_token_accuracy": 0.8615388696106141, | |
| "step": 711 | |
| }, | |
| { | |
| "epoch": 0.9722625245369976, | |
| "grad_norm": 0.20615922685602697, | |
| "learning_rate": 4.555624616033427e-08, | |
| "loss": 0.4423, | |
| "mean_token_accuracy": 0.8509749138520655, | |
| "step": 712 | |
| }, | |
| { | |
| "epoch": 0.9736280617905607, | |
| "grad_norm": 0.1905175337604626, | |
| "learning_rate": 4.111755868887346e-08, | |
| "loss": 0.3965, | |
| "mean_token_accuracy": 0.8641722387632514, | |
| "step": 713 | |
| }, | |
| { | |
| "epoch": 0.974993599044124, | |
| "grad_norm": 0.21850511626165098, | |
| "learning_rate": 3.690588788221372e-08, | |
| "loss": 0.4073, | |
| "mean_token_accuracy": 0.8611413177219913, | |
| "step": 714 | |
| }, | |
| { | |
| "epoch": 0.9763591362976871, | |
| "grad_norm": 0.19471127549408462, | |
| "learning_rate": 3.2921329747056527e-08, | |
| "loss": 0.4208, | |
| "mean_token_accuracy": 0.8565208481356911, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.9777246735512504, | |
| "grad_norm": 0.20634538207455483, | |
| "learning_rate": 2.916397511298019e-08, | |
| "loss": 0.4022, | |
| "mean_token_accuracy": 0.8626477917792553, | |
| "step": 716 | |
| }, | |
| { | |
| "epoch": 0.9790902108048135, | |
| "grad_norm": 0.17898675224982286, | |
| "learning_rate": 2.563390963037149e-08, | |
| "loss": 0.4084, | |
| "mean_token_accuracy": 0.858742060603683, | |
| "step": 717 | |
| }, | |
| { | |
| "epoch": 0.9804557480583768, | |
| "grad_norm": 0.20186274942480947, | |
| "learning_rate": 2.2331213768468363e-08, | |
| "loss": 0.4112, | |
| "mean_token_accuracy": 0.8604082683839368, | |
| "step": 718 | |
| }, | |
| { | |
| "epoch": 0.9818212853119399, | |
| "grad_norm": 0.20068940005862676, | |
| "learning_rate": 1.925596281353026e-08, | |
| "loss": 0.4061, | |
| "mean_token_accuracy": 0.8611462253446862, | |
| "step": 719 | |
| }, | |
| { | |
| "epoch": 0.9831868225655032, | |
| "grad_norm": 0.1949558079302809, | |
| "learning_rate": 1.6408226867118404e-08, | |
| "loss": 0.3991, | |
| "mean_token_accuracy": 0.8624983217087219, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.9845523598190663, | |
| "grad_norm": 0.20269545070379796, | |
| "learning_rate": 1.3788070844501511e-08, | |
| "loss": 0.3964, | |
| "mean_token_accuracy": 0.8662190034977201, | |
| "step": 721 | |
| }, | |
| { | |
| "epoch": 0.9859178970726296, | |
| "grad_norm": 0.19803360839942985, | |
| "learning_rate": 1.1395554473171421e-08, | |
| "loss": 0.4237, | |
| "mean_token_accuracy": 0.8558418698658579, | |
| "step": 722 | |
| }, | |
| { | |
| "epoch": 0.9872834343261927, | |
| "grad_norm": 0.20911643943570965, | |
| "learning_rate": 9.2307322914853e-09, | |
| "loss": 0.4014, | |
| "mean_token_accuracy": 0.8616119983012264, | |
| "step": 723 | |
| }, | |
| { | |
| "epoch": 0.988648971579756, | |
| "grad_norm": 0.20281334643579427, | |
| "learning_rate": 7.293653647421073e-09, | |
| "loss": 0.3948, | |
| "mean_token_accuracy": 0.8643835215948916, | |
| "step": 724 | |
| }, | |
| { | |
| "epoch": 0.9900145088333191, | |
| "grad_norm": 0.1901095357986784, | |
| "learning_rate": 5.584362697453882e-09, | |
| "loss": 0.4048, | |
| "mean_token_accuracy": 0.8623485947951701, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.9913800460868823, | |
| "grad_norm": 0.20379731214220034, | |
| "learning_rate": 4.1028984055457856e-09, | |
| "loss": 0.3983, | |
| "mean_token_accuracy": 0.8660865418772313, | |
| "step": 726 | |
| }, | |
| { | |
| "epoch": 0.9927455833404455, | |
| "grad_norm": 0.21708635868498447, | |
| "learning_rate": 2.8492945422620157e-09, | |
| "loss": 0.4147, | |
| "mean_token_accuracy": 0.8587614489423447, | |
| "step": 727 | |
| }, | |
| { | |
| "epoch": 0.9941111205940087, | |
| "grad_norm": 0.1929220391858935, | |
| "learning_rate": 1.8235796839982667e-09, | |
| "loss": 0.435, | |
| "mean_token_accuracy": 0.8512693561141161, | |
| "step": 728 | |
| }, | |
| { | |
| "epoch": 0.9954766578475719, | |
| "grad_norm": 0.20428589636781275, | |
| "learning_rate": 1.0257772123312137e-09, | |
| "loss": 0.3959, | |
| "mean_token_accuracy": 0.8652902515910991, | |
| "step": 729 | |
| }, | |
| { | |
| "epoch": 0.9968421951011351, | |
| "grad_norm": 0.1904705638210679, | |
| "learning_rate": 4.5590531348227443e-10, | |
| "loss": 0.4054, | |
| "mean_token_accuracy": 0.8607762718652394, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.9982077323546983, | |
| "grad_norm": 0.19964279194650797, | |
| "learning_rate": 1.1397697790793693e-10, | |
| "loss": 0.4149, | |
| "mean_token_accuracy": 0.8587917368074317, | |
| "step": 731 | |
| }, | |
| { | |
| "epoch": 0.9995732696082615, | |
| "grad_norm": 0.19717881385523706, | |
| "learning_rate": 0.0, | |
| "loss": 0.4046, | |
| "mean_token_accuracy": 0.8610176941050893, | |
| "step": 732 | |
| }, | |
| { | |
| "epoch": 0.9995732696082615, | |
| "step": 732, | |
| "total_flos": 2.7844372399870968e+19, | |
| "train_loss": 0.4556342206203221, | |
| "train_runtime": 169635.2867, | |
| "train_samples_per_second": 0.553, | |
| "train_steps_per_second": 0.004 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 732, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": false, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.7844372399870968e+19, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |