| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 645, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.004662004662004662, | |
| "grad_norm": 206.61297042982778, | |
| "learning_rate": 0.0, | |
| "loss": 6.0047, | |
| "mean_token_accuracy": 0.26166820526123047, | |
| "num_tokens": 9481.0, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.009324009324009324, | |
| "grad_norm": 237.90030179764935, | |
| "learning_rate": 1.5384615384615387e-07, | |
| "loss": 5.9896, | |
| "mean_token_accuracy": 0.26549550890922546, | |
| "num_tokens": 18486.0, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.013986013986013986, | |
| "grad_norm": 191.565778458644, | |
| "learning_rate": 3.0769230769230774e-07, | |
| "loss": 5.8697, | |
| "mean_token_accuracy": 0.26646704971790314, | |
| "num_tokens": 28054.0, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.018648018648018648, | |
| "grad_norm": 173.73423360043478, | |
| "learning_rate": 4.615384615384616e-07, | |
| "loss": 5.7664, | |
| "mean_token_accuracy": 0.28080685436725616, | |
| "num_tokens": 38100.0, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.023310023310023312, | |
| "grad_norm": 188.53311568741194, | |
| "learning_rate": 6.153846153846155e-07, | |
| "loss": 6.1116, | |
| "mean_token_accuracy": 0.24714654684066772, | |
| "num_tokens": 46674.0, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.027972027972027972, | |
| "grad_norm": 138.50157116139866, | |
| "learning_rate": 7.692307692307694e-07, | |
| "loss": 5.7375, | |
| "mean_token_accuracy": 0.2682064026594162, | |
| "num_tokens": 55892.0, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.03263403263403263, | |
| "grad_norm": 137.331644930486, | |
| "learning_rate": 9.230769230769232e-07, | |
| "loss": 5.7865, | |
| "mean_token_accuracy": 0.2567756175994873, | |
| "num_tokens": 64557.0, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.037296037296037296, | |
| "grad_norm": 113.93237748074404, | |
| "learning_rate": 1.076923076923077e-06, | |
| "loss": 5.4186, | |
| "mean_token_accuracy": 0.2943734973669052, | |
| "num_tokens": 74534.0, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.04195804195804196, | |
| "grad_norm": 97.42189347532377, | |
| "learning_rate": 1.230769230769231e-06, | |
| "loss": 5.1123, | |
| "mean_token_accuracy": 0.29780860245227814, | |
| "num_tokens": 84839.0, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.046620046620046623, | |
| "grad_norm": 115.13017773253503, | |
| "learning_rate": 1.3846153846153848e-06, | |
| "loss": 5.0274, | |
| "mean_token_accuracy": 0.30082090198993683, | |
| "num_tokens": 94486.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.05128205128205128, | |
| "grad_norm": 92.86208928347652, | |
| "learning_rate": 1.5384615384615387e-06, | |
| "loss": 4.8261, | |
| "mean_token_accuracy": 0.31195709109306335, | |
| "num_tokens": 103901.0, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.055944055944055944, | |
| "grad_norm": 74.98678911328179, | |
| "learning_rate": 1.6923076923076926e-06, | |
| "loss": 4.3226, | |
| "mean_token_accuracy": 0.332173153758049, | |
| "num_tokens": 113421.0, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.06060606060606061, | |
| "grad_norm": 65.42071488824216, | |
| "learning_rate": 1.8461538461538465e-06, | |
| "loss": 4.1213, | |
| "mean_token_accuracy": 0.34372538328170776, | |
| "num_tokens": 122888.0, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.06526806526806526, | |
| "grad_norm": 51.76685372566078, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 3.8684, | |
| "mean_token_accuracy": 0.36909155547618866, | |
| "num_tokens": 132046.0, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.06993006993006994, | |
| "grad_norm": 47.330915768289, | |
| "learning_rate": 2.153846153846154e-06, | |
| "loss": 3.5595, | |
| "mean_token_accuracy": 0.4020439237356186, | |
| "num_tokens": 141586.0, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.07459207459207459, | |
| "grad_norm": 39.592533375514385, | |
| "learning_rate": 2.307692307692308e-06, | |
| "loss": 3.0525, | |
| "mean_token_accuracy": 0.46130137145519257, | |
| "num_tokens": 151366.0, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.07925407925407925, | |
| "grad_norm": 38.681099075404326, | |
| "learning_rate": 2.461538461538462e-06, | |
| "loss": 2.8418, | |
| "mean_token_accuracy": 0.47952745854854584, | |
| "num_tokens": 160361.0, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.08391608391608392, | |
| "grad_norm": 26.957564440869795, | |
| "learning_rate": 2.615384615384616e-06, | |
| "loss": 2.4668, | |
| "mean_token_accuracy": 0.5341931283473969, | |
| "num_tokens": 170716.0, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.08857808857808858, | |
| "grad_norm": 29.725894005554363, | |
| "learning_rate": 2.7692307692307697e-06, | |
| "loss": 2.4476, | |
| "mean_token_accuracy": 0.530603438615799, | |
| "num_tokens": 179378.0, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.09324009324009325, | |
| "grad_norm": 22.45654494245414, | |
| "learning_rate": 2.9230769230769236e-06, | |
| "loss": 1.9608, | |
| "mean_token_accuracy": 0.6144967377185822, | |
| "num_tokens": 189155.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0979020979020979, | |
| "grad_norm": 22.420677559887096, | |
| "learning_rate": 3.0769230769230774e-06, | |
| "loss": 1.8408, | |
| "mean_token_accuracy": 0.6224100291728973, | |
| "num_tokens": 198128.0, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.10256410256410256, | |
| "grad_norm": 20.045412685294725, | |
| "learning_rate": 3.2307692307692313e-06, | |
| "loss": 1.5358, | |
| "mean_token_accuracy": 0.668096661567688, | |
| "num_tokens": 207526.0, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.10722610722610723, | |
| "grad_norm": 17.826429643351467, | |
| "learning_rate": 3.384615384615385e-06, | |
| "loss": 1.3385, | |
| "mean_token_accuracy": 0.7064461410045624, | |
| "num_tokens": 217437.0, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.11188811188811189, | |
| "grad_norm": 14.795248732817978, | |
| "learning_rate": 3.538461538461539e-06, | |
| "loss": 1.1874, | |
| "mean_token_accuracy": 0.728604257106781, | |
| "num_tokens": 226725.0, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.11655011655011654, | |
| "grad_norm": 16.367290617286876, | |
| "learning_rate": 3.692307692307693e-06, | |
| "loss": 0.9682, | |
| "mean_token_accuracy": 0.7707604169845581, | |
| "num_tokens": 235721.0, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.12121212121212122, | |
| "grad_norm": 9.588756303110177, | |
| "learning_rate": 3.846153846153847e-06, | |
| "loss": 0.8884, | |
| "mean_token_accuracy": 0.7929337918758392, | |
| "num_tokens": 244662.0, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.1258741258741259, | |
| "grad_norm": 7.015049125629419, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 0.7978, | |
| "mean_token_accuracy": 0.8093675374984741, | |
| "num_tokens": 254047.0, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.13053613053613053, | |
| "grad_norm": 5.597049269802009, | |
| "learning_rate": 4.1538461538461545e-06, | |
| "loss": 0.7933, | |
| "mean_token_accuracy": 0.8108646273612976, | |
| "num_tokens": 262770.0, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.1351981351981352, | |
| "grad_norm": 12.495340171422823, | |
| "learning_rate": 4.307692307692308e-06, | |
| "loss": 0.7206, | |
| "mean_token_accuracy": 0.8183692693710327, | |
| "num_tokens": 272936.0, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.13986013986013987, | |
| "grad_norm": 4.8146353297051325, | |
| "learning_rate": 4.461538461538462e-06, | |
| "loss": 0.7378, | |
| "mean_token_accuracy": 0.8188252151012421, | |
| "num_tokens": 282677.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.1445221445221445, | |
| "grad_norm": 4.397488614777519, | |
| "learning_rate": 4.615384615384616e-06, | |
| "loss": 0.7168, | |
| "mean_token_accuracy": 0.8155234456062317, | |
| "num_tokens": 291851.0, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.14918414918414918, | |
| "grad_norm": 3.8407813105595707, | |
| "learning_rate": 4.76923076923077e-06, | |
| "loss": 0.653, | |
| "mean_token_accuracy": 0.8314312100410461, | |
| "num_tokens": 300775.0, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.15384615384615385, | |
| "grad_norm": 3.9490135740465506, | |
| "learning_rate": 4.923076923076924e-06, | |
| "loss": 0.6982, | |
| "mean_token_accuracy": 0.8216440379619598, | |
| "num_tokens": 309003.0, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.1585081585081585, | |
| "grad_norm": 3.1111593206661423, | |
| "learning_rate": 5.076923076923077e-06, | |
| "loss": 0.6367, | |
| "mean_token_accuracy": 0.8357088267803192, | |
| "num_tokens": 318722.0, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.16317016317016317, | |
| "grad_norm": 3.8540200089616787, | |
| "learning_rate": 5.230769230769232e-06, | |
| "loss": 0.6725, | |
| "mean_token_accuracy": 0.8249399065971375, | |
| "num_tokens": 328363.0, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.16783216783216784, | |
| "grad_norm": 3.2279851487302533, | |
| "learning_rate": 5.384615384615385e-06, | |
| "loss": 0.6361, | |
| "mean_token_accuracy": 0.8352257907390594, | |
| "num_tokens": 337298.0, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.17249417249417248, | |
| "grad_norm": 3.2846323967634277, | |
| "learning_rate": 5.538461538461539e-06, | |
| "loss": 0.585, | |
| "mean_token_accuracy": 0.8430881202220917, | |
| "num_tokens": 346068.0, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.17715617715617715, | |
| "grad_norm": 3.201411224226897, | |
| "learning_rate": 5.692307692307692e-06, | |
| "loss": 0.6067, | |
| "mean_token_accuracy": 0.8457746207714081, | |
| "num_tokens": 354566.0, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.18181818181818182, | |
| "grad_norm": 3.603688713367759, | |
| "learning_rate": 5.846153846153847e-06, | |
| "loss": 0.5355, | |
| "mean_token_accuracy": 0.860386848449707, | |
| "num_tokens": 363662.0, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.1864801864801865, | |
| "grad_norm": 3.095744451514778, | |
| "learning_rate": 6e-06, | |
| "loss": 0.6101, | |
| "mean_token_accuracy": 0.8404548466205597, | |
| "num_tokens": 372778.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.19114219114219114, | |
| "grad_norm": 2.851157082704997, | |
| "learning_rate": 6.153846153846155e-06, | |
| "loss": 0.5961, | |
| "mean_token_accuracy": 0.8459599018096924, | |
| "num_tokens": 381990.0, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.1958041958041958, | |
| "grad_norm": 2.920481788097288, | |
| "learning_rate": 6.307692307692308e-06, | |
| "loss": 0.6033, | |
| "mean_token_accuracy": 0.8439716398715973, | |
| "num_tokens": 390764.0, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.20046620046620048, | |
| "grad_norm": 2.6590805207702877, | |
| "learning_rate": 6.461538461538463e-06, | |
| "loss": 0.5411, | |
| "mean_token_accuracy": 0.8539808988571167, | |
| "num_tokens": 399544.0, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.20512820512820512, | |
| "grad_norm": 2.869045415372655, | |
| "learning_rate": 6.615384615384616e-06, | |
| "loss": 0.5994, | |
| "mean_token_accuracy": 0.8429957032203674, | |
| "num_tokens": 408895.0, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.2097902097902098, | |
| "grad_norm": 2.5632574285139693, | |
| "learning_rate": 6.76923076923077e-06, | |
| "loss": 0.5474, | |
| "mean_token_accuracy": 0.8559859097003937, | |
| "num_tokens": 417528.0, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.21445221445221446, | |
| "grad_norm": 3.0716143926843213, | |
| "learning_rate": 6.923076923076923e-06, | |
| "loss": 0.5659, | |
| "mean_token_accuracy": 0.8473467230796814, | |
| "num_tokens": 426632.0, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.2191142191142191, | |
| "grad_norm": 2.816132536269355, | |
| "learning_rate": 7.076923076923078e-06, | |
| "loss": 0.6149, | |
| "mean_token_accuracy": 0.8401601612567902, | |
| "num_tokens": 435698.0, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.22377622377622378, | |
| "grad_norm": 2.4694488487146544, | |
| "learning_rate": 7.230769230769231e-06, | |
| "loss": 0.5604, | |
| "mean_token_accuracy": 0.8553016185760498, | |
| "num_tokens": 444865.0, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.22843822843822845, | |
| "grad_norm": 2.534416309991065, | |
| "learning_rate": 7.384615384615386e-06, | |
| "loss": 0.5465, | |
| "mean_token_accuracy": 0.8501911461353302, | |
| "num_tokens": 454293.0, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.2331002331002331, | |
| "grad_norm": 2.5987473992591994, | |
| "learning_rate": 7.538461538461539e-06, | |
| "loss": 0.5779, | |
| "mean_token_accuracy": 0.846989244222641, | |
| "num_tokens": 463669.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.23776223776223776, | |
| "grad_norm": 2.9304334416656035, | |
| "learning_rate": 7.692307692307694e-06, | |
| "loss": 0.5951, | |
| "mean_token_accuracy": 0.842944860458374, | |
| "num_tokens": 472913.0, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.24242424242424243, | |
| "grad_norm": 2.557007651858603, | |
| "learning_rate": 7.846153846153847e-06, | |
| "loss": 0.5522, | |
| "mean_token_accuracy": 0.8499407768249512, | |
| "num_tokens": 482966.0, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.24708624708624707, | |
| "grad_norm": 2.413067290036643, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 0.556, | |
| "mean_token_accuracy": 0.8527302443981171, | |
| "num_tokens": 491361.0, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.2517482517482518, | |
| "grad_norm": 2.3167133059321765, | |
| "learning_rate": 8.153846153846154e-06, | |
| "loss": 0.5474, | |
| "mean_token_accuracy": 0.8524684011936188, | |
| "num_tokens": 500582.0, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.2564102564102564, | |
| "grad_norm": 2.459059208267421, | |
| "learning_rate": 8.307692307692309e-06, | |
| "loss": 0.5474, | |
| "mean_token_accuracy": 0.8470076322555542, | |
| "num_tokens": 510444.0, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.26107226107226106, | |
| "grad_norm": 2.3850596138441955, | |
| "learning_rate": 8.461538461538462e-06, | |
| "loss": 0.5477, | |
| "mean_token_accuracy": 0.8558304607868195, | |
| "num_tokens": 519595.0, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.26573426573426573, | |
| "grad_norm": 2.425014600765099, | |
| "learning_rate": 8.615384615384617e-06, | |
| "loss": 0.5312, | |
| "mean_token_accuracy": 0.856484979391098, | |
| "num_tokens": 529256.0, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.2703962703962704, | |
| "grad_norm": 2.41544921845608, | |
| "learning_rate": 8.76923076923077e-06, | |
| "loss": 0.5581, | |
| "mean_token_accuracy": 0.8501502573490143, | |
| "num_tokens": 538883.0, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.27505827505827507, | |
| "grad_norm": 2.3601513682778537, | |
| "learning_rate": 8.923076923076925e-06, | |
| "loss": 0.5381, | |
| "mean_token_accuracy": 0.8557923436164856, | |
| "num_tokens": 547820.0, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.27972027972027974, | |
| "grad_norm": 2.5228638625713815, | |
| "learning_rate": 9.076923076923078e-06, | |
| "loss": 0.5478, | |
| "mean_token_accuracy": 0.8510215282440186, | |
| "num_tokens": 556572.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.28438228438228436, | |
| "grad_norm": 2.4788861740314925, | |
| "learning_rate": 9.230769230769232e-06, | |
| "loss": 0.5702, | |
| "mean_token_accuracy": 0.8453376293182373, | |
| "num_tokens": 565988.0, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.289044289044289, | |
| "grad_norm": 2.443670735215224, | |
| "learning_rate": 9.384615384615385e-06, | |
| "loss": 0.5701, | |
| "mean_token_accuracy": 0.8493904769420624, | |
| "num_tokens": 575388.0, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.2937062937062937, | |
| "grad_norm": 2.51188327450779, | |
| "learning_rate": 9.53846153846154e-06, | |
| "loss": 0.6161, | |
| "mean_token_accuracy": 0.8381120562553406, | |
| "num_tokens": 584991.0, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.29836829836829837, | |
| "grad_norm": 2.4568944814979763, | |
| "learning_rate": 9.692307692307693e-06, | |
| "loss": 0.5544, | |
| "mean_token_accuracy": 0.8492381870746613, | |
| "num_tokens": 594666.0, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.30303030303030304, | |
| "grad_norm": 2.3543409090851872, | |
| "learning_rate": 9.846153846153848e-06, | |
| "loss": 0.5627, | |
| "mean_token_accuracy": 0.8481525778770447, | |
| "num_tokens": 603192.0, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.3076923076923077, | |
| "grad_norm": 2.516015882056836, | |
| "learning_rate": 1e-05, | |
| "loss": 0.5518, | |
| "mean_token_accuracy": 0.8509586453437805, | |
| "num_tokens": 612501.0, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.3123543123543124, | |
| "grad_norm": 2.5702229906928626, | |
| "learning_rate": 9.999933987646821e-06, | |
| "loss": 0.5652, | |
| "mean_token_accuracy": 0.8515127599239349, | |
| "num_tokens": 621564.0, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.317016317016317, | |
| "grad_norm": 2.311235457272914, | |
| "learning_rate": 9.99973595252401e-06, | |
| "loss": 0.5007, | |
| "mean_token_accuracy": 0.8656518757343292, | |
| "num_tokens": 630726.0, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.32167832167832167, | |
| "grad_norm": 2.3626760252822736, | |
| "learning_rate": 9.999405900441683e-06, | |
| "loss": 0.5291, | |
| "mean_token_accuracy": 0.8573567271232605, | |
| "num_tokens": 639950.0, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.32634032634032634, | |
| "grad_norm": 2.5106829719874733, | |
| "learning_rate": 9.998943841083179e-06, | |
| "loss": 0.5333, | |
| "mean_token_accuracy": 0.8539510667324066, | |
| "num_tokens": 649402.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.331002331002331, | |
| "grad_norm": 2.374126097719514, | |
| "learning_rate": 9.99834978800478e-06, | |
| "loss": 0.5136, | |
| "mean_token_accuracy": 0.8584230840206146, | |
| "num_tokens": 658474.0, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.3356643356643357, | |
| "grad_norm": 2.2734777943143945, | |
| "learning_rate": 9.997623758635298e-06, | |
| "loss": 0.5469, | |
| "mean_token_accuracy": 0.8496910333633423, | |
| "num_tokens": 667908.0, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.34032634032634035, | |
| "grad_norm": 2.5255087017576043, | |
| "learning_rate": 9.996765774275587e-06, | |
| "loss": 0.527, | |
| "mean_token_accuracy": 0.8526964783668518, | |
| "num_tokens": 677532.0, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.34498834498834496, | |
| "grad_norm": 2.2143213966162216, | |
| "learning_rate": 9.995775860097897e-06, | |
| "loss": 0.5219, | |
| "mean_token_accuracy": 0.8588562309741974, | |
| "num_tokens": 686796.0, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.34965034965034963, | |
| "grad_norm": 2.1934169241509665, | |
| "learning_rate": 9.994654045145142e-06, | |
| "loss": 0.5401, | |
| "mean_token_accuracy": 0.8564260303974152, | |
| "num_tokens": 695776.0, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.3543123543123543, | |
| "grad_norm": 2.153560775154425, | |
| "learning_rate": 9.993400362330058e-06, | |
| "loss": 0.5446, | |
| "mean_token_accuracy": 0.8583995997905731, | |
| "num_tokens": 704530.0, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.358974358974359, | |
| "grad_norm": 2.128801081198065, | |
| "learning_rate": 9.992014848434221e-06, | |
| "loss": 0.5363, | |
| "mean_token_accuracy": 0.8531051576137543, | |
| "num_tokens": 714123.0, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.36363636363636365, | |
| "grad_norm": 2.312071712839466, | |
| "learning_rate": 9.990497544106981e-06, | |
| "loss": 0.5344, | |
| "mean_token_accuracy": 0.8557191491127014, | |
| "num_tokens": 723719.0, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.3682983682983683, | |
| "grad_norm": 2.4795067817061573, | |
| "learning_rate": 9.988848493864259e-06, | |
| "loss": 0.5597, | |
| "mean_token_accuracy": 0.8507181704044342, | |
| "num_tokens": 733447.0, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.372960372960373, | |
| "grad_norm": 2.2998442504007452, | |
| "learning_rate": 9.987067746087251e-06, | |
| "loss": 0.5467, | |
| "mean_token_accuracy": 0.8498886525630951, | |
| "num_tokens": 742567.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.3776223776223776, | |
| "grad_norm": 2.220780163026348, | |
| "learning_rate": 9.985155353021004e-06, | |
| "loss": 0.538, | |
| "mean_token_accuracy": 0.8577711582183838, | |
| "num_tokens": 751488.0, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.3822843822843823, | |
| "grad_norm": 2.23769737502521, | |
| "learning_rate": 9.983111370772877e-06, | |
| "loss": 0.5375, | |
| "mean_token_accuracy": 0.8561404347419739, | |
| "num_tokens": 760518.0, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.38694638694638694, | |
| "grad_norm": 2.3468431520556243, | |
| "learning_rate": 9.980935859310907e-06, | |
| "loss": 0.5071, | |
| "mean_token_accuracy": 0.8601345419883728, | |
| "num_tokens": 769855.0, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.3916083916083916, | |
| "grad_norm": 2.420088401775177, | |
| "learning_rate": 9.97862888246204e-06, | |
| "loss": 0.5517, | |
| "mean_token_accuracy": 0.8494808971881866, | |
| "num_tokens": 780309.0, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.3962703962703963, | |
| "grad_norm": 2.1832026417273864, | |
| "learning_rate": 9.976190507910265e-06, | |
| "loss": 0.5007, | |
| "mean_token_accuracy": 0.8591891229152679, | |
| "num_tokens": 790560.0, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.40093240093240096, | |
| "grad_norm": 2.217081826161623, | |
| "learning_rate": 9.97362080719462e-06, | |
| "loss": 0.4945, | |
| "mean_token_accuracy": 0.8628792762756348, | |
| "num_tokens": 799972.0, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.40559440559440557, | |
| "grad_norm": 2.1021603924715877, | |
| "learning_rate": 9.970919855707103e-06, | |
| "loss": 0.5149, | |
| "mean_token_accuracy": 0.8582651615142822, | |
| "num_tokens": 808998.0, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.41025641025641024, | |
| "grad_norm": 2.2137525851668163, | |
| "learning_rate": 9.968087732690452e-06, | |
| "loss": 0.5643, | |
| "mean_token_accuracy": 0.8490354120731354, | |
| "num_tokens": 818322.0, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.4149184149184149, | |
| "grad_norm": 2.1357303202881193, | |
| "learning_rate": 9.965124521235827e-06, | |
| "loss": 0.5465, | |
| "mean_token_accuracy": 0.8538801968097687, | |
| "num_tokens": 828468.0, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.4195804195804196, | |
| "grad_norm": 2.144386382606401, | |
| "learning_rate": 9.962030308280363e-06, | |
| "loss": 0.5598, | |
| "mean_token_accuracy": 0.8531892895698547, | |
| "num_tokens": 836909.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.42424242424242425, | |
| "grad_norm": 2.120216517402183, | |
| "learning_rate": 9.958805184604631e-06, | |
| "loss": 0.508, | |
| "mean_token_accuracy": 0.8650859892368317, | |
| "num_tokens": 845651.0, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.4289044289044289, | |
| "grad_norm": 2.3227262446712014, | |
| "learning_rate": 9.955449244829966e-06, | |
| "loss": 0.5533, | |
| "mean_token_accuracy": 0.850572019815445, | |
| "num_tokens": 854858.0, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.43356643356643354, | |
| "grad_norm": 2.0524529217213985, | |
| "learning_rate": 9.95196258741569e-06, | |
| "loss": 0.5097, | |
| "mean_token_accuracy": 0.8616639971733093, | |
| "num_tokens": 863666.0, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.4382284382284382, | |
| "grad_norm": 2.178830324772212, | |
| "learning_rate": 9.948345314656234e-06, | |
| "loss": 0.513, | |
| "mean_token_accuracy": 0.8623040318489075, | |
| "num_tokens": 873307.0, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.4428904428904429, | |
| "grad_norm": 2.178878661646046, | |
| "learning_rate": 9.94459753267812e-06, | |
| "loss": 0.5377, | |
| "mean_token_accuracy": 0.851812869310379, | |
| "num_tokens": 882337.0, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.44755244755244755, | |
| "grad_norm": 2.2151123798224366, | |
| "learning_rate": 9.94071935143687e-06, | |
| "loss": 0.527, | |
| "mean_token_accuracy": 0.8585948050022125, | |
| "num_tokens": 891528.0, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.4522144522144522, | |
| "grad_norm": 2.153917534671565, | |
| "learning_rate": 9.936710884713752e-06, | |
| "loss": 0.5136, | |
| "mean_token_accuracy": 0.8649525940418243, | |
| "num_tokens": 900782.0, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.4568764568764569, | |
| "grad_norm": 2.0747029950901057, | |
| "learning_rate": 9.932572250112469e-06, | |
| "loss": 0.5389, | |
| "mean_token_accuracy": 0.8555485904216766, | |
| "num_tokens": 909811.0, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.46153846153846156, | |
| "grad_norm": 1.983982143931433, | |
| "learning_rate": 9.92830356905569e-06, | |
| "loss": 0.4814, | |
| "mean_token_accuracy": 0.8693816661834717, | |
| "num_tokens": 919377.0, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.4662004662004662, | |
| "grad_norm": 2.1327543713500603, | |
| "learning_rate": 9.923904966781496e-06, | |
| "loss": 0.5515, | |
| "mean_token_accuracy": 0.8478606641292572, | |
| "num_tokens": 929132.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.47086247086247085, | |
| "grad_norm": 2.396154142849225, | |
| "learning_rate": 9.919376572339703e-06, | |
| "loss": 0.5521, | |
| "mean_token_accuracy": 0.8526241481304169, | |
| "num_tokens": 938942.0, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.4755244755244755, | |
| "grad_norm": 2.1649063337799603, | |
| "learning_rate": 9.914718518588076e-06, | |
| "loss": 0.5689, | |
| "mean_token_accuracy": 0.8482916951179504, | |
| "num_tokens": 948271.0, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.4801864801864802, | |
| "grad_norm": 2.0645529438999506, | |
| "learning_rate": 9.909930942188436e-06, | |
| "loss": 0.5219, | |
| "mean_token_accuracy": 0.8553557991981506, | |
| "num_tokens": 957852.0, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.48484848484848486, | |
| "grad_norm": 2.221579905396912, | |
| "learning_rate": 9.905013983602639e-06, | |
| "loss": 0.5092, | |
| "mean_token_accuracy": 0.8628838658332825, | |
| "num_tokens": 967455.0, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.48951048951048953, | |
| "grad_norm": 2.1265754888438533, | |
| "learning_rate": 9.899967787088468e-06, | |
| "loss": 0.5494, | |
| "mean_token_accuracy": 0.849203497171402, | |
| "num_tokens": 977978.0, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.49417249417249415, | |
| "grad_norm": 2.1206984788420145, | |
| "learning_rate": 9.89479250069539e-06, | |
| "loss": 0.513, | |
| "mean_token_accuracy": 0.8621029257774353, | |
| "num_tokens": 987423.0, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.4988344988344988, | |
| "grad_norm": 2.2958737279282917, | |
| "learning_rate": 9.889488276260222e-06, | |
| "loss": 0.5562, | |
| "mean_token_accuracy": 0.8555387854576111, | |
| "num_tokens": 996656.0, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.5034965034965035, | |
| "grad_norm": 2.2321368229532443, | |
| "learning_rate": 9.88405526940267e-06, | |
| "loss": 0.5579, | |
| "mean_token_accuracy": 0.854218989610672, | |
| "num_tokens": 1005952.0, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.5081585081585082, | |
| "grad_norm": 2.145064834641496, | |
| "learning_rate": 9.87849363952076e-06, | |
| "loss": 0.525, | |
| "mean_token_accuracy": 0.8584134578704834, | |
| "num_tokens": 1015020.0, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.5128205128205128, | |
| "grad_norm": 2.2441739750519316, | |
| "learning_rate": 9.872803549786177e-06, | |
| "loss": 0.542, | |
| "mean_token_accuracy": 0.8544652462005615, | |
| "num_tokens": 1023815.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.5174825174825175, | |
| "grad_norm": 2.259110803475594, | |
| "learning_rate": 9.866985167139453e-06, | |
| "loss": 0.5356, | |
| "mean_token_accuracy": 0.8533321619033813, | |
| "num_tokens": 1032375.0, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.5221445221445221, | |
| "grad_norm": 2.1512979801268095, | |
| "learning_rate": 9.861038662285093e-06, | |
| "loss": 0.5272, | |
| "mean_token_accuracy": 0.8559562265872955, | |
| "num_tokens": 1041287.0, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.5268065268065268, | |
| "grad_norm": 2.0895533173369616, | |
| "learning_rate": 9.854964209686555e-06, | |
| "loss": 0.5578, | |
| "mean_token_accuracy": 0.8457550704479218, | |
| "num_tokens": 1052277.0, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.5314685314685315, | |
| "grad_norm": 2.07804701176361, | |
| "learning_rate": 9.848761987561132e-06, | |
| "loss": 0.5587, | |
| "mean_token_accuracy": 0.8497881889343262, | |
| "num_tokens": 1061624.0, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.5361305361305362, | |
| "grad_norm": 2.029504980400925, | |
| "learning_rate": 9.842432177874725e-06, | |
| "loss": 0.5222, | |
| "mean_token_accuracy": 0.8595547080039978, | |
| "num_tokens": 1070577.0, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.5407925407925408, | |
| "grad_norm": 1.9605500161506988, | |
| "learning_rate": 9.835974966336504e-06, | |
| "loss": 0.49, | |
| "mean_token_accuracy": 0.8654608428478241, | |
| "num_tokens": 1080819.0, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.5454545454545454, | |
| "grad_norm": 2.2122252988560223, | |
| "learning_rate": 9.82939054239346e-06, | |
| "loss": 0.5679, | |
| "mean_token_accuracy": 0.8448387086391449, | |
| "num_tokens": 1089899.0, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.5501165501165501, | |
| "grad_norm": 2.072420046717628, | |
| "learning_rate": 9.822679099224844e-06, | |
| "loss": 0.5611, | |
| "mean_token_accuracy": 0.8533997237682343, | |
| "num_tokens": 1099422.0, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.5547785547785548, | |
| "grad_norm": 2.2414814409383754, | |
| "learning_rate": 9.815840833736508e-06, | |
| "loss": 0.5222, | |
| "mean_token_accuracy": 0.8619909286499023, | |
| "num_tokens": 1108059.0, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.5594405594405595, | |
| "grad_norm": 1.945252737941495, | |
| "learning_rate": 9.808875946555109e-06, | |
| "loss": 0.5034, | |
| "mean_token_accuracy": 0.8639355599880219, | |
| "num_tokens": 1117344.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.5641025641025641, | |
| "grad_norm": 1.8638080342388037, | |
| "learning_rate": 9.801784642022254e-06, | |
| "loss": 0.4565, | |
| "mean_token_accuracy": 0.8745285272598267, | |
| "num_tokens": 1126493.0, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.5687645687645687, | |
| "grad_norm": 2.1910077292347006, | |
| "learning_rate": 9.794567128188466e-06, | |
| "loss": 0.5313, | |
| "mean_token_accuracy": 0.8547643721103668, | |
| "num_tokens": 1135454.0, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.5734265734265734, | |
| "grad_norm": 2.320759590932711, | |
| "learning_rate": 9.787223616807118e-06, | |
| "loss": 0.5518, | |
| "mean_token_accuracy": 0.8485196530818939, | |
| "num_tokens": 1145120.0, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.578088578088578, | |
| "grad_norm": 2.205905846852017, | |
| "learning_rate": 9.779754323328192e-06, | |
| "loss": 0.5135, | |
| "mean_token_accuracy": 0.8605582118034363, | |
| "num_tokens": 1154569.0, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.5827505827505828, | |
| "grad_norm": 1.9377848593966973, | |
| "learning_rate": 9.772159466891971e-06, | |
| "loss": 0.4979, | |
| "mean_token_accuracy": 0.8665166199207306, | |
| "num_tokens": 1163964.0, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.5874125874125874, | |
| "grad_norm": 2.143716998667602, | |
| "learning_rate": 9.764439270322612e-06, | |
| "loss": 0.5332, | |
| "mean_token_accuracy": 0.8567988276481628, | |
| "num_tokens": 1172705.0, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.5920745920745921, | |
| "grad_norm": 1.9575201068560006, | |
| "learning_rate": 9.756593960121598e-06, | |
| "loss": 0.4744, | |
| "mean_token_accuracy": 0.8642941415309906, | |
| "num_tokens": 1182428.0, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.5967365967365967, | |
| "grad_norm": 2.057073316572943, | |
| "learning_rate": 9.748623766461101e-06, | |
| "loss": 0.5236, | |
| "mean_token_accuracy": 0.8576280772686005, | |
| "num_tokens": 1191692.0, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.6013986013986014, | |
| "grad_norm": 2.315109150829485, | |
| "learning_rate": 9.740528923177227e-06, | |
| "loss": 0.5901, | |
| "mean_token_accuracy": 0.8435404002666473, | |
| "num_tokens": 1200804.0, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.6060606060606061, | |
| "grad_norm": 2.160355676496633, | |
| "learning_rate": 9.732309667763158e-06, | |
| "loss": 0.5131, | |
| "mean_token_accuracy": 0.8617720901966095, | |
| "num_tokens": 1209500.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.6107226107226107, | |
| "grad_norm": 2.1245956205330776, | |
| "learning_rate": 9.723966241362178e-06, | |
| "loss": 0.5508, | |
| "mean_token_accuracy": 0.8526682257652283, | |
| "num_tokens": 1218360.0, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.6153846153846154, | |
| "grad_norm": 1.9474554252940732, | |
| "learning_rate": 9.7154988887606e-06, | |
| "loss": 0.5172, | |
| "mean_token_accuracy": 0.8601753413677216, | |
| "num_tokens": 1227995.0, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.62004662004662, | |
| "grad_norm": 2.0598106310347886, | |
| "learning_rate": 9.706907858380593e-06, | |
| "loss": 0.5292, | |
| "mean_token_accuracy": 0.853904515504837, | |
| "num_tokens": 1237369.0, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.6247086247086248, | |
| "grad_norm": 2.167359943385511, | |
| "learning_rate": 9.69819340227288e-06, | |
| "loss": 0.5337, | |
| "mean_token_accuracy": 0.8580853343009949, | |
| "num_tokens": 1246811.0, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.6293706293706294, | |
| "grad_norm": 2.0419581939057703, | |
| "learning_rate": 9.68935577610935e-06, | |
| "loss": 0.5584, | |
| "mean_token_accuracy": 0.8511963188648224, | |
| "num_tokens": 1257728.0, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.634032634032634, | |
| "grad_norm": 2.179167067571999, | |
| "learning_rate": 9.680395239175563e-06, | |
| "loss": 0.547, | |
| "mean_token_accuracy": 0.8516505658626556, | |
| "num_tokens": 1267082.0, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.6386946386946387, | |
| "grad_norm": 2.1523271385107843, | |
| "learning_rate": 9.671312054363126e-06, | |
| "loss": 0.5097, | |
| "mean_token_accuracy": 0.8639609515666962, | |
| "num_tokens": 1276457.0, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.6433566433566433, | |
| "grad_norm": 2.2140190778794717, | |
| "learning_rate": 9.662106488162001e-06, | |
| "loss": 0.5353, | |
| "mean_token_accuracy": 0.8548583686351776, | |
| "num_tokens": 1285816.0, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.6480186480186481, | |
| "grad_norm": 1.9892790673234197, | |
| "learning_rate": 9.652778810652669e-06, | |
| "loss": 0.5141, | |
| "mean_token_accuracy": 0.85847008228302, | |
| "num_tokens": 1294833.0, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.6526806526806527, | |
| "grad_norm": 2.138516925288519, | |
| "learning_rate": 9.643329295498215e-06, | |
| "loss": 0.5171, | |
| "mean_token_accuracy": 0.8615297675132751, | |
| "num_tokens": 1304626.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.6573426573426573, | |
| "grad_norm": 2.2563369591379376, | |
| "learning_rate": 9.633758219936299e-06, | |
| "loss": 0.5306, | |
| "mean_token_accuracy": 0.8586675524711609, | |
| "num_tokens": 1314352.0, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.662004662004662, | |
| "grad_norm": 2.1291835975325446, | |
| "learning_rate": 9.624065864771017e-06, | |
| "loss": 0.5387, | |
| "mean_token_accuracy": 0.854125052690506, | |
| "num_tokens": 1323815.0, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 1.9289636352361732, | |
| "learning_rate": 9.614252514364671e-06, | |
| "loss": 0.5302, | |
| "mean_token_accuracy": 0.8544224202632904, | |
| "num_tokens": 1333085.0, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.6713286713286714, | |
| "grad_norm": 2.142108471781227, | |
| "learning_rate": 9.604318456629415e-06, | |
| "loss": 0.5841, | |
| "mean_token_accuracy": 0.8435969054698944, | |
| "num_tokens": 1342490.0, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.675990675990676, | |
| "grad_norm": 2.256848010848965, | |
| "learning_rate": 9.594263983018818e-06, | |
| "loss": 0.5644, | |
| "mean_token_accuracy": 0.8460557162761688, | |
| "num_tokens": 1351579.0, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.6806526806526807, | |
| "grad_norm": 2.0607158153594343, | |
| "learning_rate": 9.584089388519307e-06, | |
| "loss": 0.525, | |
| "mean_token_accuracy": 0.8550526797771454, | |
| "num_tokens": 1363017.0, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.6853146853146853, | |
| "grad_norm": 2.0404326207258223, | |
| "learning_rate": 9.573794971641518e-06, | |
| "loss": 0.5067, | |
| "mean_token_accuracy": 0.8611325323581696, | |
| "num_tokens": 1372276.0, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.6899766899766899, | |
| "grad_norm": 2.017662742079808, | |
| "learning_rate": 9.563381034411529e-06, | |
| "loss": 0.5173, | |
| "mean_token_accuracy": 0.8577908575534821, | |
| "num_tokens": 1381474.0, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.6946386946386947, | |
| "grad_norm": 2.2063297261619668, | |
| "learning_rate": 9.55284788236201e-06, | |
| "loss": 0.5575, | |
| "mean_token_accuracy": 0.8511577248573303, | |
| "num_tokens": 1390844.0, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.6993006993006993, | |
| "grad_norm": 2.2322560935923295, | |
| "learning_rate": 9.542195824523251e-06, | |
| "loss": 0.4993, | |
| "mean_token_accuracy": 0.8618028461933136, | |
| "num_tokens": 1399649.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.703962703962704, | |
| "grad_norm": 2.1940354248783667, | |
| "learning_rate": 9.531425173414095e-06, | |
| "loss": 0.5766, | |
| "mean_token_accuracy": 0.8469588458538055, | |
| "num_tokens": 1409096.0, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.7086247086247086, | |
| "grad_norm": 2.276091425733425, | |
| "learning_rate": 9.520536245032783e-06, | |
| "loss": 0.6052, | |
| "mean_token_accuracy": 0.8366052806377411, | |
| "num_tokens": 1417944.0, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.7132867132867133, | |
| "grad_norm": 2.199611556073951, | |
| "learning_rate": 9.509529358847655e-06, | |
| "loss": 0.5454, | |
| "mean_token_accuracy": 0.853013813495636, | |
| "num_tokens": 1427079.0, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.717948717948718, | |
| "grad_norm": 2.080422260259384, | |
| "learning_rate": 9.498404837787811e-06, | |
| "loss": 0.5353, | |
| "mean_token_accuracy": 0.8578924536705017, | |
| "num_tokens": 1436521.0, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.7226107226107226, | |
| "grad_norm": 1.9852264975254732, | |
| "learning_rate": 9.48716300823361e-06, | |
| "loss": 0.5259, | |
| "mean_token_accuracy": 0.862900048494339, | |
| "num_tokens": 1445553.0, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.7272727272727273, | |
| "grad_norm": 2.0020640140351738, | |
| "learning_rate": 9.475804200007104e-06, | |
| "loss": 0.5327, | |
| "mean_token_accuracy": 0.8615141212940216, | |
| "num_tokens": 1453774.0, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.7319347319347319, | |
| "grad_norm": 2.1863487215175392, | |
| "learning_rate": 9.464328746362367e-06, | |
| "loss": 0.5588, | |
| "mean_token_accuracy": 0.8489395678043365, | |
| "num_tokens": 1462611.0, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.7365967365967366, | |
| "grad_norm": 2.4020487040671084, | |
| "learning_rate": 9.452736983975708e-06, | |
| "loss": 0.4923, | |
| "mean_token_accuracy": 0.8653963208198547, | |
| "num_tokens": 1471736.0, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.7412587412587412, | |
| "grad_norm": 2.1281438451825987, | |
| "learning_rate": 9.441029252935804e-06, | |
| "loss": 0.5397, | |
| "mean_token_accuracy": 0.8503198325634003, | |
| "num_tokens": 1482117.0, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.745920745920746, | |
| "grad_norm": 2.1763048717368756, | |
| "learning_rate": 9.429205896733705e-06, | |
| "loss": 0.5566, | |
| "mean_token_accuracy": 0.8503492772579193, | |
| "num_tokens": 1491292.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.7505827505827506, | |
| "grad_norm": 1.9354492116167334, | |
| "learning_rate": 9.417267262252775e-06, | |
| "loss": 0.503, | |
| "mean_token_accuracy": 0.8627839088439941, | |
| "num_tokens": 1500735.0, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.7552447552447552, | |
| "grad_norm": 2.0708959320246887, | |
| "learning_rate": 9.405213699758507e-06, | |
| "loss": 0.5067, | |
| "mean_token_accuracy": 0.8632858693599701, | |
| "num_tokens": 1510190.0, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.7599067599067599, | |
| "grad_norm": 2.11618902292271, | |
| "learning_rate": 9.393045562888245e-06, | |
| "loss": 0.5424, | |
| "mean_token_accuracy": 0.8534725308418274, | |
| "num_tokens": 1519207.0, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.7645687645687645, | |
| "grad_norm": 2.027410237379273, | |
| "learning_rate": 9.380763208640809e-06, | |
| "loss": 0.5197, | |
| "mean_token_accuracy": 0.8590981066226959, | |
| "num_tokens": 1528808.0, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.7692307692307693, | |
| "grad_norm": 2.008086929554116, | |
| "learning_rate": 9.368366997366027e-06, | |
| "loss": 0.5071, | |
| "mean_token_accuracy": 0.8610014617443085, | |
| "num_tokens": 1537765.0, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.7738927738927739, | |
| "grad_norm": 2.1365369175139763, | |
| "learning_rate": 9.355857292754152e-06, | |
| "loss": 0.5299, | |
| "mean_token_accuracy": 0.8569334447383881, | |
| "num_tokens": 1547822.0, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.7785547785547785, | |
| "grad_norm": 1.8545592775377633, | |
| "learning_rate": 9.343234461825204e-06, | |
| "loss": 0.4791, | |
| "mean_token_accuracy": 0.869120866060257, | |
| "num_tokens": 1557620.0, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.7832167832167832, | |
| "grad_norm": 2.028039178942425, | |
| "learning_rate": 9.330498874918191e-06, | |
| "loss": 0.5106, | |
| "mean_token_accuracy": 0.8565393388271332, | |
| "num_tokens": 1567111.0, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.7878787878787878, | |
| "grad_norm": 2.0701117186586817, | |
| "learning_rate": 9.317650905680254e-06, | |
| "loss": 0.4913, | |
| "mean_token_accuracy": 0.8696520030498505, | |
| "num_tokens": 1575836.0, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.7925407925407926, | |
| "grad_norm": 2.1194332133731235, | |
| "learning_rate": 9.304690931055694e-06, | |
| "loss": 0.5311, | |
| "mean_token_accuracy": 0.8621737062931061, | |
| "num_tokens": 1584822.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.7972027972027972, | |
| "grad_norm": 2.1282940280365747, | |
| "learning_rate": 9.29161933127492e-06, | |
| "loss": 0.5601, | |
| "mean_token_accuracy": 0.8475571274757385, | |
| "num_tokens": 1594428.0, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.8018648018648019, | |
| "grad_norm": 2.0641332133798937, | |
| "learning_rate": 9.278436489843298e-06, | |
| "loss": 0.5373, | |
| "mean_token_accuracy": 0.8547400534152985, | |
| "num_tokens": 1603786.0, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.8065268065268065, | |
| "grad_norm": 2.0160796623607804, | |
| "learning_rate": 9.265142793529883e-06, | |
| "loss": 0.4971, | |
| "mean_token_accuracy": 0.8660332560539246, | |
| "num_tokens": 1612747.0, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.8111888111888111, | |
| "grad_norm": 1.9129699635418105, | |
| "learning_rate": 9.251738632356086e-06, | |
| "loss": 0.4926, | |
| "mean_token_accuracy": 0.869436115026474, | |
| "num_tokens": 1621819.0, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.8158508158508159, | |
| "grad_norm": 2.3884591059212537, | |
| "learning_rate": 9.238224399584232e-06, | |
| "loss": 0.5476, | |
| "mean_token_accuracy": 0.8577151894569397, | |
| "num_tokens": 1631685.0, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.8205128205128205, | |
| "grad_norm": 1.8831561540253134, | |
| "learning_rate": 9.224600491706009e-06, | |
| "loss": 0.4786, | |
| "mean_token_accuracy": 0.8686897456645966, | |
| "num_tokens": 1640794.0, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.8251748251748252, | |
| "grad_norm": 2.0432546539992247, | |
| "learning_rate": 9.210867308430847e-06, | |
| "loss": 0.5173, | |
| "mean_token_accuracy": 0.8568342328071594, | |
| "num_tokens": 1650186.0, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.8298368298368298, | |
| "grad_norm": 2.0098051546686357, | |
| "learning_rate": 9.197025252674192e-06, | |
| "loss": 0.5181, | |
| "mean_token_accuracy": 0.8594348132610321, | |
| "num_tokens": 1659577.0, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.8344988344988346, | |
| "grad_norm": 2.0448629279973214, | |
| "learning_rate": 9.183074730545674e-06, | |
| "loss": 0.521, | |
| "mean_token_accuracy": 0.8540050983428955, | |
| "num_tokens": 1669821.0, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.8391608391608392, | |
| "grad_norm": 2.139730175214238, | |
| "learning_rate": 9.169016151337202e-06, | |
| "loss": 0.5651, | |
| "mean_token_accuracy": 0.8448547720909119, | |
| "num_tokens": 1679579.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.8438228438228438, | |
| "grad_norm": 2.0464895767510742, | |
| "learning_rate": 9.15484992751095e-06, | |
| "loss": 0.5221, | |
| "mean_token_accuracy": 0.8626176416873932, | |
| "num_tokens": 1688922.0, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.8484848484848485, | |
| "grad_norm": 1.9954837428115382, | |
| "learning_rate": 9.140576474687263e-06, | |
| "loss": 0.5279, | |
| "mean_token_accuracy": 0.8613512217998505, | |
| "num_tokens": 1697949.0, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.8531468531468531, | |
| "grad_norm": 1.9567685319451251, | |
| "learning_rate": 9.126196211632456e-06, | |
| "loss": 0.5274, | |
| "mean_token_accuracy": 0.8570147156715393, | |
| "num_tokens": 1708365.0, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.8578088578088578, | |
| "grad_norm": 1.9210555989780966, | |
| "learning_rate": 9.11170956024653e-06, | |
| "loss": 0.5269, | |
| "mean_token_accuracy": 0.8548833727836609, | |
| "num_tokens": 1717340.0, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.8624708624708625, | |
| "grad_norm": 1.9318021919862647, | |
| "learning_rate": 9.097116945550794e-06, | |
| "loss": 0.5359, | |
| "mean_token_accuracy": 0.8578689694404602, | |
| "num_tokens": 1727305.0, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.8671328671328671, | |
| "grad_norm": 2.026417344807131, | |
| "learning_rate": 9.082418795675397e-06, | |
| "loss": 0.5617, | |
| "mean_token_accuracy": 0.8492590188980103, | |
| "num_tokens": 1737093.0, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.8717948717948718, | |
| "grad_norm": 1.8991450738896973, | |
| "learning_rate": 9.067615541846768e-06, | |
| "loss": 0.5235, | |
| "mean_token_accuracy": 0.859586089849472, | |
| "num_tokens": 1746772.0, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.8764568764568764, | |
| "grad_norm": 1.9522407091267568, | |
| "learning_rate": 9.052707618374958e-06, | |
| "loss": 0.4765, | |
| "mean_token_accuracy": 0.8663023710250854, | |
| "num_tokens": 1756307.0, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.8811188811188811, | |
| "grad_norm": 2.048727988149695, | |
| "learning_rate": 9.037695462640908e-06, | |
| "loss": 0.5331, | |
| "mean_token_accuracy": 0.8549124300479889, | |
| "num_tokens": 1766010.0, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.8857808857808858, | |
| "grad_norm": 1.9934806378380259, | |
| "learning_rate": 9.022579515083601e-06, | |
| "loss": 0.5421, | |
| "mean_token_accuracy": 0.8553925156593323, | |
| "num_tokens": 1774885.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.8904428904428905, | |
| "grad_norm": 2.0689376168576286, | |
| "learning_rate": 9.007360219187163e-06, | |
| "loss": 0.514, | |
| "mean_token_accuracy": 0.8535875976085663, | |
| "num_tokens": 1785248.0, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.8951048951048951, | |
| "grad_norm": 2.009916861001166, | |
| "learning_rate": 8.99203802146783e-06, | |
| "loss": 0.497, | |
| "mean_token_accuracy": 0.8642706871032715, | |
| "num_tokens": 1794525.0, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.8997668997668997, | |
| "grad_norm": 2.0465375766970277, | |
| "learning_rate": 8.976613371460856e-06, | |
| "loss": 0.5271, | |
| "mean_token_accuracy": 0.8549227714538574, | |
| "num_tokens": 1804413.0, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.9044289044289044, | |
| "grad_norm": 1.917310132189426, | |
| "learning_rate": 8.961086721707331e-06, | |
| "loss": 0.4938, | |
| "mean_token_accuracy": 0.8640461564064026, | |
| "num_tokens": 1813695.0, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.9090909090909091, | |
| "grad_norm": 1.915744851468885, | |
| "learning_rate": 8.945458527740892e-06, | |
| "loss": 0.4707, | |
| "mean_token_accuracy": 0.8734670579433441, | |
| "num_tokens": 1823851.0, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.9137529137529138, | |
| "grad_norm": 1.9274655587588858, | |
| "learning_rate": 8.929729248074364e-06, | |
| "loss": 0.5021, | |
| "mean_token_accuracy": 0.8575229346752167, | |
| "num_tokens": 1833828.0, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.9184149184149184, | |
| "grad_norm": 2.026284467899399, | |
| "learning_rate": 8.913899344186312e-06, | |
| "loss": 0.5287, | |
| "mean_token_accuracy": 0.8602744936943054, | |
| "num_tokens": 1842998.0, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.9230769230769231, | |
| "grad_norm": 2.05400953187219, | |
| "learning_rate": 8.897969280507494e-06, | |
| "loss": 0.5324, | |
| "mean_token_accuracy": 0.862190306186676, | |
| "num_tokens": 1851919.0, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.9277389277389277, | |
| "grad_norm": 2.0766187536409078, | |
| "learning_rate": 8.881939524407238e-06, | |
| "loss": 0.5412, | |
| "mean_token_accuracy": 0.8545754849910736, | |
| "num_tokens": 1860906.0, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.9324009324009324, | |
| "grad_norm": 1.989815369304169, | |
| "learning_rate": 8.86581054617973e-06, | |
| "loss": 0.5453, | |
| "mean_token_accuracy": 0.8528104722499847, | |
| "num_tokens": 1870470.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.9370629370629371, | |
| "grad_norm": 2.088847720398196, | |
| "learning_rate": 8.849582819030217e-06, | |
| "loss": 0.5432, | |
| "mean_token_accuracy": 0.8563026189804077, | |
| "num_tokens": 1879984.0, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.9417249417249417, | |
| "grad_norm": 2.078290662100905, | |
| "learning_rate": 8.833256819061126e-06, | |
| "loss": 0.4983, | |
| "mean_token_accuracy": 0.8666514456272125, | |
| "num_tokens": 1889348.0, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.9463869463869464, | |
| "grad_norm": 1.9223822960306767, | |
| "learning_rate": 8.81683302525809e-06, | |
| "loss": 0.5411, | |
| "mean_token_accuracy": 0.8597702980041504, | |
| "num_tokens": 1898543.0, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.951048951048951, | |
| "grad_norm": 1.8855235649147306, | |
| "learning_rate": 8.800311919475902e-06, | |
| "loss": 0.5014, | |
| "mean_token_accuracy": 0.8621995449066162, | |
| "num_tokens": 1908052.0, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.9557109557109557, | |
| "grad_norm": 1.9276050868545773, | |
| "learning_rate": 8.783693986424365e-06, | |
| "loss": 0.504, | |
| "mean_token_accuracy": 0.8629100322723389, | |
| "num_tokens": 1917592.0, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.9603729603729604, | |
| "grad_norm": 2.143001225129588, | |
| "learning_rate": 8.76697971365409e-06, | |
| "loss": 0.5387, | |
| "mean_token_accuracy": 0.8589153587818146, | |
| "num_tokens": 1927404.0, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.965034965034965, | |
| "grad_norm": 2.1777112033624557, | |
| "learning_rate": 8.750169591542177e-06, | |
| "loss": 0.5252, | |
| "mean_token_accuracy": 0.8522741794586182, | |
| "num_tokens": 1936273.0, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.9696969696969697, | |
| "grad_norm": 1.91067041727946, | |
| "learning_rate": 8.733264113277832e-06, | |
| "loss": 0.4991, | |
| "mean_token_accuracy": 0.8641117215156555, | |
| "num_tokens": 1944830.0, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.9743589743589743, | |
| "grad_norm": 2.004004655149868, | |
| "learning_rate": 8.716263774847902e-06, | |
| "loss": 0.5276, | |
| "mean_token_accuracy": 0.8581711649894714, | |
| "num_tokens": 1954306.0, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.9790209790209791, | |
| "grad_norm": 1.968917380031634, | |
| "learning_rate": 8.69916907502232e-06, | |
| "loss": 0.5138, | |
| "mean_token_accuracy": 0.8624255359172821, | |
| "num_tokens": 1964061.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.9836829836829837, | |
| "grad_norm": 1.9939460274690515, | |
| "learning_rate": 8.681980515339464e-06, | |
| "loss": 0.4816, | |
| "mean_token_accuracy": 0.8704831600189209, | |
| "num_tokens": 1973286.0, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.9883449883449883, | |
| "grad_norm": 2.057525517897735, | |
| "learning_rate": 8.664698600091462e-06, | |
| "loss": 0.5221, | |
| "mean_token_accuracy": 0.8580233752727509, | |
| "num_tokens": 1983373.0, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.993006993006993, | |
| "grad_norm": 1.79381112298063, | |
| "learning_rate": 8.64732383630937e-06, | |
| "loss": 0.4376, | |
| "mean_token_accuracy": 0.8793376386165619, | |
| "num_tokens": 1992207.0, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.9976689976689976, | |
| "grad_norm": 1.933384153410609, | |
| "learning_rate": 8.629856733748325e-06, | |
| "loss": 0.505, | |
| "mean_token_accuracy": 0.85846146941185, | |
| "num_tokens": 2001400.0, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.933384153410609, | |
| "learning_rate": 8.612297804872562e-06, | |
| "loss": 0.4925, | |
| "mean_token_accuracy": 0.8816215991973877, | |
| "num_tokens": 2003256.0, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 1.0046620046620047, | |
| "grad_norm": 2.819257871473016, | |
| "learning_rate": 8.594647564840407e-06, | |
| "loss": 0.3933, | |
| "mean_token_accuracy": 0.8927328288555145, | |
| "num_tokens": 2012460.0, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 1.0093240093240092, | |
| "grad_norm": 1.7635713882513158, | |
| "learning_rate": 8.57690653148913e-06, | |
| "loss": 0.392, | |
| "mean_token_accuracy": 0.887998104095459, | |
| "num_tokens": 2022109.0, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 1.013986013986014, | |
| "grad_norm": 1.7892473419907957, | |
| "learning_rate": 8.559075225319786e-06, | |
| "loss": 0.3855, | |
| "mean_token_accuracy": 0.8922514021396637, | |
| "num_tokens": 2030710.0, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 1.0186480186480187, | |
| "grad_norm": 1.6823650842914313, | |
| "learning_rate": 8.54115416948192e-06, | |
| "loss": 0.356, | |
| "mean_token_accuracy": 0.9053717851638794, | |
| "num_tokens": 2039777.0, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 1.0233100233100234, | |
| "grad_norm": 1.939934426049098, | |
| "learning_rate": 8.523143889758228e-06, | |
| "loss": 0.3694, | |
| "mean_token_accuracy": 0.8991726338863373, | |
| "num_tokens": 2049184.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.027972027972028, | |
| "grad_norm": 1.752148060977119, | |
| "learning_rate": 8.505044914549131e-06, | |
| "loss": 0.39, | |
| "mean_token_accuracy": 0.8928222954273224, | |
| "num_tokens": 2058845.0, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 1.0326340326340326, | |
| "grad_norm": 2.1158285136058033, | |
| "learning_rate": 8.48685777485727e-06, | |
| "loss": 0.3596, | |
| "mean_token_accuracy": 0.8957322537899017, | |
| "num_tokens": 2067868.0, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 1.0372960372960374, | |
| "grad_norm": 2.1547545690668026, | |
| "learning_rate": 8.46858300427193e-06, | |
| "loss": 0.3831, | |
| "mean_token_accuracy": 0.8963182866573334, | |
| "num_tokens": 2077258.0, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 1.0419580419580419, | |
| "grad_norm": 1.9477915170139626, | |
| "learning_rate": 8.450221138953383e-06, | |
| "loss": 0.3642, | |
| "mean_token_accuracy": 0.8969131708145142, | |
| "num_tokens": 2086219.0, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 1.0466200466200466, | |
| "grad_norm": 1.9674246406742486, | |
| "learning_rate": 8.431772717617154e-06, | |
| "loss": 0.348, | |
| "mean_token_accuracy": 0.9007111489772797, | |
| "num_tokens": 2095430.0, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 1.0512820512820513, | |
| "grad_norm": 2.0055164195358235, | |
| "learning_rate": 8.413238281518225e-06, | |
| "loss": 0.3575, | |
| "mean_token_accuracy": 0.8980507254600525, | |
| "num_tokens": 2104162.0, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 1.055944055944056, | |
| "grad_norm": 1.989341084975444, | |
| "learning_rate": 8.394618374435148e-06, | |
| "loss": 0.3634, | |
| "mean_token_accuracy": 0.8973170518875122, | |
| "num_tokens": 2114619.0, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 1.0606060606060606, | |
| "grad_norm": 2.1674333688477074, | |
| "learning_rate": 8.375913542654093e-06, | |
| "loss": 0.3665, | |
| "mean_token_accuracy": 0.8995259404182434, | |
| "num_tokens": 2124605.0, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 1.0652680652680653, | |
| "grad_norm": 2.0014110756016703, | |
| "learning_rate": 8.357124334952818e-06, | |
| "loss": 0.3884, | |
| "mean_token_accuracy": 0.889716625213623, | |
| "num_tokens": 2133888.0, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 1.06993006993007, | |
| "grad_norm": 1.9633860039478925, | |
| "learning_rate": 8.33825130258458e-06, | |
| "loss": 0.3578, | |
| "mean_token_accuracy": 0.8981423377990723, | |
| "num_tokens": 2142444.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.0745920745920745, | |
| "grad_norm": 2.013885901701478, | |
| "learning_rate": 8.319294999261941e-06, | |
| "loss": 0.335, | |
| "mean_token_accuracy": 0.9031325578689575, | |
| "num_tokens": 2151447.0, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 1.0792540792540792, | |
| "grad_norm": 1.8717843659595645, | |
| "learning_rate": 8.300255981140544e-06, | |
| "loss": 0.3334, | |
| "mean_token_accuracy": 0.905355840921402, | |
| "num_tokens": 2161278.0, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 1.083916083916084, | |
| "grad_norm": 1.8282559453848013, | |
| "learning_rate": 8.281134806802783e-06, | |
| "loss": 0.3789, | |
| "mean_token_accuracy": 0.8943277895450592, | |
| "num_tokens": 2170704.0, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 1.0885780885780885, | |
| "grad_norm": 2.0761431582961705, | |
| "learning_rate": 8.261932037241418e-06, | |
| "loss": 0.3726, | |
| "mean_token_accuracy": 0.8970981240272522, | |
| "num_tokens": 2180907.0, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 1.0932400932400932, | |
| "grad_norm": 1.8858855419887754, | |
| "learning_rate": 8.242648235843123e-06, | |
| "loss": 0.3135, | |
| "mean_token_accuracy": 0.9130119383335114, | |
| "num_tokens": 2190790.0, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 1.097902097902098, | |
| "grad_norm": 1.8790292725099809, | |
| "learning_rate": 8.223283968371945e-06, | |
| "loss": 0.3485, | |
| "mean_token_accuracy": 0.904576301574707, | |
| "num_tokens": 2199657.0, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 1.1025641025641026, | |
| "grad_norm": 1.9939775129861892, | |
| "learning_rate": 8.203839802952708e-06, | |
| "loss": 0.3518, | |
| "mean_token_accuracy": 0.9006724953651428, | |
| "num_tokens": 2209030.0, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 1.1072261072261071, | |
| "grad_norm": 2.318895943210041, | |
| "learning_rate": 8.184316310054355e-06, | |
| "loss": 0.3702, | |
| "mean_token_accuracy": 0.8964027166366577, | |
| "num_tokens": 2218265.0, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 1.1118881118881119, | |
| "grad_norm": 2.1196825455595003, | |
| "learning_rate": 8.164714062473201e-06, | |
| "loss": 0.3995, | |
| "mean_token_accuracy": 0.889014482498169, | |
| "num_tokens": 2227845.0, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 1.1165501165501166, | |
| "grad_norm": 2.1570886496790247, | |
| "learning_rate": 8.14503363531613e-06, | |
| "loss": 0.367, | |
| "mean_token_accuracy": 0.900328516960144, | |
| "num_tokens": 2237505.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.121212121212121, | |
| "grad_norm": 1.892665235627306, | |
| "learning_rate": 8.125275605983725e-06, | |
| "loss": 0.3458, | |
| "mean_token_accuracy": 0.9030775427818298, | |
| "num_tokens": 2246730.0, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 1.1258741258741258, | |
| "grad_norm": 1.848983532276759, | |
| "learning_rate": 8.10544055415332e-06, | |
| "loss": 0.3527, | |
| "mean_token_accuracy": 0.9045140743255615, | |
| "num_tokens": 2255845.0, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 1.1305361305361306, | |
| "grad_norm": 1.919023337141047, | |
| "learning_rate": 8.085529061762007e-06, | |
| "loss": 0.3663, | |
| "mean_token_accuracy": 0.8982928693294525, | |
| "num_tokens": 2264305.0, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 1.1351981351981353, | |
| "grad_norm": 1.8321927781965666, | |
| "learning_rate": 8.065541712989546e-06, | |
| "loss": 0.337, | |
| "mean_token_accuracy": 0.9019491672515869, | |
| "num_tokens": 2273361.0, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 1.1398601398601398, | |
| "grad_norm": 2.0269533829700013, | |
| "learning_rate": 8.04547909424124e-06, | |
| "loss": 0.3614, | |
| "mean_token_accuracy": 0.9004016220569611, | |
| "num_tokens": 2283178.0, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 1.1445221445221445, | |
| "grad_norm": 1.9130074634196965, | |
| "learning_rate": 8.025341794130722e-06, | |
| "loss": 0.3482, | |
| "mean_token_accuracy": 0.9014346897602081, | |
| "num_tokens": 2292865.0, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 1.1491841491841492, | |
| "grad_norm": 1.8547690304895703, | |
| "learning_rate": 8.005130403462687e-06, | |
| "loss": 0.3427, | |
| "mean_token_accuracy": 0.9018321931362152, | |
| "num_tokens": 2302310.0, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 1.1538461538461537, | |
| "grad_norm": 1.8535777462460932, | |
| "learning_rate": 7.98484551521556e-06, | |
| "loss": 0.3529, | |
| "mean_token_accuracy": 0.9022264182567596, | |
| "num_tokens": 2312189.0, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 1.1585081585081585, | |
| "grad_norm": 2.435521274717378, | |
| "learning_rate": 7.964487724524105e-06, | |
| "loss": 0.3926, | |
| "mean_token_accuracy": 0.8963950574398041, | |
| "num_tokens": 2321143.0, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 1.1631701631701632, | |
| "grad_norm": 2.3200358499575375, | |
| "learning_rate": 7.944057628661948e-06, | |
| "loss": 0.3675, | |
| "mean_token_accuracy": 0.8951332271099091, | |
| "num_tokens": 2330428.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.167832167832168, | |
| "grad_norm": 2.033123158008929, | |
| "learning_rate": 7.923555827024069e-06, | |
| "loss": 0.3621, | |
| "mean_token_accuracy": 0.8988972008228302, | |
| "num_tokens": 2339659.0, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 1.1724941724941724, | |
| "grad_norm": 1.9890967367444383, | |
| "learning_rate": 7.902982921109215e-06, | |
| "loss": 0.3734, | |
| "mean_token_accuracy": 0.8945567905902863, | |
| "num_tokens": 2348827.0, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 1.1771561771561772, | |
| "grad_norm": 1.9463270812100881, | |
| "learning_rate": 7.882339514502236e-06, | |
| "loss": 0.3334, | |
| "mean_token_accuracy": 0.9068220555782318, | |
| "num_tokens": 2359744.0, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 1.1818181818181819, | |
| "grad_norm": 1.8550101013313145, | |
| "learning_rate": 7.861626212856404e-06, | |
| "loss": 0.3528, | |
| "mean_token_accuracy": 0.9038570523262024, | |
| "num_tokens": 2368857.0, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 1.1864801864801864, | |
| "grad_norm": 1.8873717371173877, | |
| "learning_rate": 7.840843623875621e-06, | |
| "loss": 0.3496, | |
| "mean_token_accuracy": 0.9028986990451813, | |
| "num_tokens": 2378219.0, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 1.191142191142191, | |
| "grad_norm": 1.9979141390235062, | |
| "learning_rate": 7.8199923572966e-06, | |
| "loss": 0.3789, | |
| "mean_token_accuracy": 0.8938669860363007, | |
| "num_tokens": 2387549.0, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 1.1958041958041958, | |
| "grad_norm": 2.016685234805154, | |
| "learning_rate": 7.799073024870972e-06, | |
| "loss": 0.3611, | |
| "mean_token_accuracy": 0.8983322978019714, | |
| "num_tokens": 2397564.0, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 1.2004662004662006, | |
| "grad_norm": 1.9415097190450272, | |
| "learning_rate": 7.778086240347343e-06, | |
| "loss": 0.3468, | |
| "mean_token_accuracy": 0.9009680449962616, | |
| "num_tokens": 2406482.0, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 1.205128205128205, | |
| "grad_norm": 2.2166846370519435, | |
| "learning_rate": 7.757032619453285e-06, | |
| "loss": 0.3436, | |
| "mean_token_accuracy": 0.9036185145378113, | |
| "num_tokens": 2415453.0, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 1.2097902097902098, | |
| "grad_norm": 1.7953859731976027, | |
| "learning_rate": 7.735912779877266e-06, | |
| "loss": 0.3557, | |
| "mean_token_accuracy": 0.9016382992267609, | |
| "num_tokens": 2424373.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.2144522144522145, | |
| "grad_norm": 2.0224193055579796, | |
| "learning_rate": 7.714727341250533e-06, | |
| "loss": 0.3206, | |
| "mean_token_accuracy": 0.910454124212265, | |
| "num_tokens": 2433081.0, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 1.219114219114219, | |
| "grad_norm": 2.105326719368714, | |
| "learning_rate": 7.693476925128937e-06, | |
| "loss": 0.4086, | |
| "mean_token_accuracy": 0.8892490267753601, | |
| "num_tokens": 2442248.0, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 1.2237762237762237, | |
| "grad_norm": 2.054961145659142, | |
| "learning_rate": 7.672162154974686e-06, | |
| "loss": 0.3334, | |
| "mean_token_accuracy": 0.9058608114719391, | |
| "num_tokens": 2451308.0, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 1.2284382284382285, | |
| "grad_norm": 2.0190722916864448, | |
| "learning_rate": 7.650783656138065e-06, | |
| "loss": 0.3842, | |
| "mean_token_accuracy": 0.8919758200645447, | |
| "num_tokens": 2461071.0, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 1.2331002331002332, | |
| "grad_norm": 2.0459573081488664, | |
| "learning_rate": 7.629342055839077e-06, | |
| "loss": 0.3466, | |
| "mean_token_accuracy": 0.9024505317211151, | |
| "num_tokens": 2471649.0, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 1.2377622377622377, | |
| "grad_norm": 2.045255678828029, | |
| "learning_rate": 7.607837983149057e-06, | |
| "loss": 0.3902, | |
| "mean_token_accuracy": 0.8912324607372284, | |
| "num_tokens": 2481584.0, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 1.2424242424242424, | |
| "grad_norm": 2.0728524474968553, | |
| "learning_rate": 7.586272068972196e-06, | |
| "loss": 0.4005, | |
| "mean_token_accuracy": 0.8886791169643402, | |
| "num_tokens": 2491498.0, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 1.2470862470862472, | |
| "grad_norm": 1.9682634266288053, | |
| "learning_rate": 7.564644946027049e-06, | |
| "loss": 0.3578, | |
| "mean_token_accuracy": 0.9002366364002228, | |
| "num_tokens": 2501066.0, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 1.2517482517482517, | |
| "grad_norm": 1.7696539616448115, | |
| "learning_rate": 7.5429572488279615e-06, | |
| "loss": 0.3167, | |
| "mean_token_accuracy": 0.9112659692764282, | |
| "num_tokens": 2510213.0, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 1.2564102564102564, | |
| "grad_norm": 1.869201624827092, | |
| "learning_rate": 7.521209613666457e-06, | |
| "loss": 0.3298, | |
| "mean_token_accuracy": 0.9096185564994812, | |
| "num_tokens": 2518850.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.2610722610722611, | |
| "grad_norm": 1.7896035852004597, | |
| "learning_rate": 7.499402678592568e-06, | |
| "loss": 0.3452, | |
| "mean_token_accuracy": 0.902194082736969, | |
| "num_tokens": 2528729.0, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 1.2657342657342658, | |
| "grad_norm": 1.8578212430775984, | |
| "learning_rate": 7.477537083396114e-06, | |
| "loss": 0.3377, | |
| "mean_token_accuracy": 0.9030162990093231, | |
| "num_tokens": 2538576.0, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 1.2703962703962703, | |
| "grad_norm": 1.8583558341062996, | |
| "learning_rate": 7.45561346958794e-06, | |
| "loss": 0.3327, | |
| "mean_token_accuracy": 0.9063755571842194, | |
| "num_tokens": 2548020.0, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 1.275058275058275, | |
| "grad_norm": 1.945988010631059, | |
| "learning_rate": 7.433632480381083e-06, | |
| "loss": 0.3535, | |
| "mean_token_accuracy": 0.9046582877635956, | |
| "num_tokens": 2556984.0, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 1.2797202797202798, | |
| "grad_norm": 2.0099202078958958, | |
| "learning_rate": 7.4115947606719105e-06, | |
| "loss": 0.3612, | |
| "mean_token_accuracy": 0.899190753698349, | |
| "num_tokens": 2566255.0, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 1.2843822843822843, | |
| "grad_norm": 2.1032377215815, | |
| "learning_rate": 7.389500957021192e-06, | |
| "loss": 0.351, | |
| "mean_token_accuracy": 0.9027476012706757, | |
| "num_tokens": 2575331.0, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 1.289044289044289, | |
| "grad_norm": 2.0973543646568227, | |
| "learning_rate": 7.367351717635136e-06, | |
| "loss": 0.3561, | |
| "mean_token_accuracy": 0.899603396654129, | |
| "num_tokens": 2584609.0, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 1.2937062937062938, | |
| "grad_norm": 1.9524574422408152, | |
| "learning_rate": 7.345147692346373e-06, | |
| "loss": 0.3621, | |
| "mean_token_accuracy": 0.8965962827205658, | |
| "num_tokens": 2593822.0, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 1.2983682983682985, | |
| "grad_norm": 1.8684420517312623, | |
| "learning_rate": 7.3228895325948835e-06, | |
| "loss": 0.3404, | |
| "mean_token_accuracy": 0.9057947397232056, | |
| "num_tokens": 2602990.0, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 1.303030303030303, | |
| "grad_norm": 1.8423583661355911, | |
| "learning_rate": 7.3005778914088895e-06, | |
| "loss": 0.3181, | |
| "mean_token_accuracy": 0.9117348790168762, | |
| "num_tokens": 2612564.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.3076923076923077, | |
| "grad_norm": 1.8719988018492713, | |
| "learning_rate": 7.278213423385701e-06, | |
| "loss": 0.3633, | |
| "mean_token_accuracy": 0.8996273577213287, | |
| "num_tokens": 2621342.0, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 1.3123543123543124, | |
| "grad_norm": 2.097992632063848, | |
| "learning_rate": 7.255796784672496e-06, | |
| "loss": 0.3772, | |
| "mean_token_accuracy": 0.8940203785896301, | |
| "num_tokens": 2631008.0, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 1.317016317016317, | |
| "grad_norm": 2.1962950013308147, | |
| "learning_rate": 7.233328632947087e-06, | |
| "loss": 0.3176, | |
| "mean_token_accuracy": 0.9080378413200378, | |
| "num_tokens": 2640269.0, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 1.3216783216783217, | |
| "grad_norm": 1.8870826341467308, | |
| "learning_rate": 7.210809627398615e-06, | |
| "loss": 0.4153, | |
| "mean_token_accuracy": 0.8845455944538116, | |
| "num_tokens": 2650339.0, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 1.3263403263403264, | |
| "grad_norm": 2.4502915111296306, | |
| "learning_rate": 7.188240428708211e-06, | |
| "loss": 0.3789, | |
| "mean_token_accuracy": 0.8997355103492737, | |
| "num_tokens": 2659574.0, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 1.3310023310023311, | |
| "grad_norm": 1.8548118165118153, | |
| "learning_rate": 7.165621699029615e-06, | |
| "loss": 0.3484, | |
| "mean_token_accuracy": 0.9010252058506012, | |
| "num_tokens": 2670228.0, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 1.3356643356643356, | |
| "grad_norm": 1.9980570026677085, | |
| "learning_rate": 7.1429541019697505e-06, | |
| "loss": 0.3404, | |
| "mean_token_accuracy": 0.9027169346809387, | |
| "num_tokens": 2679552.0, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 1.3403263403263403, | |
| "grad_norm": 2.0041119431302143, | |
| "learning_rate": 7.120238302569245e-06, | |
| "loss": 0.3582, | |
| "mean_token_accuracy": 0.8998216986656189, | |
| "num_tokens": 2689039.0, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 1.3449883449883449, | |
| "grad_norm": 2.1117609014334198, | |
| "learning_rate": 7.097474967282936e-06, | |
| "loss": 0.3538, | |
| "mean_token_accuracy": 0.9006753861904144, | |
| "num_tokens": 2698186.0, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 1.3496503496503496, | |
| "grad_norm": 2.0486984888660893, | |
| "learning_rate": 7.0746647639602994e-06, | |
| "loss": 0.3735, | |
| "mean_token_accuracy": 0.8939989805221558, | |
| "num_tokens": 2707633.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.3543123543123543, | |
| "grad_norm": 2.0849324489543966, | |
| "learning_rate": 7.051808361825867e-06, | |
| "loss": 0.3615, | |
| "mean_token_accuracy": 0.8953780829906464, | |
| "num_tokens": 2717211.0, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 1.358974358974359, | |
| "grad_norm": 2.0186929363318846, | |
| "learning_rate": 7.028906431459593e-06, | |
| "loss": 0.3721, | |
| "mean_token_accuracy": 0.8980013132095337, | |
| "num_tokens": 2726028.0, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 1.3636363636363638, | |
| "grad_norm": 2.1845476421129746, | |
| "learning_rate": 7.0059596447771714e-06, | |
| "loss": 0.3582, | |
| "mean_token_accuracy": 0.9010344445705414, | |
| "num_tokens": 2735300.0, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 1.3682983682983683, | |
| "grad_norm": 1.895694935116521, | |
| "learning_rate": 6.982968675010332e-06, | |
| "loss": 0.3425, | |
| "mean_token_accuracy": 0.904049277305603, | |
| "num_tokens": 2745014.0, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 1.372960372960373, | |
| "grad_norm": 2.1465561267774618, | |
| "learning_rate": 6.959934196687079e-06, | |
| "loss": 0.3973, | |
| "mean_token_accuracy": 0.8906912803649902, | |
| "num_tokens": 2754679.0, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 1.3776223776223775, | |
| "grad_norm": 2.1936247573101584, | |
| "learning_rate": 6.93685688561191e-06, | |
| "loss": 0.3958, | |
| "mean_token_accuracy": 0.8900346755981445, | |
| "num_tokens": 2763778.0, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 1.3822843822843822, | |
| "grad_norm": 2.2014931147614707, | |
| "learning_rate": 6.913737418845985e-06, | |
| "loss": 0.3484, | |
| "mean_token_accuracy": 0.8970995843410492, | |
| "num_tokens": 2772924.0, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 1.386946386946387, | |
| "grad_norm": 1.9248709270571454, | |
| "learning_rate": 6.890576474687264e-06, | |
| "loss": 0.374, | |
| "mean_token_accuracy": 0.8932155966758728, | |
| "num_tokens": 2782527.0, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 1.3916083916083917, | |
| "grad_norm": 2.127937137873515, | |
| "learning_rate": 6.8673747326506e-06, | |
| "loss": 0.4019, | |
| "mean_token_accuracy": 0.8896147608757019, | |
| "num_tokens": 2792249.0, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 1.3962703962703964, | |
| "grad_norm": 2.2064832449632656, | |
| "learning_rate": 6.8441328734478115e-06, | |
| "loss": 0.3501, | |
| "mean_token_accuracy": 0.8990257680416107, | |
| "num_tokens": 2801881.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.400932400932401, | |
| "grad_norm": 1.8573619457430928, | |
| "learning_rate": 6.820851578967708e-06, | |
| "loss": 0.3872, | |
| "mean_token_accuracy": 0.8920771181583405, | |
| "num_tokens": 2811478.0, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 1.4055944055944056, | |
| "grad_norm": 2.0254753288605225, | |
| "learning_rate": 6.797531532256079e-06, | |
| "loss": 0.3734, | |
| "mean_token_accuracy": 0.8915270268917084, | |
| "num_tokens": 2821393.0, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 1.4102564102564101, | |
| "grad_norm": 2.0305791181245927, | |
| "learning_rate": 6.774173417495667e-06, | |
| "loss": 0.3913, | |
| "mean_token_accuracy": 0.8924152255058289, | |
| "num_tokens": 2830925.0, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 1.4149184149184149, | |
| "grad_norm": 2.004426882378005, | |
| "learning_rate": 6.750777919986075e-06, | |
| "loss": 0.3446, | |
| "mean_token_accuracy": 0.900405764579773, | |
| "num_tokens": 2840991.0, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 1.4195804195804196, | |
| "grad_norm": 1.888471422341293, | |
| "learning_rate": 6.727345726123684e-06, | |
| "loss": 0.4219, | |
| "mean_token_accuracy": 0.8845990300178528, | |
| "num_tokens": 2850900.0, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 1.4242424242424243, | |
| "grad_norm": 2.3906699897573067, | |
| "learning_rate": 6.703877523381495e-06, | |
| "loss": 0.339, | |
| "mean_token_accuracy": 0.90447598695755, | |
| "num_tokens": 2860353.0, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 1.428904428904429, | |
| "grad_norm": 1.84404450023143, | |
| "learning_rate": 6.680374000288968e-06, | |
| "loss": 0.3976, | |
| "mean_token_accuracy": 0.8878736197948456, | |
| "num_tokens": 2868792.0, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 1.4335664335664335, | |
| "grad_norm": 2.2669491579710908, | |
| "learning_rate": 6.656835846411824e-06, | |
| "loss": 0.3815, | |
| "mean_token_accuracy": 0.892831951379776, | |
| "num_tokens": 2877774.0, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 1.4382284382284383, | |
| "grad_norm": 2.049722778481883, | |
| "learning_rate": 6.633263752331808e-06, | |
| "loss": 0.341, | |
| "mean_token_accuracy": 0.9073670506477356, | |
| "num_tokens": 2886388.0, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 1.4428904428904428, | |
| "grad_norm": 1.9762730207593977, | |
| "learning_rate": 6.609658409626431e-06, | |
| "loss": 0.3336, | |
| "mean_token_accuracy": 0.9044366478919983, | |
| "num_tokens": 2896615.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.4475524475524475, | |
| "grad_norm": 2.0368692579640504, | |
| "learning_rate": 6.586020510848676e-06, | |
| "loss": 0.3983, | |
| "mean_token_accuracy": 0.8900512158870697, | |
| "num_tokens": 2905800.0, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 1.4522144522144522, | |
| "grad_norm": 2.2134931405904728, | |
| "learning_rate": 6.562350749506691e-06, | |
| "loss": 0.3625, | |
| "mean_token_accuracy": 0.8963052034378052, | |
| "num_tokens": 2914643.0, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 1.456876456876457, | |
| "grad_norm": 2.0611362055815206, | |
| "learning_rate": 6.538649820043427e-06, | |
| "loss": 0.3273, | |
| "mean_token_accuracy": 0.9081335067749023, | |
| "num_tokens": 2924023.0, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 1.4615384615384617, | |
| "grad_norm": 2.054315048122934, | |
| "learning_rate": 6.514918417816275e-06, | |
| "loss": 0.3644, | |
| "mean_token_accuracy": 0.8971601724624634, | |
| "num_tokens": 2933568.0, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 1.4662004662004662, | |
| "grad_norm": 1.8809804843640063, | |
| "learning_rate": 6.4911572390766575e-06, | |
| "loss": 0.3577, | |
| "mean_token_accuracy": 0.9007624089717865, | |
| "num_tokens": 2943220.0, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 1.470862470862471, | |
| "grad_norm": 2.2331671138955405, | |
| "learning_rate": 6.46736698094961e-06, | |
| "loss": 0.3637, | |
| "mean_token_accuracy": 0.8960618078708649, | |
| "num_tokens": 2953179.0, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 1.4755244755244754, | |
| "grad_norm": 1.9954505430707832, | |
| "learning_rate": 6.443548341413316e-06, | |
| "loss": 0.3644, | |
| "mean_token_accuracy": 0.8977023661136627, | |
| "num_tokens": 2961793.0, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 1.4801864801864801, | |
| "grad_norm": 1.9724699365395055, | |
| "learning_rate": 6.419702019278643e-06, | |
| "loss": 0.3758, | |
| "mean_token_accuracy": 0.8944825232028961, | |
| "num_tokens": 2971416.0, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 1.4848484848484849, | |
| "grad_norm": 2.0154500749099387, | |
| "learning_rate": 6.3958287141686294e-06, | |
| "loss": 0.3699, | |
| "mean_token_accuracy": 0.8967875242233276, | |
| "num_tokens": 2980488.0, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 1.4895104895104896, | |
| "grad_norm": 1.9968714801195095, | |
| "learning_rate": 6.371929126497963e-06, | |
| "loss": 0.3482, | |
| "mean_token_accuracy": 0.8994722068309784, | |
| "num_tokens": 2989801.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.494172494172494, | |
| "grad_norm": 1.8686828557788273, | |
| "learning_rate": 6.348003957452433e-06, | |
| "loss": 0.3408, | |
| "mean_token_accuracy": 0.9057431817054749, | |
| "num_tokens": 2998497.0, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 1.4988344988344988, | |
| "grad_norm": 1.961491939994152, | |
| "learning_rate": 6.324053908968353e-06, | |
| "loss": 0.3708, | |
| "mean_token_accuracy": 0.898443877696991, | |
| "num_tokens": 3008134.0, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 1.5034965034965035, | |
| "grad_norm": 2.074784243247548, | |
| "learning_rate": 6.300079683711973e-06, | |
| "loss": 0.3431, | |
| "mean_token_accuracy": 0.9046167731285095, | |
| "num_tokens": 3017363.0, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 1.508158508158508, | |
| "grad_norm": 2.0249158717108338, | |
| "learning_rate": 6.276081985058857e-06, | |
| "loss": 0.3313, | |
| "mean_token_accuracy": 0.9048315584659576, | |
| "num_tokens": 3026359.0, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 1.5128205128205128, | |
| "grad_norm": 2.0275965811268506, | |
| "learning_rate": 6.2520615170732555e-06, | |
| "loss": 0.364, | |
| "mean_token_accuracy": 0.8976201117038727, | |
| "num_tokens": 3035205.0, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 1.5174825174825175, | |
| "grad_norm": 2.1956087559799706, | |
| "learning_rate": 6.228018984487443e-06, | |
| "loss": 0.4091, | |
| "mean_token_accuracy": 0.8842986524105072, | |
| "num_tokens": 3044415.0, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 1.5221445221445222, | |
| "grad_norm": 2.454246418757256, | |
| "learning_rate": 6.20395509268104e-06, | |
| "loss": 0.3733, | |
| "mean_token_accuracy": 0.8995526134967804, | |
| "num_tokens": 3053666.0, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 1.526806526806527, | |
| "grad_norm": 2.1884276223847343, | |
| "learning_rate": 6.179870547660326e-06, | |
| "loss": 0.3684, | |
| "mean_token_accuracy": 0.8975951671600342, | |
| "num_tokens": 3063098.0, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 1.5314685314685315, | |
| "grad_norm": 2.1597054844225747, | |
| "learning_rate": 6.15576605603752e-06, | |
| "loss": 0.3949, | |
| "mean_token_accuracy": 0.8913069069385529, | |
| "num_tokens": 3072433.0, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 1.5361305361305362, | |
| "grad_norm": 2.113291542241382, | |
| "learning_rate": 6.13164232501005e-06, | |
| "loss": 0.3543, | |
| "mean_token_accuracy": 0.9012714624404907, | |
| "num_tokens": 3081854.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.5407925407925407, | |
| "grad_norm": 1.9680970926751453, | |
| "learning_rate": 6.107500062339806e-06, | |
| "loss": 0.3338, | |
| "mean_token_accuracy": 0.906892865896225, | |
| "num_tokens": 3090890.0, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 1.5454545454545454, | |
| "grad_norm": 1.9286059557341297, | |
| "learning_rate": 6.083339976332375e-06, | |
| "loss": 0.3927, | |
| "mean_token_accuracy": 0.8934414088726044, | |
| "num_tokens": 3100652.0, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 1.5501165501165501, | |
| "grad_norm": 2.086637851483338, | |
| "learning_rate": 6.05916277581626e-06, | |
| "loss": 0.3607, | |
| "mean_token_accuracy": 0.8994470834732056, | |
| "num_tokens": 3109678.0, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 1.5547785547785549, | |
| "grad_norm": 2.0191393522647414, | |
| "learning_rate": 6.034969170122079e-06, | |
| "loss": 0.3506, | |
| "mean_token_accuracy": 0.9034914076328278, | |
| "num_tokens": 3118426.0, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 1.5594405594405596, | |
| "grad_norm": 2.0591466593650005, | |
| "learning_rate": 6.010759869061768e-06, | |
| "loss": 0.3887, | |
| "mean_token_accuracy": 0.8912419378757477, | |
| "num_tokens": 3128508.0, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 1.564102564102564, | |
| "grad_norm": 1.9767624630895566, | |
| "learning_rate": 5.986535582907739e-06, | |
| "loss": 0.3392, | |
| "mean_token_accuracy": 0.9024395048618317, | |
| "num_tokens": 3138063.0, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 1.5687645687645686, | |
| "grad_norm": 1.968183719788235, | |
| "learning_rate": 5.96229702237205e-06, | |
| "loss": 0.3617, | |
| "mean_token_accuracy": 0.8971932530403137, | |
| "num_tokens": 3147501.0, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 1.5734265734265733, | |
| "grad_norm": 1.9372262604926125, | |
| "learning_rate": 5.938044898585555e-06, | |
| "loss": 0.3583, | |
| "mean_token_accuracy": 0.9022301137447357, | |
| "num_tokens": 3156896.0, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 1.578088578088578, | |
| "grad_norm": 2.007232850048927, | |
| "learning_rate": 5.913779923077035e-06, | |
| "loss": 0.3512, | |
| "mean_token_accuracy": 0.8955269753932953, | |
| "num_tokens": 3166931.0, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 1.5827505827505828, | |
| "grad_norm": 1.906142795024627, | |
| "learning_rate": 5.889502807752329e-06, | |
| "loss": 0.3772, | |
| "mean_token_accuracy": 0.8949233889579773, | |
| "num_tokens": 3176414.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.5874125874125875, | |
| "grad_norm": 2.1876602349847154, | |
| "learning_rate": 5.865214264873441e-06, | |
| "loss": 0.4217, | |
| "mean_token_accuracy": 0.8859592080116272, | |
| "num_tokens": 3185570.0, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 1.5920745920745922, | |
| "grad_norm": 2.036501182255171, | |
| "learning_rate": 5.840915007037648e-06, | |
| "loss": 0.3967, | |
| "mean_token_accuracy": 0.891197919845581, | |
| "num_tokens": 3194296.0, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 1.5967365967365967, | |
| "grad_norm": 2.165455182761244, | |
| "learning_rate": 5.816605747156588e-06, | |
| "loss": 0.3858, | |
| "mean_token_accuracy": 0.8901144564151764, | |
| "num_tokens": 3204108.0, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 1.6013986013986012, | |
| "grad_norm": 2.020010750738948, | |
| "learning_rate": 5.792287198435349e-06, | |
| "loss": 0.3321, | |
| "mean_token_accuracy": 0.9072842001914978, | |
| "num_tokens": 3213706.0, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 1.606060606060606, | |
| "grad_norm": 1.8267767685145804, | |
| "learning_rate": 5.767960074351545e-06, | |
| "loss": 0.3166, | |
| "mean_token_accuracy": 0.9123204052448273, | |
| "num_tokens": 3223445.0, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 1.6107226107226107, | |
| "grad_norm": 1.8249584358478421, | |
| "learning_rate": 5.74362508863438e-06, | |
| "loss": 0.3557, | |
| "mean_token_accuracy": 0.9038134515285492, | |
| "num_tokens": 3232428.0, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 1.6153846153846154, | |
| "grad_norm": 2.031208096971931, | |
| "learning_rate": 5.719282955243705e-06, | |
| "loss": 0.3729, | |
| "mean_token_accuracy": 0.8925433158874512, | |
| "num_tokens": 3241575.0, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 1.6200466200466201, | |
| "grad_norm": 2.1078269837893817, | |
| "learning_rate": 5.69493438834908e-06, | |
| "loss": 0.3927, | |
| "mean_token_accuracy": 0.89081871509552, | |
| "num_tokens": 3250577.0, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 1.6247086247086249, | |
| "grad_norm": 2.053266844522503, | |
| "learning_rate": 5.670580102308816e-06, | |
| "loss": 0.3708, | |
| "mean_token_accuracy": 0.8976758718490601, | |
| "num_tokens": 3259305.0, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 1.6293706293706294, | |
| "grad_norm": 2.2217560492827486, | |
| "learning_rate": 5.646220811649013e-06, | |
| "loss": 0.3631, | |
| "mean_token_accuracy": 0.9028047621250153, | |
| "num_tokens": 3268502.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.6340326340326339, | |
| "grad_norm": 1.9993214416997105, | |
| "learning_rate": 5.6218572310426065e-06, | |
| "loss": 0.3751, | |
| "mean_token_accuracy": 0.8933521211147308, | |
| "num_tokens": 3276568.0, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 1.6386946386946386, | |
| "grad_norm": 2.1734690923322115, | |
| "learning_rate": 5.59749007528839e-06, | |
| "loss": 0.3498, | |
| "mean_token_accuracy": 0.9007490277290344, | |
| "num_tokens": 3285881.0, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 1.6433566433566433, | |
| "grad_norm": 1.98059234929291, | |
| "learning_rate": 5.573120059290047e-06, | |
| "loss": 0.3484, | |
| "mean_token_accuracy": 0.9028733670711517, | |
| "num_tokens": 3295490.0, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 1.648018648018648, | |
| "grad_norm": 1.8606757870005566, | |
| "learning_rate": 5.5487478980351805e-06, | |
| "loss": 0.342, | |
| "mean_token_accuracy": 0.9035466313362122, | |
| "num_tokens": 3304645.0, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 1.6526806526806528, | |
| "grad_norm": 2.0090495432050726, | |
| "learning_rate": 5.524374306574331e-06, | |
| "loss": 0.3516, | |
| "mean_token_accuracy": 0.9037186503410339, | |
| "num_tokens": 3313188.0, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 1.6573426573426573, | |
| "grad_norm": 1.9688667212238595, | |
| "learning_rate": 5.500000000000001e-06, | |
| "loss": 0.3673, | |
| "mean_token_accuracy": 0.9001396596431732, | |
| "num_tokens": 3322514.0, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 1.662004662004662, | |
| "grad_norm": 2.1361286046991568, | |
| "learning_rate": 5.47562569342567e-06, | |
| "loss": 0.3553, | |
| "mean_token_accuracy": 0.9002968966960907, | |
| "num_tokens": 3332207.0, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 1.6666666666666665, | |
| "grad_norm": 1.9983765145158914, | |
| "learning_rate": 5.451252101964821e-06, | |
| "loss": 0.345, | |
| "mean_token_accuracy": 0.9027390778064728, | |
| "num_tokens": 3342114.0, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 1.6713286713286712, | |
| "grad_norm": 2.221522630829983, | |
| "learning_rate": 5.426879940709956e-06, | |
| "loss": 0.4167, | |
| "mean_token_accuracy": 0.8860217034816742, | |
| "num_tokens": 3351663.0, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 1.675990675990676, | |
| "grad_norm": 2.351737145138835, | |
| "learning_rate": 5.402509924711612e-06, | |
| "loss": 0.3787, | |
| "mean_token_accuracy": 0.8979602456092834, | |
| "num_tokens": 3360915.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.6806526806526807, | |
| "grad_norm": 2.2067080103402366, | |
| "learning_rate": 5.378142768957396e-06, | |
| "loss": 0.3694, | |
| "mean_token_accuracy": 0.8943982720375061, | |
| "num_tokens": 3370401.0, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 1.6853146853146854, | |
| "grad_norm": 2.2270263309806366, | |
| "learning_rate": 5.353779188350989e-06, | |
| "loss": 0.3982, | |
| "mean_token_accuracy": 0.8878339529037476, | |
| "num_tokens": 3380711.0, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 1.68997668997669, | |
| "grad_norm": 1.8254660836200485, | |
| "learning_rate": 5.329419897691187e-06, | |
| "loss": 0.3084, | |
| "mean_token_accuracy": 0.9157176613807678, | |
| "num_tokens": 3390032.0, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 1.6946386946386947, | |
| "grad_norm": 1.8835789154592308, | |
| "learning_rate": 5.305065611650921e-06, | |
| "loss": 0.3653, | |
| "mean_token_accuracy": 0.8960134088993073, | |
| "num_tokens": 3399395.0, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 1.6993006993006992, | |
| "grad_norm": 1.9906870579116909, | |
| "learning_rate": 5.280717044756298e-06, | |
| "loss": 0.3656, | |
| "mean_token_accuracy": 0.8999285995960236, | |
| "num_tokens": 3408677.0, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 1.7039627039627039, | |
| "grad_norm": 2.0749533505721245, | |
| "learning_rate": 5.256374911365621e-06, | |
| "loss": 0.3439, | |
| "mean_token_accuracy": 0.9051547646522522, | |
| "num_tokens": 3418478.0, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 1.7086247086247086, | |
| "grad_norm": 1.8643996151955922, | |
| "learning_rate": 5.232039925648457e-06, | |
| "loss": 0.3449, | |
| "mean_token_accuracy": 0.9024885594844818, | |
| "num_tokens": 3427641.0, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 1.7132867132867133, | |
| "grad_norm": 1.8625024767097098, | |
| "learning_rate": 5.207712801564652e-06, | |
| "loss": 0.3371, | |
| "mean_token_accuracy": 0.9041755497455597, | |
| "num_tokens": 3437250.0, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 1.717948717948718, | |
| "grad_norm": 2.0436576954469823, | |
| "learning_rate": 5.1833942528434145e-06, | |
| "loss": 0.3721, | |
| "mean_token_accuracy": 0.8964725732803345, | |
| "num_tokens": 3446807.0, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 1.7226107226107226, | |
| "grad_norm": 1.9839081705779613, | |
| "learning_rate": 5.159084992962354e-06, | |
| "loss": 0.3303, | |
| "mean_token_accuracy": 0.9086009562015533, | |
| "num_tokens": 3456306.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.7272727272727273, | |
| "grad_norm": 2.0000086910401667, | |
| "learning_rate": 5.13478573512656e-06, | |
| "loss": 0.3667, | |
| "mean_token_accuracy": 0.8962165117263794, | |
| "num_tokens": 3466027.0, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 1.7319347319347318, | |
| "grad_norm": 2.038609761919514, | |
| "learning_rate": 5.110497192247671e-06, | |
| "loss": 0.3434, | |
| "mean_token_accuracy": 0.9010106325149536, | |
| "num_tokens": 3475253.0, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 1.7365967365967365, | |
| "grad_norm": 2.1715737014463894, | |
| "learning_rate": 5.086220076922965e-06, | |
| "loss": 0.3448, | |
| "mean_token_accuracy": 0.9069474339485168, | |
| "num_tokens": 3484178.0, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 1.7412587412587412, | |
| "grad_norm": 1.867996028633842, | |
| "learning_rate": 5.061955101414448e-06, | |
| "loss": 0.3381, | |
| "mean_token_accuracy": 0.9080156981945038, | |
| "num_tokens": 3494001.0, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 1.745920745920746, | |
| "grad_norm": 2.1685787201422837, | |
| "learning_rate": 5.0377029776279514e-06, | |
| "loss": 0.3693, | |
| "mean_token_accuracy": 0.902137279510498, | |
| "num_tokens": 3503472.0, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.7505827505827507, | |
| "grad_norm": 2.054942165163828, | |
| "learning_rate": 5.013464417092263e-06, | |
| "loss": 0.3739, | |
| "mean_token_accuracy": 0.8999918103218079, | |
| "num_tokens": 3512786.0, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 1.7552447552447552, | |
| "grad_norm": 2.0866190139943246, | |
| "learning_rate": 4.989240130938232e-06, | |
| "loss": 0.3298, | |
| "mean_token_accuracy": 0.9082687795162201, | |
| "num_tokens": 3522266.0, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 1.75990675990676, | |
| "grad_norm": 1.9164366875851901, | |
| "learning_rate": 4.9650308298779215e-06, | |
| "loss": 0.3728, | |
| "mean_token_accuracy": 0.8957255184650421, | |
| "num_tokens": 3531741.0, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 1.7645687645687644, | |
| "grad_norm": 2.1286294755388386, | |
| "learning_rate": 4.940837224183742e-06, | |
| "loss": 0.3469, | |
| "mean_token_accuracy": 0.9021869897842407, | |
| "num_tokens": 3541026.0, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 1.7692307692307692, | |
| "grad_norm": 1.9795732637258567, | |
| "learning_rate": 4.916660023667627e-06, | |
| "loss": 0.389, | |
| "mean_token_accuracy": 0.8947827517986298, | |
| "num_tokens": 3550336.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.7738927738927739, | |
| "grad_norm": 2.2291523062610845, | |
| "learning_rate": 4.892499937660195e-06, | |
| "loss": 0.3698, | |
| "mean_token_accuracy": 0.9012640714645386, | |
| "num_tokens": 3558944.0, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 1.7785547785547786, | |
| "grad_norm": 2.142983628191579, | |
| "learning_rate": 4.8683576749899505e-06, | |
| "loss": 0.3583, | |
| "mean_token_accuracy": 0.896765947341919, | |
| "num_tokens": 3567747.0, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 1.7832167832167833, | |
| "grad_norm": 2.1501200195656134, | |
| "learning_rate": 4.844233943962481e-06, | |
| "loss": 0.3476, | |
| "mean_token_accuracy": 0.9035030007362366, | |
| "num_tokens": 3576606.0, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 1.7878787878787878, | |
| "grad_norm": 1.8885760684090536, | |
| "learning_rate": 4.820129452339676e-06, | |
| "loss": 0.3913, | |
| "mean_token_accuracy": 0.8892778158187866, | |
| "num_tokens": 3586451.0, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 1.7925407925407926, | |
| "grad_norm": 2.3749625105377423, | |
| "learning_rate": 4.796044907318961e-06, | |
| "loss": 0.3493, | |
| "mean_token_accuracy": 0.9036368727684021, | |
| "num_tokens": 3595348.0, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 1.797202797202797, | |
| "grad_norm": 2.1616552910281044, | |
| "learning_rate": 4.771981015512559e-06, | |
| "loss": 0.404, | |
| "mean_token_accuracy": 0.888810396194458, | |
| "num_tokens": 3605507.0, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 1.8018648018648018, | |
| "grad_norm": 1.859001299709581, | |
| "learning_rate": 4.747938482926746e-06, | |
| "loss": 0.3422, | |
| "mean_token_accuracy": 0.9032579958438873, | |
| "num_tokens": 3615183.0, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 1.8065268065268065, | |
| "grad_norm": 2.1152225374726337, | |
| "learning_rate": 4.723918014941144e-06, | |
| "loss": 0.4379, | |
| "mean_token_accuracy": 0.8827772438526154, | |
| "num_tokens": 3624158.0, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 1.8111888111888113, | |
| "grad_norm": 2.149586435718605, | |
| "learning_rate": 4.69992031628803e-06, | |
| "loss": 0.3702, | |
| "mean_token_accuracy": 0.8978259861469269, | |
| "num_tokens": 3633033.0, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 1.815850815850816, | |
| "grad_norm": 2.0006781263840843, | |
| "learning_rate": 4.675946091031648e-06, | |
| "loss": 0.3604, | |
| "mean_token_accuracy": 0.901467889547348, | |
| "num_tokens": 3642167.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.8205128205128205, | |
| "grad_norm": 1.9865318354604984, | |
| "learning_rate": 4.65199604254757e-06, | |
| "loss": 0.3266, | |
| "mean_token_accuracy": 0.902495414018631, | |
| "num_tokens": 3651178.0, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 1.8251748251748252, | |
| "grad_norm": 1.8174794166059944, | |
| "learning_rate": 4.628070873502038e-06, | |
| "loss": 0.3595, | |
| "mean_token_accuracy": 0.8980917930603027, | |
| "num_tokens": 3660853.0, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 1.8298368298368297, | |
| "grad_norm": 1.8237936767860705, | |
| "learning_rate": 4.604171285831373e-06, | |
| "loss": 0.3247, | |
| "mean_token_accuracy": 0.9076565206050873, | |
| "num_tokens": 3671265.0, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 1.8344988344988344, | |
| "grad_norm": 2.967734088138488, | |
| "learning_rate": 4.5802979807213585e-06, | |
| "loss": 0.3284, | |
| "mean_token_accuracy": 0.9076884090900421, | |
| "num_tokens": 3681935.0, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 1.8391608391608392, | |
| "grad_norm": 1.7848473602660753, | |
| "learning_rate": 4.556451658586687e-06, | |
| "loss": 0.3327, | |
| "mean_token_accuracy": 0.9050241410732269, | |
| "num_tokens": 3690809.0, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 1.843822843822844, | |
| "grad_norm": 2.132215578463288, | |
| "learning_rate": 4.532633019050392e-06, | |
| "loss": 0.3344, | |
| "mean_token_accuracy": 0.9007371068000793, | |
| "num_tokens": 3700254.0, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 1.8484848484848486, | |
| "grad_norm": 1.802687602683309, | |
| "learning_rate": 4.508842760923344e-06, | |
| "loss": 0.3319, | |
| "mean_token_accuracy": 0.9053416550159454, | |
| "num_tokens": 3709572.0, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 1.8531468531468531, | |
| "grad_norm": 2.141026355729347, | |
| "learning_rate": 4.4850815821837265e-06, | |
| "loss": 0.3686, | |
| "mean_token_accuracy": 0.8985418379306793, | |
| "num_tokens": 3718477.0, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 1.8578088578088578, | |
| "grad_norm": 2.064794666005344, | |
| "learning_rate": 4.4613501799565755e-06, | |
| "loss": 0.3776, | |
| "mean_token_accuracy": 0.8916601240634918, | |
| "num_tokens": 3728055.0, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 1.8624708624708624, | |
| "grad_norm": 2.3103287859074477, | |
| "learning_rate": 4.43764925049331e-06, | |
| "loss": 0.3689, | |
| "mean_token_accuracy": 0.8986863493919373, | |
| "num_tokens": 3736756.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.867132867132867, | |
| "grad_norm": 2.1582853507409765, | |
| "learning_rate": 4.413979489151326e-06, | |
| "loss": 0.3664, | |
| "mean_token_accuracy": 0.8990582525730133, | |
| "num_tokens": 3746049.0, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 1.8717948717948718, | |
| "grad_norm": 2.346758495250227, | |
| "learning_rate": 4.3903415903735725e-06, | |
| "loss": 0.3933, | |
| "mean_token_accuracy": 0.893925815820694, | |
| "num_tokens": 3755161.0, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 1.8764568764568765, | |
| "grad_norm": 2.2276831775466626, | |
| "learning_rate": 4.366736247668194e-06, | |
| "loss": 0.3578, | |
| "mean_token_accuracy": 0.9008547365665436, | |
| "num_tokens": 3763950.0, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 1.8811188811188813, | |
| "grad_norm": 2.005267203414508, | |
| "learning_rate": 4.343164153588176e-06, | |
| "loss": 0.3592, | |
| "mean_token_accuracy": 0.8986201584339142, | |
| "num_tokens": 3773509.0, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 1.8857808857808858, | |
| "grad_norm": 2.07305174671755, | |
| "learning_rate": 4.3196259997110326e-06, | |
| "loss": 0.374, | |
| "mean_token_accuracy": 0.8963338732719421, | |
| "num_tokens": 3782734.0, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 1.8904428904428905, | |
| "grad_norm": 1.8657634956210976, | |
| "learning_rate": 4.296122476618507e-06, | |
| "loss": 0.2938, | |
| "mean_token_accuracy": 0.9148504436016083, | |
| "num_tokens": 3792797.0, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 1.895104895104895, | |
| "grad_norm": 1.7854646245845165, | |
| "learning_rate": 4.2726542738763185e-06, | |
| "loss": 0.3506, | |
| "mean_token_accuracy": 0.9033430516719818, | |
| "num_tokens": 3801566.0, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 1.8997668997668997, | |
| "grad_norm": 2.1277604879204297, | |
| "learning_rate": 4.249222080013927e-06, | |
| "loss": 0.4041, | |
| "mean_token_accuracy": 0.8863241672515869, | |
| "num_tokens": 3810506.0, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 1.9044289044289044, | |
| "grad_norm": 2.203068756546148, | |
| "learning_rate": 4.2258265825043365e-06, | |
| "loss": 0.3564, | |
| "mean_token_accuracy": 0.8995753824710846, | |
| "num_tokens": 3819472.0, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 1.9090909090909092, | |
| "grad_norm": 1.9056766783093182, | |
| "learning_rate": 4.202468467743922e-06, | |
| "loss": 0.3437, | |
| "mean_token_accuracy": 0.9032955765724182, | |
| "num_tokens": 3828386.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.913752913752914, | |
| "grad_norm": 2.0477422656146715, | |
| "learning_rate": 4.1791484210322945e-06, | |
| "loss": 0.3963, | |
| "mean_token_accuracy": 0.8910067081451416, | |
| "num_tokens": 3837876.0, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 1.9184149184149184, | |
| "grad_norm": 2.003989704673306, | |
| "learning_rate": 4.15586712655219e-06, | |
| "loss": 0.3766, | |
| "mean_token_accuracy": 0.894858181476593, | |
| "num_tokens": 3847317.0, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 1.9230769230769231, | |
| "grad_norm": 2.040339338795703, | |
| "learning_rate": 4.1326252673494006e-06, | |
| "loss": 0.3611, | |
| "mean_token_accuracy": 0.9015864133834839, | |
| "num_tokens": 3856516.0, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 1.9277389277389276, | |
| "grad_norm": 1.9997790559031594, | |
| "learning_rate": 4.109423525312738e-06, | |
| "loss": 0.3524, | |
| "mean_token_accuracy": 0.9032379388809204, | |
| "num_tokens": 3866154.0, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 1.9324009324009324, | |
| "grad_norm": 1.8004052846393808, | |
| "learning_rate": 4.086262581154015e-06, | |
| "loss": 0.3496, | |
| "mean_token_accuracy": 0.9037461578845978, | |
| "num_tokens": 3875591.0, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 1.937062937062937, | |
| "grad_norm": 2.0740585166328422, | |
| "learning_rate": 4.0631431143880915e-06, | |
| "loss": 0.3397, | |
| "mean_token_accuracy": 0.9040902256965637, | |
| "num_tokens": 3885411.0, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 1.9417249417249418, | |
| "grad_norm": 2.0199474623334037, | |
| "learning_rate": 4.040065803312921e-06, | |
| "loss": 0.3619, | |
| "mean_token_accuracy": 0.8977847695350647, | |
| "num_tokens": 3893582.0, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 1.9463869463869465, | |
| "grad_norm": 1.9355091980822123, | |
| "learning_rate": 4.017031324989669e-06, | |
| "loss": 0.3382, | |
| "mean_token_accuracy": 0.9016455113887787, | |
| "num_tokens": 3903096.0, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 1.951048951048951, | |
| "grad_norm": 2.0385295249371067, | |
| "learning_rate": 3.994040355222828e-06, | |
| "loss": 0.3558, | |
| "mean_token_accuracy": 0.899684876203537, | |
| "num_tokens": 3912289.0, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 1.9557109557109555, | |
| "grad_norm": 2.1721644040047225, | |
| "learning_rate": 3.971093568540408e-06, | |
| "loss": 0.3711, | |
| "mean_token_accuracy": 0.893602579832077, | |
| "num_tokens": 3921342.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.9603729603729603, | |
| "grad_norm": 2.173567052204643, | |
| "learning_rate": 3.948191638174135e-06, | |
| "loss": 0.3468, | |
| "mean_token_accuracy": 0.9026708602905273, | |
| "num_tokens": 3930927.0, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 1.965034965034965, | |
| "grad_norm": 2.183705481115713, | |
| "learning_rate": 3.925335236039702e-06, | |
| "loss": 0.3842, | |
| "mean_token_accuracy": 0.8991383612155914, | |
| "num_tokens": 3939964.0, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 1.9696969696969697, | |
| "grad_norm": 1.8559395167069694, | |
| "learning_rate": 3.902525032717067e-06, | |
| "loss": 0.3593, | |
| "mean_token_accuracy": 0.8971259593963623, | |
| "num_tokens": 3949279.0, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 1.9743589743589745, | |
| "grad_norm": 1.9573191314090432, | |
| "learning_rate": 3.879761697430756e-06, | |
| "loss": 0.3379, | |
| "mean_token_accuracy": 0.9067609906196594, | |
| "num_tokens": 3957777.0, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 1.9790209790209792, | |
| "grad_norm": 2.0757659987817423, | |
| "learning_rate": 3.8570458980302526e-06, | |
| "loss": 0.3598, | |
| "mean_token_accuracy": 0.8982405662536621, | |
| "num_tokens": 3967234.0, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 1.9836829836829837, | |
| "grad_norm": 2.1061841538720314, | |
| "learning_rate": 3.834378300970385e-06, | |
| "loss": 0.3595, | |
| "mean_token_accuracy": 0.9008378386497498, | |
| "num_tokens": 3976342.0, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 1.9883449883449882, | |
| "grad_norm": 2.012444035670508, | |
| "learning_rate": 3.811759571291792e-06, | |
| "loss": 0.3734, | |
| "mean_token_accuracy": 0.8925457000732422, | |
| "num_tokens": 3985787.0, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 1.993006993006993, | |
| "grad_norm": 2.0766910903000557, | |
| "learning_rate": 3.789190372601387e-06, | |
| "loss": 0.3746, | |
| "mean_token_accuracy": 0.8980415463447571, | |
| "num_tokens": 3995313.0, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 1.9976689976689976, | |
| "grad_norm": 2.3515299611822744, | |
| "learning_rate": 3.7666713670529153e-06, | |
| "loss": 0.3756, | |
| "mean_token_accuracy": 0.8946518898010254, | |
| "num_tokens": 4004696.0, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 2.8965666457221966, | |
| "learning_rate": 3.7442032153275053e-06, | |
| "loss": 0.2816, | |
| "mean_token_accuracy": 0.9159619212150574, | |
| "num_tokens": 4006594.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 2.0046620046620047, | |
| "grad_norm": 1.9140541750140387, | |
| "learning_rate": 3.7217865766143014e-06, | |
| "loss": 0.2339, | |
| "mean_token_accuracy": 0.9410516619682312, | |
| "num_tokens": 4016177.0, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 2.0093240093240095, | |
| "grad_norm": 1.9667546418678483, | |
| "learning_rate": 3.6994221085911107e-06, | |
| "loss": 0.2227, | |
| "mean_token_accuracy": 0.9423635900020599, | |
| "num_tokens": 4025325.0, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 2.013986013986014, | |
| "grad_norm": 1.9133395961990365, | |
| "learning_rate": 3.6771104674051184e-06, | |
| "loss": 0.2103, | |
| "mean_token_accuracy": 0.9473540186882019, | |
| "num_tokens": 4034580.0, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 2.0186480186480185, | |
| "grad_norm": 2.010682976434144, | |
| "learning_rate": 3.654852307653628e-06, | |
| "loss": 0.2033, | |
| "mean_token_accuracy": 0.9448116421699524, | |
| "num_tokens": 4043927.0, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 2.023310023310023, | |
| "grad_norm": 2.264710867597396, | |
| "learning_rate": 3.6326482823648656e-06, | |
| "loss": 0.1945, | |
| "mean_token_accuracy": 0.9458509981632233, | |
| "num_tokens": 4052748.0, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 2.027972027972028, | |
| "grad_norm": 3.3055471689921445, | |
| "learning_rate": 3.6104990429788102e-06, | |
| "loss": 0.1809, | |
| "mean_token_accuracy": 0.9497792422771454, | |
| "num_tokens": 4062547.0, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 2.0326340326340326, | |
| "grad_norm": 3.545040453235985, | |
| "learning_rate": 3.588405239328091e-06, | |
| "loss": 0.1825, | |
| "mean_token_accuracy": 0.9520241916179657, | |
| "num_tokens": 4072151.0, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 2.0372960372960374, | |
| "grad_norm": 4.174805075882892, | |
| "learning_rate": 3.5663675196189184e-06, | |
| "loss": 0.2058, | |
| "mean_token_accuracy": 0.9453835189342499, | |
| "num_tokens": 4081740.0, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 2.041958041958042, | |
| "grad_norm": 2.8755369630384786, | |
| "learning_rate": 3.5443865304120607e-06, | |
| "loss": 0.1838, | |
| "mean_token_accuracy": 0.9492034912109375, | |
| "num_tokens": 4090794.0, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 2.046620046620047, | |
| "grad_norm": 2.7985718626593763, | |
| "learning_rate": 3.522462916603887e-06, | |
| "loss": 0.2009, | |
| "mean_token_accuracy": 0.9445820152759552, | |
| "num_tokens": 4099715.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 2.051282051282051, | |
| "grad_norm": 2.307748489794371, | |
| "learning_rate": 3.500597321407435e-06, | |
| "loss": 0.1825, | |
| "mean_token_accuracy": 0.9495159089565277, | |
| "num_tokens": 4109312.0, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 2.055944055944056, | |
| "grad_norm": 2.167026623774839, | |
| "learning_rate": 3.478790386333546e-06, | |
| "loss": 0.2046, | |
| "mean_token_accuracy": 0.9436551332473755, | |
| "num_tokens": 4118384.0, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 2.0606060606060606, | |
| "grad_norm": 2.197047391901218, | |
| "learning_rate": 3.45704275117204e-06, | |
| "loss": 0.2117, | |
| "mean_token_accuracy": 0.9415621161460876, | |
| "num_tokens": 4127809.0, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 2.0652680652680653, | |
| "grad_norm": 1.944924615630873, | |
| "learning_rate": 3.435355053972953e-06, | |
| "loss": 0.1827, | |
| "mean_token_accuracy": 0.9510793387889862, | |
| "num_tokens": 4137712.0, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 2.06993006993007, | |
| "grad_norm": 2.3278118252307975, | |
| "learning_rate": 3.4137279310278054e-06, | |
| "loss": 0.2021, | |
| "mean_token_accuracy": 0.9442847967147827, | |
| "num_tokens": 4148072.0, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 2.0745920745920747, | |
| "grad_norm": 2.094002783532905, | |
| "learning_rate": 3.392162016850945e-06, | |
| "loss": 0.1972, | |
| "mean_token_accuracy": 0.9459817409515381, | |
| "num_tokens": 4158042.0, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 2.0792540792540795, | |
| "grad_norm": 2.142035620280267, | |
| "learning_rate": 3.3706579441609224e-06, | |
| "loss": 0.1887, | |
| "mean_token_accuracy": 0.9461734592914581, | |
| "num_tokens": 4167033.0, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 2.0839160839160837, | |
| "grad_norm": 2.4332724683825884, | |
| "learning_rate": 3.3492163438619362e-06, | |
| "loss": 0.2023, | |
| "mean_token_accuracy": 0.9443434476852417, | |
| "num_tokens": 4176760.0, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 2.0885780885780885, | |
| "grad_norm": 2.228646127092191, | |
| "learning_rate": 3.327837845025315e-06, | |
| "loss": 0.1917, | |
| "mean_token_accuracy": 0.9497533142566681, | |
| "num_tokens": 4186226.0, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 2.093240093240093, | |
| "grad_norm": 2.459774250577539, | |
| "learning_rate": 3.3065230748710646e-06, | |
| "loss": 0.2124, | |
| "mean_token_accuracy": 0.9428280293941498, | |
| "num_tokens": 4195499.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 2.097902097902098, | |
| "grad_norm": 2.4038605044962686, | |
| "learning_rate": 3.2852726587494673e-06, | |
| "loss": 0.1825, | |
| "mean_token_accuracy": 0.9510272741317749, | |
| "num_tokens": 4204395.0, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 2.1025641025641026, | |
| "grad_norm": 2.4928140282417517, | |
| "learning_rate": 3.2640872201227364e-06, | |
| "loss": 0.1967, | |
| "mean_token_accuracy": 0.9442079365253448, | |
| "num_tokens": 4213737.0, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 2.1072261072261074, | |
| "grad_norm": 2.256378296799341, | |
| "learning_rate": 3.242967380546717e-06, | |
| "loss": 0.1826, | |
| "mean_token_accuracy": 0.947964072227478, | |
| "num_tokens": 4223235.0, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 2.111888111888112, | |
| "grad_norm": 2.1809559104100487, | |
| "learning_rate": 3.221913759652657e-06, | |
| "loss": 0.1738, | |
| "mean_token_accuracy": 0.9498437345027924, | |
| "num_tokens": 4232422.0, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 2.1165501165501164, | |
| "grad_norm": 2.2886117964470643, | |
| "learning_rate": 3.20092697512903e-06, | |
| "loss": 0.2023, | |
| "mean_token_accuracy": 0.9447461664676666, | |
| "num_tokens": 4241879.0, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 2.121212121212121, | |
| "grad_norm": 2.212820158000472, | |
| "learning_rate": 3.180007642703402e-06, | |
| "loss": 0.1865, | |
| "mean_token_accuracy": 0.9497852921485901, | |
| "num_tokens": 4251371.0, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 2.125874125874126, | |
| "grad_norm": 2.2290617079488553, | |
| "learning_rate": 3.1591563761243816e-06, | |
| "loss": 0.1904, | |
| "mean_token_accuracy": 0.9462641477584839, | |
| "num_tokens": 4260491.0, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 2.1305361305361306, | |
| "grad_norm": 2.143157870337979, | |
| "learning_rate": 3.138373787143598e-06, | |
| "loss": 0.1825, | |
| "mean_token_accuracy": 0.9492957592010498, | |
| "num_tokens": 4269810.0, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 2.1351981351981353, | |
| "grad_norm": 2.38741737774062, | |
| "learning_rate": 3.1176604854977665e-06, | |
| "loss": 0.1902, | |
| "mean_token_accuracy": 0.9479398429393768, | |
| "num_tokens": 4279116.0, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 2.13986013986014, | |
| "grad_norm": 2.3435622778544203, | |
| "learning_rate": 3.0970170788907878e-06, | |
| "loss": 0.2016, | |
| "mean_token_accuracy": 0.9475513100624084, | |
| "num_tokens": 4288073.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 2.1445221445221447, | |
| "grad_norm": 2.338474318951203, | |
| "learning_rate": 3.076444172975932e-06, | |
| "loss": 0.1734, | |
| "mean_token_accuracy": 0.952549546957016, | |
| "num_tokens": 4296974.0, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 2.149184149184149, | |
| "grad_norm": 2.5215652212493653, | |
| "learning_rate": 3.055942371338052e-06, | |
| "loss": 0.199, | |
| "mean_token_accuracy": 0.9409568309783936, | |
| "num_tokens": 4306323.0, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 2.1538461538461537, | |
| "grad_norm": 2.327317690460305, | |
| "learning_rate": 3.035512275475896e-06, | |
| "loss": 0.1827, | |
| "mean_token_accuracy": 0.9496185481548309, | |
| "num_tokens": 4315250.0, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 2.1585081585081585, | |
| "grad_norm": 2.210597087672196, | |
| "learning_rate": 3.0151544847844385e-06, | |
| "loss": 0.1742, | |
| "mean_token_accuracy": 0.9511962532997131, | |
| "num_tokens": 4324195.0, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 2.163170163170163, | |
| "grad_norm": 2.3071386492578045, | |
| "learning_rate": 2.994869596537316e-06, | |
| "loss": 0.1983, | |
| "mean_token_accuracy": 0.9448690712451935, | |
| "num_tokens": 4333101.0, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 2.167832167832168, | |
| "grad_norm": 2.2596793138798104, | |
| "learning_rate": 2.9746582058692803e-06, | |
| "loss": 0.1803, | |
| "mean_token_accuracy": 0.9491128325462341, | |
| "num_tokens": 4341626.0, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 2.1724941724941726, | |
| "grad_norm": 2.2305243738775964, | |
| "learning_rate": 2.954520905758762e-06, | |
| "loss": 0.1935, | |
| "mean_token_accuracy": 0.9493462145328522, | |
| "num_tokens": 4350773.0, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 2.177156177156177, | |
| "grad_norm": 1.8784409884434285, | |
| "learning_rate": 2.934458287010455e-06, | |
| "loss": 0.1608, | |
| "mean_token_accuracy": 0.9552096724510193, | |
| "num_tokens": 4360531.0, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 2.1818181818181817, | |
| "grad_norm": 2.419757102541458, | |
| "learning_rate": 2.9144709382379955e-06, | |
| "loss": 0.1929, | |
| "mean_token_accuracy": 0.9458503425121307, | |
| "num_tokens": 4370583.0, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 2.1864801864801864, | |
| "grad_norm": 2.1959363632762354, | |
| "learning_rate": 2.894559445846682e-06, | |
| "loss": 0.1891, | |
| "mean_token_accuracy": 0.9474013149738312, | |
| "num_tokens": 4380422.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 2.191142191142191, | |
| "grad_norm": 2.4899530178247877, | |
| "learning_rate": 2.8747243940162774e-06, | |
| "loss": 0.2115, | |
| "mean_token_accuracy": 0.9412341713905334, | |
| "num_tokens": 4389278.0, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 2.195804195804196, | |
| "grad_norm": 2.4399237002135146, | |
| "learning_rate": 2.854966364683872e-06, | |
| "loss": 0.1853, | |
| "mean_token_accuracy": 0.9496332406997681, | |
| "num_tokens": 4397770.0, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 2.2004662004662006, | |
| "grad_norm": 2.215224089050249, | |
| "learning_rate": 2.835285937526801e-06, | |
| "loss": 0.1783, | |
| "mean_token_accuracy": 0.9523276388645172, | |
| "num_tokens": 4407968.0, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 2.2051282051282053, | |
| "grad_norm": 2.2820905304008647, | |
| "learning_rate": 2.8156836899456475e-06, | |
| "loss": 0.1972, | |
| "mean_token_accuracy": 0.9460384547710419, | |
| "num_tokens": 4417210.0, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 2.20979020979021, | |
| "grad_norm": 2.261724927462342, | |
| "learning_rate": 2.796160197047294e-06, | |
| "loss": 0.1754, | |
| "mean_token_accuracy": 0.9524807631969452, | |
| "num_tokens": 4427199.0, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 2.2144522144522143, | |
| "grad_norm": 2.2910100819309966, | |
| "learning_rate": 2.7767160316280583e-06, | |
| "loss": 0.187, | |
| "mean_token_accuracy": 0.9480907320976257, | |
| "num_tokens": 4435729.0, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 2.219114219114219, | |
| "grad_norm": 2.5356260307985727, | |
| "learning_rate": 2.757351764156877e-06, | |
| "loss": 0.1898, | |
| "mean_token_accuracy": 0.9483682215213776, | |
| "num_tokens": 4444444.0, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 2.2237762237762237, | |
| "grad_norm": 2.185337315098767, | |
| "learning_rate": 2.7380679627585817e-06, | |
| "loss": 0.1746, | |
| "mean_token_accuracy": 0.9521575570106506, | |
| "num_tokens": 4453166.0, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 2.2284382284382285, | |
| "grad_norm": 2.379831452455389, | |
| "learning_rate": 2.718865193197218e-06, | |
| "loss": 0.1947, | |
| "mean_token_accuracy": 0.9449348151683807, | |
| "num_tokens": 4462042.0, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 2.233100233100233, | |
| "grad_norm": 2.171897352199924, | |
| "learning_rate": 2.699744018859457e-06, | |
| "loss": 0.1717, | |
| "mean_token_accuracy": 0.9506651163101196, | |
| "num_tokens": 4471400.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 2.237762237762238, | |
| "grad_norm": 2.216542641182696, | |
| "learning_rate": 2.680705000738061e-06, | |
| "loss": 0.1691, | |
| "mean_token_accuracy": 0.9513165950775146, | |
| "num_tokens": 4481107.0, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 2.242424242424242, | |
| "grad_norm": 2.1820463594567134, | |
| "learning_rate": 2.661748697415423e-06, | |
| "loss": 0.1753, | |
| "mean_token_accuracy": 0.94921013712883, | |
| "num_tokens": 4490865.0, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 2.247086247086247, | |
| "grad_norm": 2.3975273161711868, | |
| "learning_rate": 2.642875665047182e-06, | |
| "loss": 0.2074, | |
| "mean_token_accuracy": 0.9420484900474548, | |
| "num_tokens": 4500700.0, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 2.2517482517482517, | |
| "grad_norm": 2.3576310035425885, | |
| "learning_rate": 2.6240864573459095e-06, | |
| "loss": 0.1933, | |
| "mean_token_accuracy": 0.948281466960907, | |
| "num_tokens": 4509781.0, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 2.2564102564102564, | |
| "grad_norm": 2.1072685233889783, | |
| "learning_rate": 2.6053816255648543e-06, | |
| "loss": 0.1791, | |
| "mean_token_accuracy": 0.9498388171195984, | |
| "num_tokens": 4519055.0, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 2.261072261072261, | |
| "grad_norm": 2.3694467761113365, | |
| "learning_rate": 2.586761718481776e-06, | |
| "loss": 0.2016, | |
| "mean_token_accuracy": 0.9420836865901947, | |
| "num_tokens": 4528368.0, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 2.265734265734266, | |
| "grad_norm": 2.354865943769181, | |
| "learning_rate": 2.5682272823828474e-06, | |
| "loss": 0.195, | |
| "mean_token_accuracy": 0.9475519955158234, | |
| "num_tokens": 4537216.0, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 2.2703962703962706, | |
| "grad_norm": 2.0798315233476146, | |
| "learning_rate": 2.5497788610466177e-06, | |
| "loss": 0.1775, | |
| "mean_token_accuracy": 0.9497499167919159, | |
| "num_tokens": 4546595.0, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 2.2750582750582753, | |
| "grad_norm": 2.245906905354928, | |
| "learning_rate": 2.53141699572807e-06, | |
| "loss": 0.1873, | |
| "mean_token_accuracy": 0.9471332430839539, | |
| "num_tokens": 4555647.0, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 2.2797202797202796, | |
| "grad_norm": 2.3166573667243973, | |
| "learning_rate": 2.5131422251427313e-06, | |
| "loss": 0.179, | |
| "mean_token_accuracy": 0.9487544298171997, | |
| "num_tokens": 4565487.0, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 2.2843822843822843, | |
| "grad_norm": 2.356746974573123, | |
| "learning_rate": 2.4949550854508713e-06, | |
| "loss": 0.1977, | |
| "mean_token_accuracy": 0.9462016522884369, | |
| "num_tokens": 4574193.0, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 2.289044289044289, | |
| "grad_norm": 2.3601704690415213, | |
| "learning_rate": 2.476856110241773e-06, | |
| "loss": 0.1963, | |
| "mean_token_accuracy": 0.9484397768974304, | |
| "num_tokens": 4583652.0, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 2.2937062937062938, | |
| "grad_norm": 2.194739252262633, | |
| "learning_rate": 2.458845830518082e-06, | |
| "loss": 0.1808, | |
| "mean_token_accuracy": 0.948834627866745, | |
| "num_tokens": 4593314.0, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 2.2983682983682985, | |
| "grad_norm": 2.262428951601218, | |
| "learning_rate": 2.440924774680215e-06, | |
| "loss": 0.196, | |
| "mean_token_accuracy": 0.9466191530227661, | |
| "num_tokens": 4602528.0, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 2.303030303030303, | |
| "grad_norm": 2.2737186057132246, | |
| "learning_rate": 2.4230934685108707e-06, | |
| "loss": 0.1959, | |
| "mean_token_accuracy": 0.948832631111145, | |
| "num_tokens": 4611387.0, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 2.3076923076923075, | |
| "grad_norm": 2.19558751291413, | |
| "learning_rate": 2.405352435159595e-06, | |
| "loss": 0.1832, | |
| "mean_token_accuracy": 0.9490224421024323, | |
| "num_tokens": 4620809.0, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 2.312354312354312, | |
| "grad_norm": 2.222802090174067, | |
| "learning_rate": 2.3877021951274374e-06, | |
| "loss": 0.1911, | |
| "mean_token_accuracy": 0.9486294388771057, | |
| "num_tokens": 4629397.0, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 2.317016317016317, | |
| "grad_norm": 2.206389027273452, | |
| "learning_rate": 2.3701432662516772e-06, | |
| "loss": 0.1727, | |
| "mean_token_accuracy": 0.9500547051429749, | |
| "num_tokens": 4638953.0, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 2.3216783216783217, | |
| "grad_norm": 2.5956601776793056, | |
| "learning_rate": 2.3526761636906313e-06, | |
| "loss": 0.1849, | |
| "mean_token_accuracy": 0.9497621655464172, | |
| "num_tokens": 4648328.0, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 2.3263403263403264, | |
| "grad_norm": 2.222447522693371, | |
| "learning_rate": 2.3353013999085402e-06, | |
| "loss": 0.1878, | |
| "mean_token_accuracy": 0.9493353068828583, | |
| "num_tokens": 4658253.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.331002331002331, | |
| "grad_norm": 2.1725903860996265, | |
| "learning_rate": 2.3180194846605367e-06, | |
| "loss": 0.1731, | |
| "mean_token_accuracy": 0.9540884494781494, | |
| "num_tokens": 4667108.0, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 2.335664335664336, | |
| "grad_norm": 2.2106617960868546, | |
| "learning_rate": 2.300830924977683e-06, | |
| "loss": 0.1827, | |
| "mean_token_accuracy": 0.9487708210945129, | |
| "num_tokens": 4675537.0, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 2.3403263403263406, | |
| "grad_norm": 2.26357210261458, | |
| "learning_rate": 2.283736225152099e-06, | |
| "loss": 0.1781, | |
| "mean_token_accuracy": 0.9498123228549957, | |
| "num_tokens": 4684963.0, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 2.344988344988345, | |
| "grad_norm": 2.2970053926748366, | |
| "learning_rate": 2.26673588672217e-06, | |
| "loss": 0.1947, | |
| "mean_token_accuracy": 0.9470812678337097, | |
| "num_tokens": 4694167.0, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 2.3496503496503496, | |
| "grad_norm": 2.180761219218081, | |
| "learning_rate": 2.249830408457826e-06, | |
| "loss": 0.1734, | |
| "mean_token_accuracy": 0.9532337486743927, | |
| "num_tokens": 4704455.0, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 2.3543123543123543, | |
| "grad_norm": 2.1592931519499854, | |
| "learning_rate": 2.2330202863459123e-06, | |
| "loss": 0.18, | |
| "mean_token_accuracy": 0.9502027928829193, | |
| "num_tokens": 4714149.0, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 2.358974358974359, | |
| "grad_norm": 2.3049252234685382, | |
| "learning_rate": 2.2163060135756364e-06, | |
| "loss": 0.193, | |
| "mean_token_accuracy": 0.9451474845409393, | |
| "num_tokens": 4724018.0, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 2.3636363636363638, | |
| "grad_norm": 2.047360241682356, | |
| "learning_rate": 2.1996880805241e-06, | |
| "loss": 0.1603, | |
| "mean_token_accuracy": 0.9543373584747314, | |
| "num_tokens": 4733728.0, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 2.3682983682983685, | |
| "grad_norm": 2.243883991122542, | |
| "learning_rate": 2.1831669747419093e-06, | |
| "loss": 0.1895, | |
| "mean_token_accuracy": 0.9504224061965942, | |
| "num_tokens": 4743424.0, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 2.3729603729603728, | |
| "grad_norm": 2.0125158383022383, | |
| "learning_rate": 2.166743180938875e-06, | |
| "loss": 0.1592, | |
| "mean_token_accuracy": 0.9573898613452911, | |
| "num_tokens": 4753172.0, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 2.3776223776223775, | |
| "grad_norm": 1.9899532151234143, | |
| "learning_rate": 2.150417180969784e-06, | |
| "loss": 0.1689, | |
| "mean_token_accuracy": 0.9538862705230713, | |
| "num_tokens": 4763449.0, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 2.382284382284382, | |
| "grad_norm": 2.1855347374072602, | |
| "learning_rate": 2.1341894538202726e-06, | |
| "loss": 0.1933, | |
| "mean_token_accuracy": 0.9471964538097382, | |
| "num_tokens": 4773131.0, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 2.386946386946387, | |
| "grad_norm": 2.1191232295552638, | |
| "learning_rate": 2.1180604755927646e-06, | |
| "loss": 0.1705, | |
| "mean_token_accuracy": 0.9518732130527496, | |
| "num_tokens": 4782766.0, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 2.3916083916083917, | |
| "grad_norm": 2.1956924392039268, | |
| "learning_rate": 2.102030719492508e-06, | |
| "loss": 0.1917, | |
| "mean_token_accuracy": 0.9490616321563721, | |
| "num_tokens": 4792428.0, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 2.3962703962703964, | |
| "grad_norm": 2.2810533470677705, | |
| "learning_rate": 2.086100655813688e-06, | |
| "loss": 0.1862, | |
| "mean_token_accuracy": 0.9471859931945801, | |
| "num_tokens": 4801662.0, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 2.400932400932401, | |
| "grad_norm": 2.244633860144498, | |
| "learning_rate": 2.0702707519256365e-06, | |
| "loss": 0.18, | |
| "mean_token_accuracy": 0.9517810344696045, | |
| "num_tokens": 4810704.0, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 2.4055944055944054, | |
| "grad_norm": 2.494051955620123, | |
| "learning_rate": 2.0545414722591096e-06, | |
| "loss": 0.1864, | |
| "mean_token_accuracy": 0.9491457939147949, | |
| "num_tokens": 4819473.0, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 2.41025641025641, | |
| "grad_norm": 2.0714876270735867, | |
| "learning_rate": 2.03891327829267e-06, | |
| "loss": 0.1611, | |
| "mean_token_accuracy": 0.9556960761547089, | |
| "num_tokens": 4829237.0, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 2.414918414918415, | |
| "grad_norm": 2.1368173991326387, | |
| "learning_rate": 2.0233866285391455e-06, | |
| "loss": 0.175, | |
| "mean_token_accuracy": 0.9533757269382477, | |
| "num_tokens": 4838770.0, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 2.4195804195804196, | |
| "grad_norm": 2.4559899729586157, | |
| "learning_rate": 2.0079619785321713e-06, | |
| "loss": 0.178, | |
| "mean_token_accuracy": 0.9503377377986908, | |
| "num_tokens": 4848313.0, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 2.4242424242424243, | |
| "grad_norm": 2.408883608780403, | |
| "learning_rate": 1.992639780812838e-06, | |
| "loss": 0.1797, | |
| "mean_token_accuracy": 0.9508785903453827, | |
| "num_tokens": 4856905.0, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 2.428904428904429, | |
| "grad_norm": 2.4178249725991776, | |
| "learning_rate": 1.9774204849164004e-06, | |
| "loss": 0.1909, | |
| "mean_token_accuracy": 0.9491060078144073, | |
| "num_tokens": 4866602.0, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 2.4335664335664333, | |
| "grad_norm": 2.1757038919566085, | |
| "learning_rate": 1.9623045373590955e-06, | |
| "loss": 0.1734, | |
| "mean_token_accuracy": 0.9516046643257141, | |
| "num_tokens": 4875823.0, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 2.438228438228438, | |
| "grad_norm": 2.27555860775744, | |
| "learning_rate": 1.9472923816250427e-06, | |
| "loss": 0.1869, | |
| "mean_token_accuracy": 0.9469164311885834, | |
| "num_tokens": 4885891.0, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 2.4428904428904428, | |
| "grad_norm": 2.3720687067635517, | |
| "learning_rate": 1.9323844581532334e-06, | |
| "loss": 0.1901, | |
| "mean_token_accuracy": 0.9472060799598694, | |
| "num_tokens": 4895721.0, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 2.4475524475524475, | |
| "grad_norm": 2.3467414380479483, | |
| "learning_rate": 1.9175812043246034e-06, | |
| "loss": 0.1904, | |
| "mean_token_accuracy": 0.9455529749393463, | |
| "num_tokens": 4905678.0, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 2.4522144522144522, | |
| "grad_norm": 2.1506573419595045, | |
| "learning_rate": 1.9028830544492074e-06, | |
| "loss": 0.1903, | |
| "mean_token_accuracy": 0.948146402835846, | |
| "num_tokens": 4915087.0, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 2.456876456876457, | |
| "grad_norm": 2.379645454246707, | |
| "learning_rate": 1.8882904397534705e-06, | |
| "loss": 0.2152, | |
| "mean_token_accuracy": 0.9423483908176422, | |
| "num_tokens": 4924020.0, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 2.4615384615384617, | |
| "grad_norm": 2.264126564788235, | |
| "learning_rate": 1.8738037883675445e-06, | |
| "loss": 0.1844, | |
| "mean_token_accuracy": 0.9492262303829193, | |
| "num_tokens": 4933023.0, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 2.4662004662004664, | |
| "grad_norm": 2.190357275615204, | |
| "learning_rate": 1.8594235253127373e-06, | |
| "loss": 0.1996, | |
| "mean_token_accuracy": 0.9468533098697662, | |
| "num_tokens": 4941720.0, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 2.4708624708624707, | |
| "grad_norm": 2.0683527870115483, | |
| "learning_rate": 1.8451500724890509e-06, | |
| "loss": 0.1852, | |
| "mean_token_accuracy": 0.9456667006015778, | |
| "num_tokens": 4951562.0, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 2.4755244755244754, | |
| "grad_norm": 2.3584045584131617, | |
| "learning_rate": 1.8309838486627995e-06, | |
| "loss": 0.2036, | |
| "mean_token_accuracy": 0.94382044672966, | |
| "num_tokens": 4960478.0, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 2.48018648018648, | |
| "grad_norm": 2.1234920339277044, | |
| "learning_rate": 1.816925269454327e-06, | |
| "loss": 0.1666, | |
| "mean_token_accuracy": 0.9552336037158966, | |
| "num_tokens": 4970160.0, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 2.484848484848485, | |
| "grad_norm": 2.3345270471564303, | |
| "learning_rate": 1.8029747473258092e-06, | |
| "loss": 0.1859, | |
| "mean_token_accuracy": 0.9483939707279205, | |
| "num_tokens": 4979864.0, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 2.4895104895104896, | |
| "grad_norm": 2.407762000225371, | |
| "learning_rate": 1.789132691569153e-06, | |
| "loss": 0.195, | |
| "mean_token_accuracy": 0.9456151127815247, | |
| "num_tokens": 4988667.0, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 2.4941724941724943, | |
| "grad_norm": 2.347393889553957, | |
| "learning_rate": 1.7753995082939932e-06, | |
| "loss": 0.1732, | |
| "mean_token_accuracy": 0.9526363909244537, | |
| "num_tokens": 4998687.0, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 2.4988344988344986, | |
| "grad_norm": 2.5390123853320445, | |
| "learning_rate": 1.7617756004157693e-06, | |
| "loss": 0.1987, | |
| "mean_token_accuracy": 0.945299506187439, | |
| "num_tokens": 5007734.0, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 2.5034965034965033, | |
| "grad_norm": 2.438556510312228, | |
| "learning_rate": 1.7482613676439153e-06, | |
| "loss": 0.1943, | |
| "mean_token_accuracy": 0.9447510838508606, | |
| "num_tokens": 5016545.0, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 2.508158508158508, | |
| "grad_norm": 2.3154480135315194, | |
| "learning_rate": 1.7348572064701188e-06, | |
| "loss": 0.1947, | |
| "mean_token_accuracy": 0.9469634890556335, | |
| "num_tokens": 5025652.0, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 2.5128205128205128, | |
| "grad_norm": 2.149785741270873, | |
| "learning_rate": 1.721563510156704e-06, | |
| "loss": 0.1718, | |
| "mean_token_accuracy": 0.9520467817783356, | |
| "num_tokens": 5035567.0, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 2.5174825174825175, | |
| "grad_norm": 2.4390965002580463, | |
| "learning_rate": 1.7083806687250795e-06, | |
| "loss": 0.1999, | |
| "mean_token_accuracy": 0.947026789188385, | |
| "num_tokens": 5044442.0, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 2.5221445221445222, | |
| "grad_norm": 2.294881333986716, | |
| "learning_rate": 1.6953090689443074e-06, | |
| "loss": 0.1868, | |
| "mean_token_accuracy": 0.9499901831150055, | |
| "num_tokens": 5053663.0, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 2.526806526806527, | |
| "grad_norm": 2.2057605088088197, | |
| "learning_rate": 1.6823490943197473e-06, | |
| "loss": 0.1719, | |
| "mean_token_accuracy": 0.9531058371067047, | |
| "num_tokens": 5063344.0, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 2.5314685314685317, | |
| "grad_norm": 2.4507085825396007, | |
| "learning_rate": 1.6695011250818094e-06, | |
| "loss": 0.1717, | |
| "mean_token_accuracy": 0.9547788798809052, | |
| "num_tokens": 5073038.0, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 2.5361305361305364, | |
| "grad_norm": 2.3638435763054493, | |
| "learning_rate": 1.6567655381747976e-06, | |
| "loss": 0.1927, | |
| "mean_token_accuracy": 0.9466958940029144, | |
| "num_tokens": 5081603.0, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 2.5407925407925407, | |
| "grad_norm": 2.32752114137372, | |
| "learning_rate": 1.6441427072458493e-06, | |
| "loss": 0.1958, | |
| "mean_token_accuracy": 0.9444697499275208, | |
| "num_tokens": 5091091.0, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 2.5454545454545454, | |
| "grad_norm": 2.509857709791671, | |
| "learning_rate": 1.6316330026339743e-06, | |
| "loss": 0.213, | |
| "mean_token_accuracy": 0.941786378622055, | |
| "num_tokens": 5100278.0, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 2.55011655011655, | |
| "grad_norm": 2.0952838106121914, | |
| "learning_rate": 1.6192367913591916e-06, | |
| "loss": 0.1653, | |
| "mean_token_accuracy": 0.9539439380168915, | |
| "num_tokens": 5110291.0, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 2.554778554778555, | |
| "grad_norm": 2.4251986988481704, | |
| "learning_rate": 1.6069544371117556e-06, | |
| "loss": 0.1737, | |
| "mean_token_accuracy": 0.9515743851661682, | |
| "num_tokens": 5119413.0, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 2.5594405594405596, | |
| "grad_norm": 2.2675536817510276, | |
| "learning_rate": 1.5947863002414938e-06, | |
| "loss": 0.1816, | |
| "mean_token_accuracy": 0.9483968019485474, | |
| "num_tokens": 5128968.0, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 2.564102564102564, | |
| "grad_norm": 2.225047159227379, | |
| "learning_rate": 1.5827327377472262e-06, | |
| "loss": 0.1872, | |
| "mean_token_accuracy": 0.9502107799053192, | |
| "num_tokens": 5138040.0, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 2.5687645687645686, | |
| "grad_norm": 2.5646265404291033, | |
| "learning_rate": 1.5707941032662967e-06, | |
| "loss": 0.1941, | |
| "mean_token_accuracy": 0.9476030170917511, | |
| "num_tokens": 5147278.0, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 2.5734265734265733, | |
| "grad_norm": 2.512848592730855, | |
| "learning_rate": 1.558970747064198e-06, | |
| "loss": 0.184, | |
| "mean_token_accuracy": 0.9474705457687378, | |
| "num_tokens": 5156257.0, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 2.578088578088578, | |
| "grad_norm": 2.092641334488186, | |
| "learning_rate": 1.5472630160242921e-06, | |
| "loss": 0.1692, | |
| "mean_token_accuracy": 0.9498989582061768, | |
| "num_tokens": 5166290.0, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 2.582750582750583, | |
| "grad_norm": 2.232715743448255, | |
| "learning_rate": 1.5356712536376345e-06, | |
| "loss": 0.1803, | |
| "mean_token_accuracy": 0.9500272274017334, | |
| "num_tokens": 5176118.0, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 2.5874125874125875, | |
| "grad_norm": 2.5230454773039828, | |
| "learning_rate": 1.5241957999928974e-06, | |
| "loss": 0.1689, | |
| "mean_token_accuracy": 0.952000617980957, | |
| "num_tokens": 5185921.0, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 2.5920745920745922, | |
| "grad_norm": 2.6026889894908978, | |
| "learning_rate": 1.5128369917663924e-06, | |
| "loss": 0.1873, | |
| "mean_token_accuracy": 0.9518662393093109, | |
| "num_tokens": 5194883.0, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 2.596736596736597, | |
| "grad_norm": 2.4331824963283375, | |
| "learning_rate": 1.5015951622121896e-06, | |
| "loss": 0.1782, | |
| "mean_token_accuracy": 0.9511874914169312, | |
| "num_tokens": 5203915.0, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 2.6013986013986012, | |
| "grad_norm": 2.3494840072703598, | |
| "learning_rate": 1.490470641152345e-06, | |
| "loss": 0.1832, | |
| "mean_token_accuracy": 0.9490853250026703, | |
| "num_tokens": 5212979.0, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 2.606060606060606, | |
| "grad_norm": 2.37225992368595, | |
| "learning_rate": 1.4794637549672182e-06, | |
| "loss": 0.2049, | |
| "mean_token_accuracy": 0.9459311068058014, | |
| "num_tokens": 5222060.0, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 2.6107226107226107, | |
| "grad_norm": 2.2581667401414354, | |
| "learning_rate": 1.4685748265859043e-06, | |
| "loss": 0.1853, | |
| "mean_token_accuracy": 0.949056476354599, | |
| "num_tokens": 5230955.0, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 2.6153846153846154, | |
| "grad_norm": 2.049312050186223, | |
| "learning_rate": 1.457804175476751e-06, | |
| "loss": 0.1627, | |
| "mean_token_accuracy": 0.9556883871555328, | |
| "num_tokens": 5241027.0, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 2.62004662004662, | |
| "grad_norm": 2.2972322506270095, | |
| "learning_rate": 1.447152117637992e-06, | |
| "loss": 0.185, | |
| "mean_token_accuracy": 0.9484710693359375, | |
| "num_tokens": 5250218.0, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 2.624708624708625, | |
| "grad_norm": 2.1577151240414443, | |
| "learning_rate": 1.436618965588472e-06, | |
| "loss": 0.1715, | |
| "mean_token_accuracy": 0.9517091810703278, | |
| "num_tokens": 5259812.0, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 2.629370629370629, | |
| "grad_norm": 2.2654942496472836, | |
| "learning_rate": 1.4262050283584836e-06, | |
| "loss": 0.1708, | |
| "mean_token_accuracy": 0.952095627784729, | |
| "num_tokens": 5268750.0, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 2.634032634032634, | |
| "grad_norm": 2.0539513008080132, | |
| "learning_rate": 1.4159106114806943e-06, | |
| "loss": 0.1703, | |
| "mean_token_accuracy": 0.9545489847660065, | |
| "num_tokens": 5277570.0, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 2.6386946386946386, | |
| "grad_norm": 2.2352237949709814, | |
| "learning_rate": 1.4057360169811832e-06, | |
| "loss": 0.1856, | |
| "mean_token_accuracy": 0.9448749423027039, | |
| "num_tokens": 5287563.0, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 2.6433566433566433, | |
| "grad_norm": 2.274026900774616, | |
| "learning_rate": 1.3956815433705861e-06, | |
| "loss": 0.1854, | |
| "mean_token_accuracy": 0.9487999975681305, | |
| "num_tokens": 5296479.0, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 2.648018648018648, | |
| "grad_norm": 2.271359052605945, | |
| "learning_rate": 1.3857474856353299e-06, | |
| "loss": 0.1895, | |
| "mean_token_accuracy": 0.9485887885093689, | |
| "num_tokens": 5305636.0, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 2.652680652680653, | |
| "grad_norm": 2.2832232390511935, | |
| "learning_rate": 1.3759341352289832e-06, | |
| "loss": 0.1919, | |
| "mean_token_accuracy": 0.9442925155162811, | |
| "num_tokens": 5315157.0, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 2.6573426573426575, | |
| "grad_norm": 2.4224807634049874, | |
| "learning_rate": 1.3662417800637023e-06, | |
| "loss": 0.196, | |
| "mean_token_accuracy": 0.9467974901199341, | |
| "num_tokens": 5324117.0, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 2.6620046620046622, | |
| "grad_norm": 2.383821092419705, | |
| "learning_rate": 1.3566707045017867e-06, | |
| "loss": 0.181, | |
| "mean_token_accuracy": 0.9486918449401855, | |
| "num_tokens": 5332491.0, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 2.6666666666666665, | |
| "grad_norm": 2.451846629070992, | |
| "learning_rate": 1.3472211893473327e-06, | |
| "loss": 0.183, | |
| "mean_token_accuracy": 0.9497961103916168, | |
| "num_tokens": 5341476.0, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 2.6713286713286712, | |
| "grad_norm": 2.2050228122309297, | |
| "learning_rate": 1.3378935118380004e-06, | |
| "loss": 0.1664, | |
| "mean_token_accuracy": 0.9522497951984406, | |
| "num_tokens": 5351031.0, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 2.675990675990676, | |
| "grad_norm": 2.221685948440498, | |
| "learning_rate": 1.3286879456368746e-06, | |
| "loss": 0.1868, | |
| "mean_token_accuracy": 0.9497689306735992, | |
| "num_tokens": 5360953.0, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 2.6806526806526807, | |
| "grad_norm": 2.3611824277077895, | |
| "learning_rate": 1.319604760824439e-06, | |
| "loss": 0.1885, | |
| "mean_token_accuracy": 0.9504337906837463, | |
| "num_tokens": 5370450.0, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 2.6853146853146854, | |
| "grad_norm": 2.524561240638075, | |
| "learning_rate": 1.31064422389065e-06, | |
| "loss": 0.1852, | |
| "mean_token_accuracy": 0.947800487279892, | |
| "num_tokens": 5379656.0, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 2.6899766899766897, | |
| "grad_norm": 2.4377644762322848, | |
| "learning_rate": 1.3018065977271215e-06, | |
| "loss": 0.2048, | |
| "mean_token_accuracy": 0.9430462419986725, | |
| "num_tokens": 5388537.0, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 2.6946386946386944, | |
| "grad_norm": 2.399526282941045, | |
| "learning_rate": 1.293092141619407e-06, | |
| "loss": 0.1904, | |
| "mean_token_accuracy": 0.9497084021568298, | |
| "num_tokens": 5398026.0, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 2.699300699300699, | |
| "grad_norm": 2.461708260111587, | |
| "learning_rate": 1.2845011112394e-06, | |
| "loss": 0.1979, | |
| "mean_token_accuracy": 0.9429396092891693, | |
| "num_tokens": 5407274.0, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 2.703962703962704, | |
| "grad_norm": 2.1140139302951506, | |
| "learning_rate": 1.276033758637823e-06, | |
| "loss": 0.1711, | |
| "mean_token_accuracy": 0.954795241355896, | |
| "num_tokens": 5416685.0, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 2.7086247086247086, | |
| "grad_norm": 2.256627061683475, | |
| "learning_rate": 1.2676903322368423e-06, | |
| "loss": 0.1914, | |
| "mean_token_accuracy": 0.9461691081523895, | |
| "num_tokens": 5426216.0, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 2.7132867132867133, | |
| "grad_norm": 2.224029918327043, | |
| "learning_rate": 1.2594710768227734e-06, | |
| "loss": 0.1902, | |
| "mean_token_accuracy": 0.9479357004165649, | |
| "num_tokens": 5435218.0, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 2.717948717948718, | |
| "grad_norm": 2.3441213495740616, | |
| "learning_rate": 1.2513762335389004e-06, | |
| "loss": 0.1952, | |
| "mean_token_accuracy": 0.9469590187072754, | |
| "num_tokens": 5444848.0, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 2.722610722610723, | |
| "grad_norm": 2.2818966905915525, | |
| "learning_rate": 1.2434060398784039e-06, | |
| "loss": 0.1861, | |
| "mean_token_accuracy": 0.9511770606040955, | |
| "num_tokens": 5453935.0, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 2.7272727272727275, | |
| "grad_norm": 2.527822525615972, | |
| "learning_rate": 1.2355607296773896e-06, | |
| "loss": 0.1826, | |
| "mean_token_accuracy": 0.9509838223457336, | |
| "num_tokens": 5463825.0, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 2.731934731934732, | |
| "grad_norm": 2.3014422706991193, | |
| "learning_rate": 1.2278405331080296e-06, | |
| "loss": 0.2017, | |
| "mean_token_accuracy": 0.9433080554008484, | |
| "num_tokens": 5472633.0, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 2.7365967365967365, | |
| "grad_norm": 2.481469694015223, | |
| "learning_rate": 1.2202456766718092e-06, | |
| "loss": 0.1929, | |
| "mean_token_accuracy": 0.9468889534473419, | |
| "num_tokens": 5481508.0, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 2.7412587412587412, | |
| "grad_norm": 2.2243902331538803, | |
| "learning_rate": 1.212776383192883e-06, | |
| "loss": 0.1921, | |
| "mean_token_accuracy": 0.9495699405670166, | |
| "num_tokens": 5490583.0, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 2.745920745920746, | |
| "grad_norm": 2.079107821673962, | |
| "learning_rate": 1.2054328718115336e-06, | |
| "loss": 0.171, | |
| "mean_token_accuracy": 0.9525066316127777, | |
| "num_tokens": 5500701.0, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 2.7505827505827507, | |
| "grad_norm": 2.612790704517144, | |
| "learning_rate": 1.1982153579777483e-06, | |
| "loss": 0.1945, | |
| "mean_token_accuracy": 0.9466931521892548, | |
| "num_tokens": 5509719.0, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 2.755244755244755, | |
| "grad_norm": 2.279498702941535, | |
| "learning_rate": 1.1911240534448899e-06, | |
| "loss": 0.2023, | |
| "mean_token_accuracy": 0.9443814754486084, | |
| "num_tokens": 5518911.0, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 2.7599067599067597, | |
| "grad_norm": 2.302875853828702, | |
| "learning_rate": 1.1841591662634943e-06, | |
| "loss": 0.1782, | |
| "mean_token_accuracy": 0.9504655301570892, | |
| "num_tokens": 5528230.0, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 2.7645687645687644, | |
| "grad_norm": 2.391231643668634, | |
| "learning_rate": 1.1773209007751562e-06, | |
| "loss": 0.1973, | |
| "mean_token_accuracy": 0.9472830295562744, | |
| "num_tokens": 5537899.0, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 2.769230769230769, | |
| "grad_norm": 2.195727031813818, | |
| "learning_rate": 1.1706094576065416e-06, | |
| "loss": 0.1797, | |
| "mean_token_accuracy": 0.9503377079963684, | |
| "num_tokens": 5547675.0, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 2.773892773892774, | |
| "grad_norm": 2.3904334187953777, | |
| "learning_rate": 1.164025033663497e-06, | |
| "loss": 0.2021, | |
| "mean_token_accuracy": 0.9435946643352509, | |
| "num_tokens": 5557022.0, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 2.7785547785547786, | |
| "grad_norm": 2.2336433005731924, | |
| "learning_rate": 1.1575678221252763e-06, | |
| "loss": 0.178, | |
| "mean_token_accuracy": 0.9506051242351532, | |
| "num_tokens": 5566788.0, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 2.7832167832167833, | |
| "grad_norm": 2.45559515222341, | |
| "learning_rate": 1.1512380124388695e-06, | |
| "loss": 0.1885, | |
| "mean_token_accuracy": 0.948212593793869, | |
| "num_tokens": 5575464.0, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 2.787878787878788, | |
| "grad_norm": 2.3652897419571213, | |
| "learning_rate": 1.1450357903134463e-06, | |
| "loss": 0.1838, | |
| "mean_token_accuracy": 0.9474283754825592, | |
| "num_tokens": 5584904.0, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 2.792540792540793, | |
| "grad_norm": 2.3554447950041304, | |
| "learning_rate": 1.1389613377149086e-06, | |
| "loss": 0.1903, | |
| "mean_token_accuracy": 0.9471111297607422, | |
| "num_tokens": 5594389.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 2.797202797202797, | |
| "grad_norm": 2.2047518432891633, | |
| "learning_rate": 1.1330148328605484e-06, | |
| "loss": 0.1763, | |
| "mean_token_accuracy": 0.951262503862381, | |
| "num_tokens": 5603819.0, | |
| "step": 601 | |
| }, | |
| { | |
| "epoch": 2.801864801864802, | |
| "grad_norm": 2.291063530837646, | |
| "learning_rate": 1.127196450213825e-06, | |
| "loss": 0.1773, | |
| "mean_token_accuracy": 0.9493178725242615, | |
| "num_tokens": 5614459.0, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 2.8065268065268065, | |
| "grad_norm": 2.11284252635657, | |
| "learning_rate": 1.1215063604792396e-06, | |
| "loss": 0.1694, | |
| "mean_token_accuracy": 0.9510103464126587, | |
| "num_tokens": 5623744.0, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 2.8111888111888113, | |
| "grad_norm": 2.1857533437327925, | |
| "learning_rate": 1.1159447305973313e-06, | |
| "loss": 0.1835, | |
| "mean_token_accuracy": 0.9497886300086975, | |
| "num_tokens": 5632743.0, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 2.815850815850816, | |
| "grad_norm": 2.3920794052563665, | |
| "learning_rate": 1.1105117237397777e-06, | |
| "loss": 0.1772, | |
| "mean_token_accuracy": 0.95287024974823, | |
| "num_tokens": 5641669.0, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 2.8205128205128203, | |
| "grad_norm": 2.3412796580645745, | |
| "learning_rate": 1.1052074993046102e-06, | |
| "loss": 0.1808, | |
| "mean_token_accuracy": 0.9482883512973785, | |
| "num_tokens": 5650735.0, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 2.825174825174825, | |
| "grad_norm": 2.3440493241719147, | |
| "learning_rate": 1.100032212911533e-06, | |
| "loss": 0.2039, | |
| "mean_token_accuracy": 0.9425942301750183, | |
| "num_tokens": 5659501.0, | |
| "step": 607 | |
| }, | |
| { | |
| "epoch": 2.8298368298368297, | |
| "grad_norm": 2.0320304598963213, | |
| "learning_rate": 1.0949860163973616e-06, | |
| "loss": 0.1738, | |
| "mean_token_accuracy": 0.9530138373374939, | |
| "num_tokens": 5669759.0, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 2.8344988344988344, | |
| "grad_norm": 2.325052766726883, | |
| "learning_rate": 1.0900690578115643e-06, | |
| "loss": 0.1905, | |
| "mean_token_accuracy": 0.9488844573497772, | |
| "num_tokens": 5678984.0, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 2.839160839160839, | |
| "grad_norm": 2.4642634644073396, | |
| "learning_rate": 1.0852814814119238e-06, | |
| "loss": 0.2002, | |
| "mean_token_accuracy": 0.9488008916378021, | |
| "num_tokens": 5688220.0, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 2.843822843822844, | |
| "grad_norm": 2.345629463948856, | |
| "learning_rate": 1.0806234276602984e-06, | |
| "loss": 0.1949, | |
| "mean_token_accuracy": 0.9474165737628937, | |
| "num_tokens": 5697038.0, | |
| "step": 611 | |
| }, | |
| { | |
| "epoch": 2.8484848484848486, | |
| "grad_norm": 2.0761774146720398, | |
| "learning_rate": 1.0760950332185055e-06, | |
| "loss": 0.1623, | |
| "mean_token_accuracy": 0.9569342732429504, | |
| "num_tokens": 5706946.0, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 2.8531468531468533, | |
| "grad_norm": 2.1184654015758357, | |
| "learning_rate": 1.071696430944311e-06, | |
| "loss": 0.1657, | |
| "mean_token_accuracy": 0.9557340145111084, | |
| "num_tokens": 5716519.0, | |
| "step": 613 | |
| }, | |
| { | |
| "epoch": 2.857808857808858, | |
| "grad_norm": 2.7525481756348427, | |
| "learning_rate": 1.0674277498875325e-06, | |
| "loss": 0.192, | |
| "mean_token_accuracy": 0.9446141123771667, | |
| "num_tokens": 5725936.0, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 2.8624708624708624, | |
| "grad_norm": 2.415698399491206, | |
| "learning_rate": 1.0632891152862493e-06, | |
| "loss": 0.1881, | |
| "mean_token_accuracy": 0.9490151107311249, | |
| "num_tokens": 5735600.0, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 2.867132867132867, | |
| "grad_norm": 2.0904164041799644, | |
| "learning_rate": 1.0592806485631326e-06, | |
| "loss": 0.1651, | |
| "mean_token_accuracy": 0.9557408690452576, | |
| "num_tokens": 5745302.0, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 2.871794871794872, | |
| "grad_norm": 2.373611778744568, | |
| "learning_rate": 1.0554024673218808e-06, | |
| "loss": 0.1925, | |
| "mean_token_accuracy": 0.9461594521999359, | |
| "num_tokens": 5754314.0, | |
| "step": 617 | |
| }, | |
| { | |
| "epoch": 2.8764568764568765, | |
| "grad_norm": 2.3770918318336687, | |
| "learning_rate": 1.0516546853437686e-06, | |
| "loss": 0.1888, | |
| "mean_token_accuracy": 0.9470888376235962, | |
| "num_tokens": 5763507.0, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 2.8811188811188813, | |
| "grad_norm": 2.615459038788364, | |
| "learning_rate": 1.0480374125843114e-06, | |
| "loss": 0.1873, | |
| "mean_token_accuracy": 0.9462830722332001, | |
| "num_tokens": 5772748.0, | |
| "step": 619 | |
| }, | |
| { | |
| "epoch": 2.8857808857808855, | |
| "grad_norm": 2.412878313629322, | |
| "learning_rate": 1.0445507551700356e-06, | |
| "loss": 0.1896, | |
| "mean_token_accuracy": 0.9492884576320648, | |
| "num_tokens": 5782160.0, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 2.8904428904428903, | |
| "grad_norm": 2.349334019362764, | |
| "learning_rate": 1.0411948153953696e-06, | |
| "loss": 0.194, | |
| "mean_token_accuracy": 0.9473778307437897, | |
| "num_tokens": 5790728.0, | |
| "step": 621 | |
| }, | |
| { | |
| "epoch": 2.895104895104895, | |
| "grad_norm": 2.342666618452988, | |
| "learning_rate": 1.0379696917196378e-06, | |
| "loss": 0.1764, | |
| "mean_token_accuracy": 0.9477843642234802, | |
| "num_tokens": 5800771.0, | |
| "step": 622 | |
| }, | |
| { | |
| "epoch": 2.8997668997668997, | |
| "grad_norm": 2.369332709510537, | |
| "learning_rate": 1.0348754787641751e-06, | |
| "loss": 0.1775, | |
| "mean_token_accuracy": 0.949904203414917, | |
| "num_tokens": 5809633.0, | |
| "step": 623 | |
| }, | |
| { | |
| "epoch": 2.9044289044289044, | |
| "grad_norm": 2.2013635420452173, | |
| "learning_rate": 1.031912267309549e-06, | |
| "loss": 0.1624, | |
| "mean_token_accuracy": 0.9559362530708313, | |
| "num_tokens": 5819766.0, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 2.909090909090909, | |
| "grad_norm": 2.208698138845042, | |
| "learning_rate": 1.029080144292899e-06, | |
| "loss": 0.1829, | |
| "mean_token_accuracy": 0.9497508108615875, | |
| "num_tokens": 5829413.0, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 2.913752913752914, | |
| "grad_norm": 2.332053491912036, | |
| "learning_rate": 1.026379192805382e-06, | |
| "loss": 0.1777, | |
| "mean_token_accuracy": 0.9528988003730774, | |
| "num_tokens": 5839124.0, | |
| "step": 626 | |
| }, | |
| { | |
| "epoch": 2.9184149184149186, | |
| "grad_norm": 2.4789581729644077, | |
| "learning_rate": 1.0238094920897374e-06, | |
| "loss": 0.173, | |
| "mean_token_accuracy": 0.953327864408493, | |
| "num_tokens": 5848406.0, | |
| "step": 627 | |
| }, | |
| { | |
| "epoch": 2.9230769230769234, | |
| "grad_norm": 2.2923048279485667, | |
| "learning_rate": 1.0213711175379614e-06, | |
| "loss": 0.171, | |
| "mean_token_accuracy": 0.956305056810379, | |
| "num_tokens": 5857592.0, | |
| "step": 628 | |
| }, | |
| { | |
| "epoch": 2.9277389277389276, | |
| "grad_norm": 2.34320318426194, | |
| "learning_rate": 1.0190641406890946e-06, | |
| "loss": 0.1831, | |
| "mean_token_accuracy": 0.9514197111129761, | |
| "num_tokens": 5866687.0, | |
| "step": 629 | |
| }, | |
| { | |
| "epoch": 2.9324009324009324, | |
| "grad_norm": 2.4904828242049293, | |
| "learning_rate": 1.0168886292271246e-06, | |
| "loss": 0.189, | |
| "mean_token_accuracy": 0.9476959109306335, | |
| "num_tokens": 5875890.0, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 2.937062937062937, | |
| "grad_norm": 2.216909312747613, | |
| "learning_rate": 1.0148446469789979e-06, | |
| "loss": 0.179, | |
| "mean_token_accuracy": 0.9502739012241364, | |
| "num_tokens": 5884491.0, | |
| "step": 631 | |
| }, | |
| { | |
| "epoch": 2.941724941724942, | |
| "grad_norm": 2.393678749036967, | |
| "learning_rate": 1.0129322539127494e-06, | |
| "loss": 0.1815, | |
| "mean_token_accuracy": 0.9475315511226654, | |
| "num_tokens": 5894473.0, | |
| "step": 632 | |
| }, | |
| { | |
| "epoch": 2.9463869463869465, | |
| "grad_norm": 2.3025122993118443, | |
| "learning_rate": 1.011151506135742e-06, | |
| "loss": 0.1755, | |
| "mean_token_accuracy": 0.9525851905345917, | |
| "num_tokens": 5903874.0, | |
| "step": 633 | |
| }, | |
| { | |
| "epoch": 2.951048951048951, | |
| "grad_norm": 2.149623052195717, | |
| "learning_rate": 1.0095024558930204e-06, | |
| "loss": 0.1827, | |
| "mean_token_accuracy": 0.9500269293785095, | |
| "num_tokens": 5913132.0, | |
| "step": 634 | |
| }, | |
| { | |
| "epoch": 2.9557109557109555, | |
| "grad_norm": 2.347852575129903, | |
| "learning_rate": 1.0079851515657794e-06, | |
| "loss": 0.1992, | |
| "mean_token_accuracy": 0.9474380910396576, | |
| "num_tokens": 5923130.0, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 2.9603729603729603, | |
| "grad_norm": 2.4799488075118408, | |
| "learning_rate": 1.006599637669943e-06, | |
| "loss": 0.192, | |
| "mean_token_accuracy": 0.946626216173172, | |
| "num_tokens": 5931948.0, | |
| "step": 636 | |
| }, | |
| { | |
| "epoch": 2.965034965034965, | |
| "grad_norm": 2.1839881945919055, | |
| "learning_rate": 1.0053459548548582e-06, | |
| "loss": 0.1699, | |
| "mean_token_accuracy": 0.9527485072612762, | |
| "num_tokens": 5941672.0, | |
| "step": 637 | |
| }, | |
| { | |
| "epoch": 2.9696969696969697, | |
| "grad_norm": 2.3405062342525884, | |
| "learning_rate": 1.004224139902105e-06, | |
| "loss": 0.192, | |
| "mean_token_accuracy": 0.9456218183040619, | |
| "num_tokens": 5951095.0, | |
| "step": 638 | |
| }, | |
| { | |
| "epoch": 2.9743589743589745, | |
| "grad_norm": 2.1209481683555906, | |
| "learning_rate": 1.0032342257244139e-06, | |
| "loss": 0.17, | |
| "mean_token_accuracy": 0.9527087509632111, | |
| "num_tokens": 5961196.0, | |
| "step": 639 | |
| }, | |
| { | |
| "epoch": 2.979020979020979, | |
| "grad_norm": 2.3906839410006957, | |
| "learning_rate": 1.0023762413647023e-06, | |
| "loss": 0.19, | |
| "mean_token_accuracy": 0.9449608623981476, | |
| "num_tokens": 5970352.0, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 2.983682983682984, | |
| "grad_norm": 2.2744118662012736, | |
| "learning_rate": 1.0016502119952224e-06, | |
| "loss": 0.1824, | |
| "mean_token_accuracy": 0.9502459466457367, | |
| "num_tokens": 5979759.0, | |
| "step": 641 | |
| }, | |
| { | |
| "epoch": 2.988344988344988, | |
| "grad_norm": 2.3978292522263933, | |
| "learning_rate": 1.0010561589168217e-06, | |
| "loss": 0.1858, | |
| "mean_token_accuracy": 0.9475610256195068, | |
| "num_tokens": 5989632.0, | |
| "step": 642 | |
| }, | |
| { | |
| "epoch": 2.993006993006993, | |
| "grad_norm": 2.220217029439386, | |
| "learning_rate": 1.0005940995583183e-06, | |
| "loss": 0.1843, | |
| "mean_token_accuracy": 0.9482160210609436, | |
| "num_tokens": 5999031.0, | |
| "step": 643 | |
| }, | |
| { | |
| "epoch": 2.9976689976689976, | |
| "grad_norm": 2.303701651736636, | |
| "learning_rate": 1.0002640474759911e-06, | |
| "loss": 0.1822, | |
| "mean_token_accuracy": 0.950013667345047, | |
| "num_tokens": 6008269.0, | |
| "step": 644 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 2.303701651736636, | |
| "learning_rate": 1.0000660123531788e-06, | |
| "loss": 0.1305, | |
| "mean_token_accuracy": 0.9722093343734741, | |
| "num_tokens": 6010434.0, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "step": 645, | |
| "total_flos": 194968109678592.0, | |
| "train_loss": 0.4934643579314845, | |
| "train_runtime": 19223.4417, | |
| "train_samples_per_second": 1.069, | |
| "train_steps_per_second": 0.034 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 645, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 194968109678592.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |