| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 5.0, |
| "eval_steps": 100.0, |
| "global_step": 320, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.015625, |
| "grad_norm": 179.0, |
| "learning_rate": 0.0, |
| "loss": 5.2367, |
| "mean_token_accuracy": 0.43311607837677, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.03125, |
| "grad_norm": 198.0, |
| "learning_rate": 3.75e-07, |
| "loss": 5.5486, |
| "mean_token_accuracy": 0.4108841121196747, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.046875, |
| "grad_norm": 178.0, |
| "learning_rate": 7.5e-07, |
| "loss": 5.2141, |
| "mean_token_accuracy": 0.4345736503601074, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.0625, |
| "grad_norm": 172.0, |
| "learning_rate": 1.125e-06, |
| "loss": 5.5836, |
| "mean_token_accuracy": 0.41250425577163696, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.078125, |
| "grad_norm": 152.0, |
| "learning_rate": 1.5e-06, |
| "loss": 5.2296, |
| "mean_token_accuracy": 0.4149760901927948, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.09375, |
| "grad_norm": 126.5, |
| "learning_rate": 1.875e-06, |
| "loss": 4.6572, |
| "mean_token_accuracy": 0.4508528411388397, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.109375, |
| "grad_norm": 122.0, |
| "learning_rate": 2.25e-06, |
| "loss": 4.8441, |
| "mean_token_accuracy": 0.4253324270248413, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.125, |
| "grad_norm": 93.5, |
| "learning_rate": 2.6250000000000003e-06, |
| "loss": 4.3849, |
| "mean_token_accuracy": 0.4506120979785919, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.140625, |
| "grad_norm": 73.0, |
| "learning_rate": 3e-06, |
| "loss": 4.0401, |
| "mean_token_accuracy": 0.45889315009117126, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.15625, |
| "grad_norm": 60.75, |
| "learning_rate": 2.9903846153846156e-06, |
| "loss": 3.8834, |
| "mean_token_accuracy": 0.47038498520851135, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.171875, |
| "grad_norm": 50.5, |
| "learning_rate": 2.9807692307692307e-06, |
| "loss": 3.6623, |
| "mean_token_accuracy": 0.4854481816291809, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.1875, |
| "grad_norm": 43.5, |
| "learning_rate": 2.9711538461538463e-06, |
| "loss": 3.4374, |
| "mean_token_accuracy": 0.4968532621860504, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.203125, |
| "grad_norm": 45.5, |
| "learning_rate": 2.961538461538462e-06, |
| "loss": 3.2283, |
| "mean_token_accuracy": 0.513970673084259, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.21875, |
| "grad_norm": 35.25, |
| "learning_rate": 2.951923076923077e-06, |
| "loss": 3.1423, |
| "mean_token_accuracy": 0.5192499160766602, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.234375, |
| "grad_norm": 32.0, |
| "learning_rate": 2.942307692307692e-06, |
| "loss": 2.9815, |
| "mean_token_accuracy": 0.5281234383583069, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 32.5, |
| "learning_rate": 2.9326923076923076e-06, |
| "loss": 2.863, |
| "mean_token_accuracy": 0.5459433197975159, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.265625, |
| "grad_norm": 27.5, |
| "learning_rate": 2.923076923076923e-06, |
| "loss": 2.8314, |
| "mean_token_accuracy": 0.5349738597869873, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.28125, |
| "grad_norm": 26.125, |
| "learning_rate": 2.9134615384615387e-06, |
| "loss": 2.7488, |
| "mean_token_accuracy": 0.5427058339118958, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.296875, |
| "grad_norm": 23.625, |
| "learning_rate": 2.903846153846154e-06, |
| "loss": 2.6134, |
| "mean_token_accuracy": 0.5453544855117798, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.3125, |
| "grad_norm": 23.75, |
| "learning_rate": 2.8942307692307693e-06, |
| "loss": 2.562, |
| "mean_token_accuracy": 0.5491897463798523, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.328125, |
| "grad_norm": 22.625, |
| "learning_rate": 2.884615384615385e-06, |
| "loss": 2.4742, |
| "mean_token_accuracy": 0.5496063232421875, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.34375, |
| "grad_norm": 21.875, |
| "learning_rate": 2.875e-06, |
| "loss": 2.3924, |
| "mean_token_accuracy": 0.5709417462348938, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.359375, |
| "grad_norm": 22.625, |
| "learning_rate": 2.8653846153846155e-06, |
| "loss": 2.2842, |
| "mean_token_accuracy": 0.5809605717658997, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.375, |
| "grad_norm": 21.125, |
| "learning_rate": 2.8557692307692307e-06, |
| "loss": 2.1831, |
| "mean_token_accuracy": 0.5968478918075562, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.390625, |
| "grad_norm": 21.875, |
| "learning_rate": 2.846153846153846e-06, |
| "loss": 2.1536, |
| "mean_token_accuracy": 0.602466344833374, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.40625, |
| "grad_norm": 21.5, |
| "learning_rate": 2.8365384615384613e-06, |
| "loss": 2.0267, |
| "mean_token_accuracy": 0.6342177987098694, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.421875, |
| "grad_norm": 23.25, |
| "learning_rate": 2.826923076923077e-06, |
| "loss": 1.9421, |
| "mean_token_accuracy": 0.6432027220726013, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.4375, |
| "grad_norm": 23.5, |
| "learning_rate": 2.8173076923076924e-06, |
| "loss": 1.9156, |
| "mean_token_accuracy": 0.6579385995864868, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.453125, |
| "grad_norm": 24.125, |
| "learning_rate": 2.807692307692308e-06, |
| "loss": 1.8596, |
| "mean_token_accuracy": 0.6711886525154114, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.46875, |
| "grad_norm": 21.5, |
| "learning_rate": 2.798076923076923e-06, |
| "loss": 1.7841, |
| "mean_token_accuracy": 0.6769159436225891, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.484375, |
| "grad_norm": 19.25, |
| "learning_rate": 2.7884615384615386e-06, |
| "loss": 1.7306, |
| "mean_token_accuracy": 0.6835744976997375, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 19.375, |
| "learning_rate": 2.778846153846154e-06, |
| "loss": 1.6979, |
| "mean_token_accuracy": 0.6885009407997131, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.515625, |
| "grad_norm": 19.625, |
| "learning_rate": 2.7692307692307693e-06, |
| "loss": 1.6705, |
| "mean_token_accuracy": 0.6899428963661194, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.53125, |
| "grad_norm": 19.375, |
| "learning_rate": 2.7596153846153844e-06, |
| "loss": 1.5569, |
| "mean_token_accuracy": 0.7047604918479919, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.546875, |
| "grad_norm": 25.25, |
| "learning_rate": 2.75e-06, |
| "loss": 1.5437, |
| "mean_token_accuracy": 0.7077205777168274, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.5625, |
| "grad_norm": 18.0, |
| "learning_rate": 2.7403846153846155e-06, |
| "loss": 1.5079, |
| "mean_token_accuracy": 0.721455454826355, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.578125, |
| "grad_norm": 16.125, |
| "learning_rate": 2.7307692307692306e-06, |
| "loss": 1.4751, |
| "mean_token_accuracy": 0.731259822845459, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.59375, |
| "grad_norm": 13.875, |
| "learning_rate": 2.721153846153846e-06, |
| "loss": 1.4444, |
| "mean_token_accuracy": 0.721596896648407, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.609375, |
| "grad_norm": 17.25, |
| "learning_rate": 2.7115384615384617e-06, |
| "loss": 1.4267, |
| "mean_token_accuracy": 0.7311121821403503, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.625, |
| "grad_norm": 14.6875, |
| "learning_rate": 2.7019230769230772e-06, |
| "loss": 1.4606, |
| "mean_token_accuracy": 0.7229064106941223, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.640625, |
| "grad_norm": 12.0625, |
| "learning_rate": 2.6923076923076923e-06, |
| "loss": 1.4415, |
| "mean_token_accuracy": 0.7228537201881409, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.65625, |
| "grad_norm": 12.1875, |
| "learning_rate": 2.682692307692308e-06, |
| "loss": 1.3615, |
| "mean_token_accuracy": 0.7459965944290161, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.671875, |
| "grad_norm": 12.0625, |
| "learning_rate": 2.6730769230769234e-06, |
| "loss": 1.3769, |
| "mean_token_accuracy": 0.7359253764152527, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.6875, |
| "grad_norm": 12.0, |
| "learning_rate": 2.6634615384615385e-06, |
| "loss": 1.3322, |
| "mean_token_accuracy": 0.7482958436012268, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.703125, |
| "grad_norm": 11.3125, |
| "learning_rate": 2.6538461538461537e-06, |
| "loss": 1.3471, |
| "mean_token_accuracy": 0.738798975944519, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.71875, |
| "grad_norm": 12.8125, |
| "learning_rate": 2.644230769230769e-06, |
| "loss": 1.2892, |
| "mean_token_accuracy": 0.7512567043304443, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.734375, |
| "grad_norm": 12.25, |
| "learning_rate": 2.6346153846153847e-06, |
| "loss": 1.2592, |
| "mean_token_accuracy": 0.7561485171318054, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 13.875, |
| "learning_rate": 2.6250000000000003e-06, |
| "loss": 1.289, |
| "mean_token_accuracy": 0.7504905462265015, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.765625, |
| "grad_norm": 17.25, |
| "learning_rate": 2.6153846153846154e-06, |
| "loss": 1.2412, |
| "mean_token_accuracy": 0.7566288113594055, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.78125, |
| "grad_norm": 24.375, |
| "learning_rate": 2.605769230769231e-06, |
| "loss": 1.2232, |
| "mean_token_accuracy": 0.7667444348335266, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.796875, |
| "grad_norm": 34.25, |
| "learning_rate": 2.5961538461538465e-06, |
| "loss": 1.2176, |
| "mean_token_accuracy": 0.7597681879997253, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.8125, |
| "grad_norm": 31.0, |
| "learning_rate": 2.5865384615384616e-06, |
| "loss": 1.2091, |
| "mean_token_accuracy": 0.7564249634742737, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.828125, |
| "grad_norm": 9.4375, |
| "learning_rate": 2.5769230769230767e-06, |
| "loss": 1.1706, |
| "mean_token_accuracy": 0.7586262226104736, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.84375, |
| "grad_norm": 9.3125, |
| "learning_rate": 2.5673076923076923e-06, |
| "loss": 1.2491, |
| "mean_token_accuracy": 0.7373932600021362, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.859375, |
| "grad_norm": 8.8125, |
| "learning_rate": 2.557692307692308e-06, |
| "loss": 1.2008, |
| "mean_token_accuracy": 0.7576074004173279, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.875, |
| "grad_norm": 7.8125, |
| "learning_rate": 2.548076923076923e-06, |
| "loss": 1.1553, |
| "mean_token_accuracy": 0.7607351541519165, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.890625, |
| "grad_norm": 7.875, |
| "learning_rate": 2.5384615384615385e-06, |
| "loss": 1.1896, |
| "mean_token_accuracy": 0.7548706531524658, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.90625, |
| "grad_norm": 13.6875, |
| "learning_rate": 2.528846153846154e-06, |
| "loss": 1.156, |
| "mean_token_accuracy": 0.7589271664619446, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.921875, |
| "grad_norm": 12.375, |
| "learning_rate": 2.5192307692307695e-06, |
| "loss": 1.1924, |
| "mean_token_accuracy": 0.7581174373626709, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.9375, |
| "grad_norm": 8.875, |
| "learning_rate": 2.5096153846153847e-06, |
| "loss": 1.193, |
| "mean_token_accuracy": 0.7505763173103333, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.953125, |
| "grad_norm": 7.53125, |
| "learning_rate": 2.5e-06, |
| "loss": 1.118, |
| "mean_token_accuracy": 0.7625401020050049, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.96875, |
| "grad_norm": 7.03125, |
| "learning_rate": 2.4903846153846157e-06, |
| "loss": 1.1186, |
| "mean_token_accuracy": 0.7640949487686157, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.984375, |
| "grad_norm": 6.9375, |
| "learning_rate": 2.480769230769231e-06, |
| "loss": 1.1178, |
| "mean_token_accuracy": 0.7689425945281982, |
| "step": 63 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 9.75, |
| "learning_rate": 2.471153846153846e-06, |
| "loss": 1.1485, |
| "mean_token_accuracy": 0.7646326422691345, |
| "step": 64 |
| }, |
| { |
| "epoch": 1.015625, |
| "grad_norm": 15.3125, |
| "learning_rate": 2.4615384615384615e-06, |
| "loss": 1.1362, |
| "mean_token_accuracy": 0.7596017122268677, |
| "step": 65 |
| }, |
| { |
| "epoch": 1.03125, |
| "grad_norm": 11.5625, |
| "learning_rate": 2.451923076923077e-06, |
| "loss": 1.1227, |
| "mean_token_accuracy": 0.7617481350898743, |
| "step": 66 |
| }, |
| { |
| "epoch": 1.046875, |
| "grad_norm": 29.125, |
| "learning_rate": 2.442307692307692e-06, |
| "loss": 1.0983, |
| "mean_token_accuracy": 0.7637330293655396, |
| "step": 67 |
| }, |
| { |
| "epoch": 1.0625, |
| "grad_norm": 11.0625, |
| "learning_rate": 2.4326923076923077e-06, |
| "loss": 1.1244, |
| "mean_token_accuracy": 0.75943922996521, |
| "step": 68 |
| }, |
| { |
| "epoch": 1.078125, |
| "grad_norm": 6.78125, |
| "learning_rate": 2.4230769230769233e-06, |
| "loss": 1.1029, |
| "mean_token_accuracy": 0.7678335309028625, |
| "step": 69 |
| }, |
| { |
| "epoch": 1.09375, |
| "grad_norm": 8.25, |
| "learning_rate": 2.413461538461539e-06, |
| "loss": 1.1107, |
| "mean_token_accuracy": 0.766456127166748, |
| "step": 70 |
| }, |
| { |
| "epoch": 1.109375, |
| "grad_norm": 6.5, |
| "learning_rate": 2.403846153846154e-06, |
| "loss": 1.0539, |
| "mean_token_accuracy": 0.7779717445373535, |
| "step": 71 |
| }, |
| { |
| "epoch": 1.125, |
| "grad_norm": 11.125, |
| "learning_rate": 2.3942307692307695e-06, |
| "loss": 1.1745, |
| "mean_token_accuracy": 0.7527372241020203, |
| "step": 72 |
| }, |
| { |
| "epoch": 1.140625, |
| "grad_norm": 6.65625, |
| "learning_rate": 2.3846153846153846e-06, |
| "loss": 1.1154, |
| "mean_token_accuracy": 0.7618581652641296, |
| "step": 73 |
| }, |
| { |
| "epoch": 1.15625, |
| "grad_norm": 6.65625, |
| "learning_rate": 2.375e-06, |
| "loss": 1.0716, |
| "mean_token_accuracy": 0.7723715305328369, |
| "step": 74 |
| }, |
| { |
| "epoch": 1.171875, |
| "grad_norm": 7.09375, |
| "learning_rate": 2.3653846153846152e-06, |
| "loss": 1.0951, |
| "mean_token_accuracy": 0.7622343897819519, |
| "step": 75 |
| }, |
| { |
| "epoch": 1.1875, |
| "grad_norm": 6.375, |
| "learning_rate": 2.355769230769231e-06, |
| "loss": 1.1046, |
| "mean_token_accuracy": 0.768226146697998, |
| "step": 76 |
| }, |
| { |
| "epoch": 1.203125, |
| "grad_norm": 8.0, |
| "learning_rate": 2.3461538461538463e-06, |
| "loss": 1.1724, |
| "mean_token_accuracy": 0.7464057803153992, |
| "step": 77 |
| }, |
| { |
| "epoch": 1.21875, |
| "grad_norm": 6.96875, |
| "learning_rate": 2.3365384615384615e-06, |
| "loss": 1.0518, |
| "mean_token_accuracy": 0.7725957036018372, |
| "step": 78 |
| }, |
| { |
| "epoch": 1.234375, |
| "grad_norm": 7.03125, |
| "learning_rate": 2.326923076923077e-06, |
| "loss": 1.0578, |
| "mean_token_accuracy": 0.7661430835723877, |
| "step": 79 |
| }, |
| { |
| "epoch": 1.25, |
| "grad_norm": 7.15625, |
| "learning_rate": 2.3173076923076925e-06, |
| "loss": 1.0921, |
| "mean_token_accuracy": 0.7706676125526428, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.265625, |
| "grad_norm": 7.9375, |
| "learning_rate": 2.307692307692308e-06, |
| "loss": 1.0947, |
| "mean_token_accuracy": 0.7600434422492981, |
| "step": 81 |
| }, |
| { |
| "epoch": 1.28125, |
| "grad_norm": 6.28125, |
| "learning_rate": 2.298076923076923e-06, |
| "loss": 1.0152, |
| "mean_token_accuracy": 0.7791481614112854, |
| "step": 82 |
| }, |
| { |
| "epoch": 1.296875, |
| "grad_norm": 6.40625, |
| "learning_rate": 2.2884615384615383e-06, |
| "loss": 1.1093, |
| "mean_token_accuracy": 0.7621586918830872, |
| "step": 83 |
| }, |
| { |
| "epoch": 1.3125, |
| "grad_norm": 6.0625, |
| "learning_rate": 2.278846153846154e-06, |
| "loss": 1.0595, |
| "mean_token_accuracy": 0.7689430713653564, |
| "step": 84 |
| }, |
| { |
| "epoch": 1.328125, |
| "grad_norm": 25.625, |
| "learning_rate": 2.2692307692307694e-06, |
| "loss": 1.1264, |
| "mean_token_accuracy": 0.7479003667831421, |
| "step": 85 |
| }, |
| { |
| "epoch": 1.34375, |
| "grad_norm": 6.9375, |
| "learning_rate": 2.2596153846153845e-06, |
| "loss": 1.0278, |
| "mean_token_accuracy": 0.7810243964195251, |
| "step": 86 |
| }, |
| { |
| "epoch": 1.359375, |
| "grad_norm": 12.0, |
| "learning_rate": 2.25e-06, |
| "loss": 1.0898, |
| "mean_token_accuracy": 0.7608263492584229, |
| "step": 87 |
| }, |
| { |
| "epoch": 1.375, |
| "grad_norm": 6.96875, |
| "learning_rate": 2.2403846153846156e-06, |
| "loss": 1.0646, |
| "mean_token_accuracy": 0.7628912925720215, |
| "step": 88 |
| }, |
| { |
| "epoch": 1.390625, |
| "grad_norm": 6.3125, |
| "learning_rate": 2.2307692307692307e-06, |
| "loss": 1.1259, |
| "mean_token_accuracy": 0.747133195400238, |
| "step": 89 |
| }, |
| { |
| "epoch": 1.40625, |
| "grad_norm": 6.96875, |
| "learning_rate": 2.2211538461538463e-06, |
| "loss": 1.0527, |
| "mean_token_accuracy": 0.769545316696167, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.421875, |
| "grad_norm": 6.78125, |
| "learning_rate": 2.211538461538462e-06, |
| "loss": 1.0857, |
| "mean_token_accuracy": 0.7563959956169128, |
| "step": 91 |
| }, |
| { |
| "epoch": 1.4375, |
| "grad_norm": 6.6875, |
| "learning_rate": 2.201923076923077e-06, |
| "loss": 1.0779, |
| "mean_token_accuracy": 0.7625620365142822, |
| "step": 92 |
| }, |
| { |
| "epoch": 1.453125, |
| "grad_norm": 11.0, |
| "learning_rate": 2.192307692307692e-06, |
| "loss": 1.0615, |
| "mean_token_accuracy": 0.7684396505355835, |
| "step": 93 |
| }, |
| { |
| "epoch": 1.46875, |
| "grad_norm": 7.53125, |
| "learning_rate": 2.1826923076923076e-06, |
| "loss": 1.0067, |
| "mean_token_accuracy": 0.773740291595459, |
| "step": 94 |
| }, |
| { |
| "epoch": 1.484375, |
| "grad_norm": 6.40625, |
| "learning_rate": 2.173076923076923e-06, |
| "loss": 1.0334, |
| "mean_token_accuracy": 0.7674839496612549, |
| "step": 95 |
| }, |
| { |
| "epoch": 1.5, |
| "grad_norm": 9.3125, |
| "learning_rate": 2.1634615384615387e-06, |
| "loss": 1.0772, |
| "mean_token_accuracy": 0.7548962235450745, |
| "step": 96 |
| }, |
| { |
| "epoch": 1.515625, |
| "grad_norm": 20.25, |
| "learning_rate": 2.1538461538461538e-06, |
| "loss": 1.0953, |
| "mean_token_accuracy": 0.7568336129188538, |
| "step": 97 |
| }, |
| { |
| "epoch": 1.53125, |
| "grad_norm": 6.84375, |
| "learning_rate": 2.1442307692307693e-06, |
| "loss": 1.0656, |
| "mean_token_accuracy": 0.7618821859359741, |
| "step": 98 |
| }, |
| { |
| "epoch": 1.546875, |
| "grad_norm": 6.96875, |
| "learning_rate": 2.134615384615385e-06, |
| "loss": 0.9638, |
| "mean_token_accuracy": 0.7852028608322144, |
| "step": 99 |
| }, |
| { |
| "epoch": 1.5625, |
| "grad_norm": 16.25, |
| "learning_rate": 2.125e-06, |
| "loss": 1.057, |
| "mean_token_accuracy": 0.7648255228996277, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.578125, |
| "grad_norm": 6.59375, |
| "learning_rate": 2.1153846153846155e-06, |
| "loss": 1.0351, |
| "mean_token_accuracy": 0.7706173062324524, |
| "step": 101 |
| }, |
| { |
| "epoch": 1.59375, |
| "grad_norm": 19.375, |
| "learning_rate": 2.1057692307692306e-06, |
| "loss": 1.0262, |
| "mean_token_accuracy": 0.7676799297332764, |
| "step": 102 |
| }, |
| { |
| "epoch": 1.609375, |
| "grad_norm": 6.40625, |
| "learning_rate": 2.096153846153846e-06, |
| "loss": 0.9901, |
| "mean_token_accuracy": 0.7759581208229065, |
| "step": 103 |
| }, |
| { |
| "epoch": 1.625, |
| "grad_norm": 8.5, |
| "learning_rate": 2.0865384615384613e-06, |
| "loss": 0.9888, |
| "mean_token_accuracy": 0.7782798409461975, |
| "step": 104 |
| }, |
| { |
| "epoch": 1.640625, |
| "grad_norm": 6.90625, |
| "learning_rate": 2.076923076923077e-06, |
| "loss": 0.9912, |
| "mean_token_accuracy": 0.7795735001564026, |
| "step": 105 |
| }, |
| { |
| "epoch": 1.65625, |
| "grad_norm": 8.6875, |
| "learning_rate": 2.0673076923076924e-06, |
| "loss": 1.0046, |
| "mean_token_accuracy": 0.7750487923622131, |
| "step": 106 |
| }, |
| { |
| "epoch": 1.671875, |
| "grad_norm": 6.59375, |
| "learning_rate": 2.057692307692308e-06, |
| "loss": 1.0527, |
| "mean_token_accuracy": 0.7730545997619629, |
| "step": 107 |
| }, |
| { |
| "epoch": 1.6875, |
| "grad_norm": 7.0, |
| "learning_rate": 2.048076923076923e-06, |
| "loss": 0.9943, |
| "mean_token_accuracy": 0.7790677547454834, |
| "step": 108 |
| }, |
| { |
| "epoch": 1.703125, |
| "grad_norm": 52.25, |
| "learning_rate": 2.0384615384615386e-06, |
| "loss": 1.0243, |
| "mean_token_accuracy": 0.7736808657646179, |
| "step": 109 |
| }, |
| { |
| "epoch": 1.71875, |
| "grad_norm": 6.65625, |
| "learning_rate": 2.028846153846154e-06, |
| "loss": 0.9599, |
| "mean_token_accuracy": 0.7829864025115967, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.734375, |
| "grad_norm": 6.875, |
| "learning_rate": 2.0192307692307692e-06, |
| "loss": 0.993, |
| "mean_token_accuracy": 0.784246563911438, |
| "step": 111 |
| }, |
| { |
| "epoch": 1.75, |
| "grad_norm": 15.9375, |
| "learning_rate": 2.0096153846153844e-06, |
| "loss": 1.0284, |
| "mean_token_accuracy": 0.7701562643051147, |
| "step": 112 |
| }, |
| { |
| "epoch": 1.765625, |
| "grad_norm": 11.25, |
| "learning_rate": 2e-06, |
| "loss": 1.0453, |
| "mean_token_accuracy": 0.765605092048645, |
| "step": 113 |
| }, |
| { |
| "epoch": 1.78125, |
| "grad_norm": 11.25, |
| "learning_rate": 1.9903846153846155e-06, |
| "loss": 0.9963, |
| "mean_token_accuracy": 0.7780460119247437, |
| "step": 114 |
| }, |
| { |
| "epoch": 1.796875, |
| "grad_norm": 7.8125, |
| "learning_rate": 1.9807692307692306e-06, |
| "loss": 1.0104, |
| "mean_token_accuracy": 0.7722231149673462, |
| "step": 115 |
| }, |
| { |
| "epoch": 1.8125, |
| "grad_norm": 7.46875, |
| "learning_rate": 1.971153846153846e-06, |
| "loss": 0.9977, |
| "mean_token_accuracy": 0.7763562798500061, |
| "step": 116 |
| }, |
| { |
| "epoch": 1.828125, |
| "grad_norm": 7.09375, |
| "learning_rate": 1.9615384615384617e-06, |
| "loss": 0.9994, |
| "mean_token_accuracy": 0.7727492451667786, |
| "step": 117 |
| }, |
| { |
| "epoch": 1.84375, |
| "grad_norm": 6.71875, |
| "learning_rate": 1.951923076923077e-06, |
| "loss": 0.9729, |
| "mean_token_accuracy": 0.7792785167694092, |
| "step": 118 |
| }, |
| { |
| "epoch": 1.859375, |
| "grad_norm": 10.25, |
| "learning_rate": 1.9423076923076923e-06, |
| "loss": 1.0092, |
| "mean_token_accuracy": 0.7684956789016724, |
| "step": 119 |
| }, |
| { |
| "epoch": 1.875, |
| "grad_norm": 7.53125, |
| "learning_rate": 1.932692307692308e-06, |
| "loss": 0.962, |
| "mean_token_accuracy": 0.7807108163833618, |
| "step": 120 |
| }, |
| { |
| "epoch": 1.890625, |
| "grad_norm": 7.0, |
| "learning_rate": 1.9230769230769234e-06, |
| "loss": 1.0035, |
| "mean_token_accuracy": 0.7749958634376526, |
| "step": 121 |
| }, |
| { |
| "epoch": 1.90625, |
| "grad_norm": 9.3125, |
| "learning_rate": 1.9134615384615385e-06, |
| "loss": 1.0121, |
| "mean_token_accuracy": 0.7693877816200256, |
| "step": 122 |
| }, |
| { |
| "epoch": 1.921875, |
| "grad_norm": 6.46875, |
| "learning_rate": 1.9038461538461538e-06, |
| "loss": 0.9797, |
| "mean_token_accuracy": 0.7859560251235962, |
| "step": 123 |
| }, |
| { |
| "epoch": 1.9375, |
| "grad_norm": 6.65625, |
| "learning_rate": 1.8942307692307692e-06, |
| "loss": 0.9071, |
| "mean_token_accuracy": 0.7927353978157043, |
| "step": 124 |
| }, |
| { |
| "epoch": 1.953125, |
| "grad_norm": 6.5, |
| "learning_rate": 1.8846153846153847e-06, |
| "loss": 0.8898, |
| "mean_token_accuracy": 0.7983871102333069, |
| "step": 125 |
| }, |
| { |
| "epoch": 1.96875, |
| "grad_norm": 11.625, |
| "learning_rate": 1.875e-06, |
| "loss": 0.9509, |
| "mean_token_accuracy": 0.7818240523338318, |
| "step": 126 |
| }, |
| { |
| "epoch": 1.984375, |
| "grad_norm": 23.0, |
| "learning_rate": 1.8653846153846154e-06, |
| "loss": 0.9974, |
| "mean_token_accuracy": 0.7754906415939331, |
| "step": 127 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 16.25, |
| "learning_rate": 1.855769230769231e-06, |
| "loss": 0.9983, |
| "mean_token_accuracy": 0.7626262903213501, |
| "step": 128 |
| }, |
| { |
| "epoch": 2.015625, |
| "grad_norm": 107.5, |
| "learning_rate": 1.8461538461538462e-06, |
| "loss": 0.981, |
| "mean_token_accuracy": 0.7775020599365234, |
| "step": 129 |
| }, |
| { |
| "epoch": 2.03125, |
| "grad_norm": 78.0, |
| "learning_rate": 1.8365384615384618e-06, |
| "loss": 1.0013, |
| "mean_token_accuracy": 0.777273416519165, |
| "step": 130 |
| }, |
| { |
| "epoch": 2.046875, |
| "grad_norm": 15.75, |
| "learning_rate": 1.826923076923077e-06, |
| "loss": 1.0124, |
| "mean_token_accuracy": 0.7745746970176697, |
| "step": 131 |
| }, |
| { |
| "epoch": 2.0625, |
| "grad_norm": 6.75, |
| "learning_rate": 1.8173076923076922e-06, |
| "loss": 1.0109, |
| "mean_token_accuracy": 0.7626786828041077, |
| "step": 132 |
| }, |
| { |
| "epoch": 2.078125, |
| "grad_norm": 10.75, |
| "learning_rate": 1.8076923076923076e-06, |
| "loss": 0.9854, |
| "mean_token_accuracy": 0.7782512903213501, |
| "step": 133 |
| }, |
| { |
| "epoch": 2.09375, |
| "grad_norm": 13.1875, |
| "learning_rate": 1.7980769230769231e-06, |
| "loss": 0.9558, |
| "mean_token_accuracy": 0.7810325026512146, |
| "step": 134 |
| }, |
| { |
| "epoch": 2.109375, |
| "grad_norm": 9.875, |
| "learning_rate": 1.7884615384615384e-06, |
| "loss": 0.9925, |
| "mean_token_accuracy": 0.7802160978317261, |
| "step": 135 |
| }, |
| { |
| "epoch": 2.125, |
| "grad_norm": 7.40625, |
| "learning_rate": 1.778846153846154e-06, |
| "loss": 0.9899, |
| "mean_token_accuracy": 0.7759094834327698, |
| "step": 136 |
| }, |
| { |
| "epoch": 2.140625, |
| "grad_norm": 6.28125, |
| "learning_rate": 1.7692307692307693e-06, |
| "loss": 0.9509, |
| "mean_token_accuracy": 0.7890124917030334, |
| "step": 137 |
| }, |
| { |
| "epoch": 2.15625, |
| "grad_norm": 8.1875, |
| "learning_rate": 1.7596153846153846e-06, |
| "loss": 0.9832, |
| "mean_token_accuracy": 0.7752114534378052, |
| "step": 138 |
| }, |
| { |
| "epoch": 2.171875, |
| "grad_norm": 6.25, |
| "learning_rate": 1.7500000000000002e-06, |
| "loss": 0.9225, |
| "mean_token_accuracy": 0.7903473973274231, |
| "step": 139 |
| }, |
| { |
| "epoch": 2.1875, |
| "grad_norm": 5.84375, |
| "learning_rate": 1.7403846153846155e-06, |
| "loss": 0.9476, |
| "mean_token_accuracy": 0.7873295545578003, |
| "step": 140 |
| }, |
| { |
| "epoch": 2.203125, |
| "grad_norm": 20.625, |
| "learning_rate": 1.7307692307692306e-06, |
| "loss": 1.0055, |
| "mean_token_accuracy": 0.7686769962310791, |
| "step": 141 |
| }, |
| { |
| "epoch": 2.21875, |
| "grad_norm": 26.625, |
| "learning_rate": 1.7211538461538462e-06, |
| "loss": 1.0145, |
| "mean_token_accuracy": 0.7676523923873901, |
| "step": 142 |
| }, |
| { |
| "epoch": 2.234375, |
| "grad_norm": 7.4375, |
| "learning_rate": 1.7115384615384615e-06, |
| "loss": 0.9358, |
| "mean_token_accuracy": 0.7874542474746704, |
| "step": 143 |
| }, |
| { |
| "epoch": 2.25, |
| "grad_norm": 23.875, |
| "learning_rate": 1.7019230769230768e-06, |
| "loss": 1.0153, |
| "mean_token_accuracy": 0.768985390663147, |
| "step": 144 |
| }, |
| { |
| "epoch": 2.265625, |
| "grad_norm": 6.625, |
| "learning_rate": 1.6923076923076924e-06, |
| "loss": 0.9844, |
| "mean_token_accuracy": 0.7775037288665771, |
| "step": 145 |
| }, |
| { |
| "epoch": 2.28125, |
| "grad_norm": 7.03125, |
| "learning_rate": 1.6826923076923077e-06, |
| "loss": 0.9851, |
| "mean_token_accuracy": 0.7740368843078613, |
| "step": 146 |
| }, |
| { |
| "epoch": 2.296875, |
| "grad_norm": 44.25, |
| "learning_rate": 1.6730769230769232e-06, |
| "loss": 0.9817, |
| "mean_token_accuracy": 0.7741778492927551, |
| "step": 147 |
| }, |
| { |
| "epoch": 2.3125, |
| "grad_norm": 13.3125, |
| "learning_rate": 1.6634615384615386e-06, |
| "loss": 0.9974, |
| "mean_token_accuracy": 0.7730644941329956, |
| "step": 148 |
| }, |
| { |
| "epoch": 2.328125, |
| "grad_norm": 5.625, |
| "learning_rate": 1.653846153846154e-06, |
| "loss": 0.927, |
| "mean_token_accuracy": 0.7886461615562439, |
| "step": 149 |
| }, |
| { |
| "epoch": 2.34375, |
| "grad_norm": 6.9375, |
| "learning_rate": 1.6442307692307695e-06, |
| "loss": 0.8944, |
| "mean_token_accuracy": 0.795914888381958, |
| "step": 150 |
| }, |
| { |
| "epoch": 2.359375, |
| "grad_norm": 8.625, |
| "learning_rate": 1.6346153846153846e-06, |
| "loss": 0.9641, |
| "mean_token_accuracy": 0.7850483059883118, |
| "step": 151 |
| }, |
| { |
| "epoch": 2.375, |
| "grad_norm": 61.25, |
| "learning_rate": 1.625e-06, |
| "loss": 0.9894, |
| "mean_token_accuracy": 0.7669225335121155, |
| "step": 152 |
| }, |
| { |
| "epoch": 2.390625, |
| "grad_norm": 13.9375, |
| "learning_rate": 1.6153846153846154e-06, |
| "loss": 0.9687, |
| "mean_token_accuracy": 0.7841285467147827, |
| "step": 153 |
| }, |
| { |
| "epoch": 2.40625, |
| "grad_norm": 25.375, |
| "learning_rate": 1.6057692307692308e-06, |
| "loss": 0.9431, |
| "mean_token_accuracy": 0.7892006039619446, |
| "step": 154 |
| }, |
| { |
| "epoch": 2.421875, |
| "grad_norm": 6.28125, |
| "learning_rate": 1.596153846153846e-06, |
| "loss": 0.9615, |
| "mean_token_accuracy": 0.7802021503448486, |
| "step": 155 |
| }, |
| { |
| "epoch": 2.4375, |
| "grad_norm": 9.4375, |
| "learning_rate": 1.5865384615384616e-06, |
| "loss": 0.912, |
| "mean_token_accuracy": 0.7976588606834412, |
| "step": 156 |
| }, |
| { |
| "epoch": 2.453125, |
| "grad_norm": 6.1875, |
| "learning_rate": 1.576923076923077e-06, |
| "loss": 0.944, |
| "mean_token_accuracy": 0.7864833474159241, |
| "step": 157 |
| }, |
| { |
| "epoch": 2.46875, |
| "grad_norm": 10.4375, |
| "learning_rate": 1.5673076923076925e-06, |
| "loss": 0.9251, |
| "mean_token_accuracy": 0.7923402786254883, |
| "step": 158 |
| }, |
| { |
| "epoch": 2.484375, |
| "grad_norm": 14.0625, |
| "learning_rate": 1.5576923076923078e-06, |
| "loss": 0.9495, |
| "mean_token_accuracy": 0.7810263633728027, |
| "step": 159 |
| }, |
| { |
| "epoch": 2.5, |
| "grad_norm": 27.75, |
| "learning_rate": 1.5480769230769232e-06, |
| "loss": 1.0118, |
| "mean_token_accuracy": 0.7756314873695374, |
| "step": 160 |
| }, |
| { |
| "epoch": 2.515625, |
| "grad_norm": 6.1875, |
| "learning_rate": 1.5384615384615383e-06, |
| "loss": 0.8795, |
| "mean_token_accuracy": 0.8030520677566528, |
| "step": 161 |
| }, |
| { |
| "epoch": 2.53125, |
| "grad_norm": 10.75, |
| "learning_rate": 1.5288461538461538e-06, |
| "loss": 0.993, |
| "mean_token_accuracy": 0.7751861214637756, |
| "step": 162 |
| }, |
| { |
| "epoch": 2.546875, |
| "grad_norm": 6.34375, |
| "learning_rate": 1.5192307692307692e-06, |
| "loss": 0.9337, |
| "mean_token_accuracy": 0.7878151535987854, |
| "step": 163 |
| }, |
| { |
| "epoch": 2.5625, |
| "grad_norm": 6.4375, |
| "learning_rate": 1.5096153846153847e-06, |
| "loss": 0.9229, |
| "mean_token_accuracy": 0.7985841631889343, |
| "step": 164 |
| }, |
| { |
| "epoch": 2.578125, |
| "grad_norm": 6.03125, |
| "learning_rate": 1.5e-06, |
| "loss": 0.9183, |
| "mean_token_accuracy": 0.7949057221412659, |
| "step": 165 |
| }, |
| { |
| "epoch": 2.59375, |
| "grad_norm": 8.1875, |
| "learning_rate": 1.4903846153846154e-06, |
| "loss": 0.9715, |
| "mean_token_accuracy": 0.7781420946121216, |
| "step": 166 |
| }, |
| { |
| "epoch": 2.609375, |
| "grad_norm": 5.90625, |
| "learning_rate": 1.480769230769231e-06, |
| "loss": 0.9473, |
| "mean_token_accuracy": 0.7875062227249146, |
| "step": 167 |
| }, |
| { |
| "epoch": 2.625, |
| "grad_norm": 5.75, |
| "learning_rate": 1.471153846153846e-06, |
| "loss": 0.9271, |
| "mean_token_accuracy": 0.7928742170333862, |
| "step": 168 |
| }, |
| { |
| "epoch": 2.640625, |
| "grad_norm": 13.25, |
| "learning_rate": 1.4615384615384616e-06, |
| "loss": 0.9739, |
| "mean_token_accuracy": 0.7802754640579224, |
| "step": 169 |
| }, |
| { |
| "epoch": 2.65625, |
| "grad_norm": 6.34375, |
| "learning_rate": 1.451923076923077e-06, |
| "loss": 0.9205, |
| "mean_token_accuracy": 0.7956867218017578, |
| "step": 170 |
| }, |
| { |
| "epoch": 2.671875, |
| "grad_norm": 20.125, |
| "learning_rate": 1.4423076923076924e-06, |
| "loss": 0.9228, |
| "mean_token_accuracy": 0.7963763475418091, |
| "step": 171 |
| }, |
| { |
| "epoch": 2.6875, |
| "grad_norm": 7.8125, |
| "learning_rate": 1.4326923076923078e-06, |
| "loss": 0.9705, |
| "mean_token_accuracy": 0.7838233113288879, |
| "step": 172 |
| }, |
| { |
| "epoch": 2.703125, |
| "grad_norm": 7.625, |
| "learning_rate": 1.423076923076923e-06, |
| "loss": 0.9856, |
| "mean_token_accuracy": 0.777830958366394, |
| "step": 173 |
| }, |
| { |
| "epoch": 2.71875, |
| "grad_norm": 5.90625, |
| "learning_rate": 1.4134615384615384e-06, |
| "loss": 0.9123, |
| "mean_token_accuracy": 0.7965753674507141, |
| "step": 174 |
| }, |
| { |
| "epoch": 2.734375, |
| "grad_norm": 6.59375, |
| "learning_rate": 1.403846153846154e-06, |
| "loss": 0.8977, |
| "mean_token_accuracy": 0.7950465679168701, |
| "step": 175 |
| }, |
| { |
| "epoch": 2.75, |
| "grad_norm": 5.96875, |
| "learning_rate": 1.3942307692307693e-06, |
| "loss": 0.9534, |
| "mean_token_accuracy": 0.7896023392677307, |
| "step": 176 |
| }, |
| { |
| "epoch": 2.765625, |
| "grad_norm": 51.5, |
| "learning_rate": 1.3846153846153846e-06, |
| "loss": 0.9886, |
| "mean_token_accuracy": 0.7733170390129089, |
| "step": 177 |
| }, |
| { |
| "epoch": 2.78125, |
| "grad_norm": 6.25, |
| "learning_rate": 1.375e-06, |
| "loss": 0.9304, |
| "mean_token_accuracy": 0.7874513864517212, |
| "step": 178 |
| }, |
| { |
| "epoch": 2.796875, |
| "grad_norm": 9.1875, |
| "learning_rate": 1.3653846153846153e-06, |
| "loss": 0.9909, |
| "mean_token_accuracy": 0.7773162722587585, |
| "step": 179 |
| }, |
| { |
| "epoch": 2.8125, |
| "grad_norm": 17.375, |
| "learning_rate": 1.3557692307692308e-06, |
| "loss": 0.9561, |
| "mean_token_accuracy": 0.7817555665969849, |
| "step": 180 |
| }, |
| { |
| "epoch": 2.828125, |
| "grad_norm": 6.15625, |
| "learning_rate": 1.3461538461538462e-06, |
| "loss": 0.9449, |
| "mean_token_accuracy": 0.7880972623825073, |
| "step": 181 |
| }, |
| { |
| "epoch": 2.84375, |
| "grad_norm": 6.375, |
| "learning_rate": 1.3365384615384617e-06, |
| "loss": 0.9588, |
| "mean_token_accuracy": 0.7854828238487244, |
| "step": 182 |
| }, |
| { |
| "epoch": 2.859375, |
| "grad_norm": 33.75, |
| "learning_rate": 1.3269230769230768e-06, |
| "loss": 0.9635, |
| "mean_token_accuracy": 0.7801363468170166, |
| "step": 183 |
| }, |
| { |
| "epoch": 2.875, |
| "grad_norm": 6.53125, |
| "learning_rate": 1.3173076923076924e-06, |
| "loss": 0.9914, |
| "mean_token_accuracy": 0.7746433615684509, |
| "step": 184 |
| }, |
| { |
| "epoch": 2.890625, |
| "grad_norm": 6.4375, |
| "learning_rate": 1.3076923076923077e-06, |
| "loss": 0.9492, |
| "mean_token_accuracy": 0.7791839838027954, |
| "step": 185 |
| }, |
| { |
| "epoch": 2.90625, |
| "grad_norm": 6.65625, |
| "learning_rate": 1.2980769230769232e-06, |
| "loss": 0.9667, |
| "mean_token_accuracy": 0.7795350551605225, |
| "step": 186 |
| }, |
| { |
| "epoch": 2.921875, |
| "grad_norm": 10.0625, |
| "learning_rate": 1.2884615384615384e-06, |
| "loss": 0.9588, |
| "mean_token_accuracy": 0.7847999930381775, |
| "step": 187 |
| }, |
| { |
| "epoch": 2.9375, |
| "grad_norm": 25.375, |
| "learning_rate": 1.278846153846154e-06, |
| "loss": 0.8759, |
| "mean_token_accuracy": 0.8001649975776672, |
| "step": 188 |
| }, |
| { |
| "epoch": 2.953125, |
| "grad_norm": 8.5625, |
| "learning_rate": 1.2692307692307692e-06, |
| "loss": 0.9675, |
| "mean_token_accuracy": 0.7719402313232422, |
| "step": 189 |
| }, |
| { |
| "epoch": 2.96875, |
| "grad_norm": 6.9375, |
| "learning_rate": 1.2596153846153848e-06, |
| "loss": 0.9304, |
| "mean_token_accuracy": 0.7856206893920898, |
| "step": 190 |
| }, |
| { |
| "epoch": 2.984375, |
| "grad_norm": 6.1875, |
| "learning_rate": 1.25e-06, |
| "loss": 0.9542, |
| "mean_token_accuracy": 0.78487229347229, |
| "step": 191 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 10.8125, |
| "learning_rate": 1.2403846153846154e-06, |
| "loss": 0.9371, |
| "mean_token_accuracy": 0.7693274617195129, |
| "step": 192 |
| }, |
| { |
| "epoch": 3.015625, |
| "grad_norm": 6.25, |
| "learning_rate": 1.2307692307692308e-06, |
| "loss": 0.892, |
| "mean_token_accuracy": 0.7988702654838562, |
| "step": 193 |
| }, |
| { |
| "epoch": 3.03125, |
| "grad_norm": 18.375, |
| "learning_rate": 1.221153846153846e-06, |
| "loss": 0.9524, |
| "mean_token_accuracy": 0.782721221446991, |
| "step": 194 |
| }, |
| { |
| "epoch": 3.046875, |
| "grad_norm": 15.25, |
| "learning_rate": 1.2115384615384616e-06, |
| "loss": 0.9835, |
| "mean_token_accuracy": 0.7716888189315796, |
| "step": 195 |
| }, |
| { |
| "epoch": 3.0625, |
| "grad_norm": 6.03125, |
| "learning_rate": 1.201923076923077e-06, |
| "loss": 0.8871, |
| "mean_token_accuracy": 0.8021034002304077, |
| "step": 196 |
| }, |
| { |
| "epoch": 3.078125, |
| "grad_norm": 35.0, |
| "learning_rate": 1.1923076923076923e-06, |
| "loss": 0.9402, |
| "mean_token_accuracy": 0.7832167744636536, |
| "step": 197 |
| }, |
| { |
| "epoch": 3.09375, |
| "grad_norm": 8.75, |
| "learning_rate": 1.1826923076923076e-06, |
| "loss": 0.991, |
| "mean_token_accuracy": 0.7750375866889954, |
| "step": 198 |
| }, |
| { |
| "epoch": 3.109375, |
| "grad_norm": 20.25, |
| "learning_rate": 1.1730769230769232e-06, |
| "loss": 0.906, |
| "mean_token_accuracy": 0.7941325902938843, |
| "step": 199 |
| }, |
| { |
| "epoch": 3.125, |
| "grad_norm": 6.15625, |
| "learning_rate": 1.1634615384615385e-06, |
| "loss": 0.9131, |
| "mean_token_accuracy": 0.7984575629234314, |
| "step": 200 |
| }, |
| { |
| "epoch": 3.140625, |
| "grad_norm": 6.3125, |
| "learning_rate": 1.153846153846154e-06, |
| "loss": 0.823, |
| "mean_token_accuracy": 0.8103417158126831, |
| "step": 201 |
| }, |
| { |
| "epoch": 3.15625, |
| "grad_norm": 7.0, |
| "learning_rate": 1.1442307692307692e-06, |
| "loss": 0.9495, |
| "mean_token_accuracy": 0.7842703461647034, |
| "step": 202 |
| }, |
| { |
| "epoch": 3.171875, |
| "grad_norm": 7.4375, |
| "learning_rate": 1.1346153846153847e-06, |
| "loss": 0.9537, |
| "mean_token_accuracy": 0.7833541035652161, |
| "step": 203 |
| }, |
| { |
| "epoch": 3.1875, |
| "grad_norm": 8.8125, |
| "learning_rate": 1.125e-06, |
| "loss": 0.9689, |
| "mean_token_accuracy": 0.7763713002204895, |
| "step": 204 |
| }, |
| { |
| "epoch": 3.203125, |
| "grad_norm": 23.375, |
| "learning_rate": 1.1153846153846154e-06, |
| "loss": 0.9487, |
| "mean_token_accuracy": 0.7811124920845032, |
| "step": 205 |
| }, |
| { |
| "epoch": 3.21875, |
| "grad_norm": 6.0625, |
| "learning_rate": 1.105769230769231e-06, |
| "loss": 0.8658, |
| "mean_token_accuracy": 0.8113903999328613, |
| "step": 206 |
| }, |
| { |
| "epoch": 3.234375, |
| "grad_norm": 11.625, |
| "learning_rate": 1.096153846153846e-06, |
| "loss": 0.9577, |
| "mean_token_accuracy": 0.7787481546401978, |
| "step": 207 |
| }, |
| { |
| "epoch": 3.25, |
| "grad_norm": 6.4375, |
| "learning_rate": 1.0865384615384616e-06, |
| "loss": 0.9042, |
| "mean_token_accuracy": 0.7923972010612488, |
| "step": 208 |
| }, |
| { |
| "epoch": 3.265625, |
| "grad_norm": 6.6875, |
| "learning_rate": 1.0769230769230769e-06, |
| "loss": 0.9379, |
| "mean_token_accuracy": 0.7835002541542053, |
| "step": 209 |
| }, |
| { |
| "epoch": 3.28125, |
| "grad_norm": 5.9375, |
| "learning_rate": 1.0673076923076924e-06, |
| "loss": 0.9172, |
| "mean_token_accuracy": 0.7931802868843079, |
| "step": 210 |
| }, |
| { |
| "epoch": 3.296875, |
| "grad_norm": 7.875, |
| "learning_rate": 1.0576923076923078e-06, |
| "loss": 0.9593, |
| "mean_token_accuracy": 0.7814356684684753, |
| "step": 211 |
| }, |
| { |
| "epoch": 3.3125, |
| "grad_norm": 30.125, |
| "learning_rate": 1.048076923076923e-06, |
| "loss": 0.9164, |
| "mean_token_accuracy": 0.7888871431350708, |
| "step": 212 |
| }, |
| { |
| "epoch": 3.328125, |
| "grad_norm": 6.28125, |
| "learning_rate": 1.0384615384615384e-06, |
| "loss": 0.9056, |
| "mean_token_accuracy": 0.7965211868286133, |
| "step": 213 |
| }, |
| { |
| "epoch": 3.34375, |
| "grad_norm": 6.53125, |
| "learning_rate": 1.028846153846154e-06, |
| "loss": 0.948, |
| "mean_token_accuracy": 0.7837017178535461, |
| "step": 214 |
| }, |
| { |
| "epoch": 3.359375, |
| "grad_norm": 6.3125, |
| "learning_rate": 1.0192307692307693e-06, |
| "loss": 0.902, |
| "mean_token_accuracy": 0.8006668090820312, |
| "step": 215 |
| }, |
| { |
| "epoch": 3.375, |
| "grad_norm": 5.75, |
| "learning_rate": 1.0096153846153846e-06, |
| "loss": 0.8923, |
| "mean_token_accuracy": 0.7972199320793152, |
| "step": 216 |
| }, |
| { |
| "epoch": 3.390625, |
| "grad_norm": 6.1875, |
| "learning_rate": 1e-06, |
| "loss": 0.8828, |
| "mean_token_accuracy": 0.8011194467544556, |
| "step": 217 |
| }, |
| { |
| "epoch": 3.40625, |
| "grad_norm": 6.28125, |
| "learning_rate": 9.903846153846153e-07, |
| "loss": 0.8411, |
| "mean_token_accuracy": 0.8033494353294373, |
| "step": 218 |
| }, |
| { |
| "epoch": 3.421875, |
| "grad_norm": 33.75, |
| "learning_rate": 9.807692307692308e-07, |
| "loss": 0.916, |
| "mean_token_accuracy": 0.7819077372550964, |
| "step": 219 |
| }, |
| { |
| "epoch": 3.4375, |
| "grad_norm": 6.40625, |
| "learning_rate": 9.711538461538462e-07, |
| "loss": 0.9401, |
| "mean_token_accuracy": 0.7851645946502686, |
| "step": 220 |
| }, |
| { |
| "epoch": 3.453125, |
| "grad_norm": 6.15625, |
| "learning_rate": 9.615384615384617e-07, |
| "loss": 0.8603, |
| "mean_token_accuracy": 0.8017836213111877, |
| "step": 221 |
| }, |
| { |
| "epoch": 3.46875, |
| "grad_norm": 7.5, |
| "learning_rate": 9.519230769230769e-07, |
| "loss": 0.9541, |
| "mean_token_accuracy": 0.7865185141563416, |
| "step": 222 |
| }, |
| { |
| "epoch": 3.484375, |
| "grad_norm": 8.625, |
| "learning_rate": 9.423076923076924e-07, |
| "loss": 0.9576, |
| "mean_token_accuracy": 0.774846613407135, |
| "step": 223 |
| }, |
| { |
| "epoch": 3.5, |
| "grad_norm": 15.25, |
| "learning_rate": 9.326923076923077e-07, |
| "loss": 0.9012, |
| "mean_token_accuracy": 0.7876802086830139, |
| "step": 224 |
| }, |
| { |
| "epoch": 3.515625, |
| "grad_norm": 25.5, |
| "learning_rate": 9.230769230769231e-07, |
| "loss": 0.9489, |
| "mean_token_accuracy": 0.7805652618408203, |
| "step": 225 |
| }, |
| { |
| "epoch": 3.53125, |
| "grad_norm": 6.28125, |
| "learning_rate": 9.134615384615385e-07, |
| "loss": 0.874, |
| "mean_token_accuracy": 0.7986671328544617, |
| "step": 226 |
| }, |
| { |
| "epoch": 3.546875, |
| "grad_norm": 23.5, |
| "learning_rate": 9.038461538461538e-07, |
| "loss": 0.966, |
| "mean_token_accuracy": 0.7745603322982788, |
| "step": 227 |
| }, |
| { |
| "epoch": 3.5625, |
| "grad_norm": 6.46875, |
| "learning_rate": 8.942307692307692e-07, |
| "loss": 0.8936, |
| "mean_token_accuracy": 0.7917812466621399, |
| "step": 228 |
| }, |
| { |
| "epoch": 3.578125, |
| "grad_norm": 10.875, |
| "learning_rate": 8.846153846153847e-07, |
| "loss": 0.9755, |
| "mean_token_accuracy": 0.7724282145500183, |
| "step": 229 |
| }, |
| { |
| "epoch": 3.59375, |
| "grad_norm": 21.875, |
| "learning_rate": 8.750000000000001e-07, |
| "loss": 0.9574, |
| "mean_token_accuracy": 0.7761261463165283, |
| "step": 230 |
| }, |
| { |
| "epoch": 3.609375, |
| "grad_norm": 20.625, |
| "learning_rate": 8.653846153846153e-07, |
| "loss": 0.9784, |
| "mean_token_accuracy": 0.7752029299736023, |
| "step": 231 |
| }, |
| { |
| "epoch": 3.625, |
| "grad_norm": 6.25, |
| "learning_rate": 8.557692307692308e-07, |
| "loss": 0.8936, |
| "mean_token_accuracy": 0.796856701374054, |
| "step": 232 |
| }, |
| { |
| "epoch": 3.640625, |
| "grad_norm": 6.21875, |
| "learning_rate": 8.461538461538462e-07, |
| "loss": 0.9131, |
| "mean_token_accuracy": 0.7863614559173584, |
| "step": 233 |
| }, |
| { |
| "epoch": 3.65625, |
| "grad_norm": 5.9375, |
| "learning_rate": 8.365384615384616e-07, |
| "loss": 0.8746, |
| "mean_token_accuracy": 0.7959774732589722, |
| "step": 234 |
| }, |
| { |
| "epoch": 3.671875, |
| "grad_norm": 24.625, |
| "learning_rate": 8.26923076923077e-07, |
| "loss": 0.9224, |
| "mean_token_accuracy": 0.7824280858039856, |
| "step": 235 |
| }, |
| { |
| "epoch": 3.6875, |
| "grad_norm": 27.125, |
| "learning_rate": 8.173076923076923e-07, |
| "loss": 0.9327, |
| "mean_token_accuracy": 0.7815178632736206, |
| "step": 236 |
| }, |
| { |
| "epoch": 3.703125, |
| "grad_norm": 6.09375, |
| "learning_rate": 8.076923076923077e-07, |
| "loss": 0.9501, |
| "mean_token_accuracy": 0.7856559157371521, |
| "step": 237 |
| }, |
| { |
| "epoch": 3.71875, |
| "grad_norm": 20.25, |
| "learning_rate": 7.98076923076923e-07, |
| "loss": 0.8748, |
| "mean_token_accuracy": 0.7932746410369873, |
| "step": 238 |
| }, |
| { |
| "epoch": 3.734375, |
| "grad_norm": 6.625, |
| "learning_rate": 7.884615384615385e-07, |
| "loss": 0.9192, |
| "mean_token_accuracy": 0.7865205407142639, |
| "step": 239 |
| }, |
| { |
| "epoch": 3.75, |
| "grad_norm": 6.4375, |
| "learning_rate": 7.788461538461539e-07, |
| "loss": 0.9443, |
| "mean_token_accuracy": 0.7778134346008301, |
| "step": 240 |
| }, |
| { |
| "epoch": 3.765625, |
| "grad_norm": 6.0, |
| "learning_rate": 7.692307692307691e-07, |
| "loss": 0.9398, |
| "mean_token_accuracy": 0.7858214974403381, |
| "step": 241 |
| }, |
| { |
| "epoch": 3.78125, |
| "grad_norm": 5.9375, |
| "learning_rate": 7.596153846153846e-07, |
| "loss": 0.873, |
| "mean_token_accuracy": 0.7989254593849182, |
| "step": 242 |
| }, |
| { |
| "epoch": 3.796875, |
| "grad_norm": 6.21875, |
| "learning_rate": 7.5e-07, |
| "loss": 0.9991, |
| "mean_token_accuracy": 0.7662928104400635, |
| "step": 243 |
| }, |
| { |
| "epoch": 3.8125, |
| "grad_norm": 11.75, |
| "learning_rate": 7.403846153846155e-07, |
| "loss": 0.8945, |
| "mean_token_accuracy": 0.7896128296852112, |
| "step": 244 |
| }, |
| { |
| "epoch": 3.828125, |
| "grad_norm": 6.375, |
| "learning_rate": 7.307692307692308e-07, |
| "loss": 0.917, |
| "mean_token_accuracy": 0.7882217764854431, |
| "step": 245 |
| }, |
| { |
| "epoch": 3.84375, |
| "grad_norm": 6.28125, |
| "learning_rate": 7.211538461538462e-07, |
| "loss": 0.9661, |
| "mean_token_accuracy": 0.7709052562713623, |
| "step": 246 |
| }, |
| { |
| "epoch": 3.859375, |
| "grad_norm": 6.125, |
| "learning_rate": 7.115384615384616e-07, |
| "loss": 0.9326, |
| "mean_token_accuracy": 0.7830464243888855, |
| "step": 247 |
| }, |
| { |
| "epoch": 3.875, |
| "grad_norm": 8.5, |
| "learning_rate": 7.01923076923077e-07, |
| "loss": 0.9376, |
| "mean_token_accuracy": 0.7810230255126953, |
| "step": 248 |
| }, |
| { |
| "epoch": 3.890625, |
| "grad_norm": 14.0, |
| "learning_rate": 6.923076923076923e-07, |
| "loss": 0.9734, |
| "mean_token_accuracy": 0.7753646373748779, |
| "step": 249 |
| }, |
| { |
| "epoch": 3.90625, |
| "grad_norm": 12.8125, |
| "learning_rate": 6.826923076923076e-07, |
| "loss": 0.8922, |
| "mean_token_accuracy": 0.7958292961120605, |
| "step": 250 |
| }, |
| { |
| "epoch": 3.921875, |
| "grad_norm": 5.71875, |
| "learning_rate": 6.730769230769231e-07, |
| "loss": 0.9053, |
| "mean_token_accuracy": 0.7927485704421997, |
| "step": 251 |
| }, |
| { |
| "epoch": 3.9375, |
| "grad_norm": 6.21875, |
| "learning_rate": 6.634615384615384e-07, |
| "loss": 0.8707, |
| "mean_token_accuracy": 0.8024818897247314, |
| "step": 252 |
| }, |
| { |
| "epoch": 3.953125, |
| "grad_norm": 6.09375, |
| "learning_rate": 6.538461538461538e-07, |
| "loss": 0.8756, |
| "mean_token_accuracy": 0.7953294515609741, |
| "step": 253 |
| }, |
| { |
| "epoch": 3.96875, |
| "grad_norm": 16.75, |
| "learning_rate": 6.442307692307692e-07, |
| "loss": 0.9806, |
| "mean_token_accuracy": 0.7700640559196472, |
| "step": 254 |
| }, |
| { |
| "epoch": 3.984375, |
| "grad_norm": 6.15625, |
| "learning_rate": 6.346153846153846e-07, |
| "loss": 0.9052, |
| "mean_token_accuracy": 0.7910767197608948, |
| "step": 255 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 6.1875, |
| "learning_rate": 6.25e-07, |
| "loss": 0.8756, |
| "mean_token_accuracy": 0.7918968796730042, |
| "step": 256 |
| }, |
| { |
| "epoch": 4.015625, |
| "grad_norm": 6.15625, |
| "learning_rate": 6.153846153846154e-07, |
| "loss": 0.8858, |
| "mean_token_accuracy": 0.7959042191505432, |
| "step": 257 |
| }, |
| { |
| "epoch": 4.03125, |
| "grad_norm": 6.8125, |
| "learning_rate": 6.057692307692308e-07, |
| "loss": 0.9302, |
| "mean_token_accuracy": 0.7876441478729248, |
| "step": 258 |
| }, |
| { |
| "epoch": 4.046875, |
| "grad_norm": 7.96875, |
| "learning_rate": 5.961538461538461e-07, |
| "loss": 0.9649, |
| "mean_token_accuracy": 0.7740775346755981, |
| "step": 259 |
| }, |
| { |
| "epoch": 4.0625, |
| "grad_norm": 19.125, |
| "learning_rate": 5.865384615384616e-07, |
| "loss": 0.9184, |
| "mean_token_accuracy": 0.7818529605865479, |
| "step": 260 |
| }, |
| { |
| "epoch": 4.078125, |
| "grad_norm": 11.375, |
| "learning_rate": 5.76923076923077e-07, |
| "loss": 0.912, |
| "mean_token_accuracy": 0.7843265533447266, |
| "step": 261 |
| }, |
| { |
| "epoch": 4.09375, |
| "grad_norm": 6.84375, |
| "learning_rate": 5.673076923076923e-07, |
| "loss": 0.9025, |
| "mean_token_accuracy": 0.7933535575866699, |
| "step": 262 |
| }, |
| { |
| "epoch": 4.109375, |
| "grad_norm": 6.1875, |
| "learning_rate": 5.576923076923077e-07, |
| "loss": 0.9127, |
| "mean_token_accuracy": 0.7852448225021362, |
| "step": 263 |
| }, |
| { |
| "epoch": 4.125, |
| "grad_norm": 6.03125, |
| "learning_rate": 5.48076923076923e-07, |
| "loss": 0.9111, |
| "mean_token_accuracy": 0.7866109013557434, |
| "step": 264 |
| }, |
| { |
| "epoch": 4.140625, |
| "grad_norm": 6.21875, |
| "learning_rate": 5.384615384615384e-07, |
| "loss": 0.892, |
| "mean_token_accuracy": 0.7890470027923584, |
| "step": 265 |
| }, |
| { |
| "epoch": 4.15625, |
| "grad_norm": 12.0625, |
| "learning_rate": 5.288461538461539e-07, |
| "loss": 0.8986, |
| "mean_token_accuracy": 0.7962346076965332, |
| "step": 266 |
| }, |
| { |
| "epoch": 4.171875, |
| "grad_norm": 29.0, |
| "learning_rate": 5.192307692307692e-07, |
| "loss": 0.9209, |
| "mean_token_accuracy": 0.786607563495636, |
| "step": 267 |
| }, |
| { |
| "epoch": 4.1875, |
| "grad_norm": 6.25, |
| "learning_rate": 5.096153846153846e-07, |
| "loss": 0.904, |
| "mean_token_accuracy": 0.7876223921775818, |
| "step": 268 |
| }, |
| { |
| "epoch": 4.203125, |
| "grad_norm": 7.96875, |
| "learning_rate": 5e-07, |
| "loss": 0.9383, |
| "mean_token_accuracy": 0.7814289927482605, |
| "step": 269 |
| }, |
| { |
| "epoch": 4.21875, |
| "grad_norm": 6.15625, |
| "learning_rate": 4.903846153846154e-07, |
| "loss": 0.9128, |
| "mean_token_accuracy": 0.7915287613868713, |
| "step": 270 |
| }, |
| { |
| "epoch": 4.234375, |
| "grad_norm": 6.78125, |
| "learning_rate": 4.807692307692308e-07, |
| "loss": 0.9481, |
| "mean_token_accuracy": 0.7819157242774963, |
| "step": 271 |
| }, |
| { |
| "epoch": 4.25, |
| "grad_norm": 6.25, |
| "learning_rate": 4.711538461538462e-07, |
| "loss": 0.8902, |
| "mean_token_accuracy": 0.7958080768585205, |
| "step": 272 |
| }, |
| { |
| "epoch": 4.265625, |
| "grad_norm": 7.65625, |
| "learning_rate": 4.6153846153846156e-07, |
| "loss": 0.977, |
| "mean_token_accuracy": 0.7705891132354736, |
| "step": 273 |
| }, |
| { |
| "epoch": 4.28125, |
| "grad_norm": 18.125, |
| "learning_rate": 4.519230769230769e-07, |
| "loss": 0.9237, |
| "mean_token_accuracy": 0.7899967432022095, |
| "step": 274 |
| }, |
| { |
| "epoch": 4.296875, |
| "grad_norm": 8.0, |
| "learning_rate": 4.4230769230769233e-07, |
| "loss": 0.8679, |
| "mean_token_accuracy": 0.7969164252281189, |
| "step": 275 |
| }, |
| { |
| "epoch": 4.3125, |
| "grad_norm": 6.03125, |
| "learning_rate": 4.3269230769230766e-07, |
| "loss": 0.8823, |
| "mean_token_accuracy": 0.7964279651641846, |
| "step": 276 |
| }, |
| { |
| "epoch": 4.328125, |
| "grad_norm": 9.6875, |
| "learning_rate": 4.230769230769231e-07, |
| "loss": 0.8829, |
| "mean_token_accuracy": 0.7917771935462952, |
| "step": 277 |
| }, |
| { |
| "epoch": 4.34375, |
| "grad_norm": 5.9375, |
| "learning_rate": 4.134615384615385e-07, |
| "loss": 0.8448, |
| "mean_token_accuracy": 0.80218505859375, |
| "step": 278 |
| }, |
| { |
| "epoch": 4.359375, |
| "grad_norm": 23.875, |
| "learning_rate": 4.0384615384615386e-07, |
| "loss": 0.9189, |
| "mean_token_accuracy": 0.7808871865272522, |
| "step": 279 |
| }, |
| { |
| "epoch": 4.375, |
| "grad_norm": 11.9375, |
| "learning_rate": 3.9423076923076924e-07, |
| "loss": 0.9288, |
| "mean_token_accuracy": 0.7887097001075745, |
| "step": 280 |
| }, |
| { |
| "epoch": 4.390625, |
| "grad_norm": 6.0625, |
| "learning_rate": 3.846153846153846e-07, |
| "loss": 0.9306, |
| "mean_token_accuracy": 0.7884582877159119, |
| "step": 281 |
| }, |
| { |
| "epoch": 4.40625, |
| "grad_norm": 6.40625, |
| "learning_rate": 3.75e-07, |
| "loss": 0.9234, |
| "mean_token_accuracy": 0.7834262847900391, |
| "step": 282 |
| }, |
| { |
| "epoch": 4.421875, |
| "grad_norm": 6.1875, |
| "learning_rate": 3.653846153846154e-07, |
| "loss": 0.8417, |
| "mean_token_accuracy": 0.8057113885879517, |
| "step": 283 |
| }, |
| { |
| "epoch": 4.4375, |
| "grad_norm": 6.59375, |
| "learning_rate": 3.557692307692308e-07, |
| "loss": 0.9138, |
| "mean_token_accuracy": 0.7837575078010559, |
| "step": 284 |
| }, |
| { |
| "epoch": 4.453125, |
| "grad_norm": 33.25, |
| "learning_rate": 3.4615384615384616e-07, |
| "loss": 0.8612, |
| "mean_token_accuracy": 0.7957943081855774, |
| "step": 285 |
| }, |
| { |
| "epoch": 4.46875, |
| "grad_norm": 5.71875, |
| "learning_rate": 3.3653846153846154e-07, |
| "loss": 0.8749, |
| "mean_token_accuracy": 0.8011859059333801, |
| "step": 286 |
| }, |
| { |
| "epoch": 4.484375, |
| "grad_norm": 6.5, |
| "learning_rate": 3.269230769230769e-07, |
| "loss": 0.8933, |
| "mean_token_accuracy": 0.7931327819824219, |
| "step": 287 |
| }, |
| { |
| "epoch": 4.5, |
| "grad_norm": 5.96875, |
| "learning_rate": 3.173076923076923e-07, |
| "loss": 0.873, |
| "mean_token_accuracy": 0.799176812171936, |
| "step": 288 |
| }, |
| { |
| "epoch": 4.515625, |
| "grad_norm": 29.25, |
| "learning_rate": 3.076923076923077e-07, |
| "loss": 0.9831, |
| "mean_token_accuracy": 0.7707536816596985, |
| "step": 289 |
| }, |
| { |
| "epoch": 4.53125, |
| "grad_norm": 13.0, |
| "learning_rate": 2.980769230769231e-07, |
| "loss": 0.9484, |
| "mean_token_accuracy": 0.7785703539848328, |
| "step": 290 |
| }, |
| { |
| "epoch": 4.546875, |
| "grad_norm": 6.875, |
| "learning_rate": 2.884615384615385e-07, |
| "loss": 0.9039, |
| "mean_token_accuracy": 0.7895255088806152, |
| "step": 291 |
| }, |
| { |
| "epoch": 4.5625, |
| "grad_norm": 6.625, |
| "learning_rate": 2.7884615384615384e-07, |
| "loss": 0.9063, |
| "mean_token_accuracy": 0.7905294299125671, |
| "step": 292 |
| }, |
| { |
| "epoch": 4.578125, |
| "grad_norm": 7.5, |
| "learning_rate": 2.692307692307692e-07, |
| "loss": 0.8963, |
| "mean_token_accuracy": 0.7899447083473206, |
| "step": 293 |
| }, |
| { |
| "epoch": 4.59375, |
| "grad_norm": 6.84375, |
| "learning_rate": 2.596153846153846e-07, |
| "loss": 0.94, |
| "mean_token_accuracy": 0.7839446663856506, |
| "step": 294 |
| }, |
| { |
| "epoch": 4.609375, |
| "grad_norm": 5.875, |
| "learning_rate": 2.5e-07, |
| "loss": 0.8796, |
| "mean_token_accuracy": 0.7935015559196472, |
| "step": 295 |
| }, |
| { |
| "epoch": 4.625, |
| "grad_norm": 10.125, |
| "learning_rate": 2.403846153846154e-07, |
| "loss": 0.8859, |
| "mean_token_accuracy": 0.789797842502594, |
| "step": 296 |
| }, |
| { |
| "epoch": 4.640625, |
| "grad_norm": 6.5, |
| "learning_rate": 2.3076923076923078e-07, |
| "loss": 0.9302, |
| "mean_token_accuracy": 0.7839468121528625, |
| "step": 297 |
| }, |
| { |
| "epoch": 4.65625, |
| "grad_norm": 6.28125, |
| "learning_rate": 2.2115384615384616e-07, |
| "loss": 0.9524, |
| "mean_token_accuracy": 0.7737887501716614, |
| "step": 298 |
| }, |
| { |
| "epoch": 4.671875, |
| "grad_norm": 8.125, |
| "learning_rate": 2.1153846153846155e-07, |
| "loss": 0.9181, |
| "mean_token_accuracy": 0.7865311503410339, |
| "step": 299 |
| }, |
| { |
| "epoch": 4.6875, |
| "grad_norm": 6.8125, |
| "learning_rate": 2.0192307692307693e-07, |
| "loss": 0.8934, |
| "mean_token_accuracy": 0.789657473564148, |
| "step": 300 |
| }, |
| { |
| "epoch": 4.703125, |
| "grad_norm": 6.4375, |
| "learning_rate": 1.923076923076923e-07, |
| "loss": 0.9065, |
| "mean_token_accuracy": 0.789890468120575, |
| "step": 301 |
| }, |
| { |
| "epoch": 4.71875, |
| "grad_norm": 6.0625, |
| "learning_rate": 1.826923076923077e-07, |
| "loss": 0.8889, |
| "mean_token_accuracy": 0.7914140820503235, |
| "step": 302 |
| }, |
| { |
| "epoch": 4.734375, |
| "grad_norm": 14.0, |
| "learning_rate": 1.7307692307692308e-07, |
| "loss": 0.9113, |
| "mean_token_accuracy": 0.7846829891204834, |
| "step": 303 |
| }, |
| { |
| "epoch": 4.75, |
| "grad_norm": 7.34375, |
| "learning_rate": 1.6346153846153846e-07, |
| "loss": 0.9374, |
| "mean_token_accuracy": 0.7835116386413574, |
| "step": 304 |
| }, |
| { |
| "epoch": 4.765625, |
| "grad_norm": 11.375, |
| "learning_rate": 1.5384615384615385e-07, |
| "loss": 0.8824, |
| "mean_token_accuracy": 0.7850437760353088, |
| "step": 305 |
| }, |
| { |
| "epoch": 4.78125, |
| "grad_norm": 7.1875, |
| "learning_rate": 1.4423076923076925e-07, |
| "loss": 0.9408, |
| "mean_token_accuracy": 0.7855393886566162, |
| "step": 306 |
| }, |
| { |
| "epoch": 4.796875, |
| "grad_norm": 8.3125, |
| "learning_rate": 1.346153846153846e-07, |
| "loss": 0.9047, |
| "mean_token_accuracy": 0.7834271788597107, |
| "step": 307 |
| }, |
| { |
| "epoch": 4.8125, |
| "grad_norm": 8.0625, |
| "learning_rate": 1.25e-07, |
| "loss": 0.9188, |
| "mean_token_accuracy": 0.7803459167480469, |
| "step": 308 |
| }, |
| { |
| "epoch": 4.828125, |
| "grad_norm": 20.25, |
| "learning_rate": 1.1538461538461539e-07, |
| "loss": 0.9627, |
| "mean_token_accuracy": 0.7762289047241211, |
| "step": 309 |
| }, |
| { |
| "epoch": 4.84375, |
| "grad_norm": 10.6875, |
| "learning_rate": 1.0576923076923077e-07, |
| "loss": 0.884, |
| "mean_token_accuracy": 0.7895846962928772, |
| "step": 310 |
| }, |
| { |
| "epoch": 4.859375, |
| "grad_norm": 6.53125, |
| "learning_rate": 9.615384615384614e-08, |
| "loss": 0.894, |
| "mean_token_accuracy": 0.792397677898407, |
| "step": 311 |
| }, |
| { |
| "epoch": 4.875, |
| "grad_norm": 10.6875, |
| "learning_rate": 8.653846153846154e-08, |
| "loss": 0.9112, |
| "mean_token_accuracy": 0.786827564239502, |
| "step": 312 |
| }, |
| { |
| "epoch": 4.890625, |
| "grad_norm": 6.6875, |
| "learning_rate": 7.692307692307692e-08, |
| "loss": 0.9368, |
| "mean_token_accuracy": 0.7861944437026978, |
| "step": 313 |
| }, |
| { |
| "epoch": 4.90625, |
| "grad_norm": 10.625, |
| "learning_rate": 6.73076923076923e-08, |
| "loss": 0.9033, |
| "mean_token_accuracy": 0.7840073704719543, |
| "step": 314 |
| }, |
| { |
| "epoch": 4.921875, |
| "grad_norm": 10.25, |
| "learning_rate": 5.7692307692307695e-08, |
| "loss": 0.8777, |
| "mean_token_accuracy": 0.7938881516456604, |
| "step": 315 |
| }, |
| { |
| "epoch": 4.9375, |
| "grad_norm": 6.5625, |
| "learning_rate": 4.807692307692307e-08, |
| "loss": 0.9133, |
| "mean_token_accuracy": 0.7796001434326172, |
| "step": 316 |
| }, |
| { |
| "epoch": 4.953125, |
| "grad_norm": 5.6875, |
| "learning_rate": 3.846153846153846e-08, |
| "loss": 0.8674, |
| "mean_token_accuracy": 0.7963815927505493, |
| "step": 317 |
| }, |
| { |
| "epoch": 4.96875, |
| "grad_norm": 5.96875, |
| "learning_rate": 2.8846153846153848e-08, |
| "loss": 0.8872, |
| "mean_token_accuracy": 0.7956120371818542, |
| "step": 318 |
| }, |
| { |
| "epoch": 4.984375, |
| "grad_norm": 35.0, |
| "learning_rate": 1.923076923076923e-08, |
| "loss": 0.9419, |
| "mean_token_accuracy": 0.7827126979827881, |
| "step": 319 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 6.625, |
| "learning_rate": 9.615384615384615e-09, |
| "loss": 0.8891, |
| "mean_token_accuracy": 0.7931276559829712, |
| "step": 320 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 320, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 25, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 5.299754052563763e+16, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |