diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,8 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 2.9250457038391224, - "global_step": 1800, + "epoch": 0.17627940285352284, + "eval_steps": 500, + "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -10,5407 +11,1209 @@ { "epoch": 0.0, "learning_rate": 2e-05, - "loss": 1.6531, + "loss": 1.9869, "step": 2 }, { - "epoch": 0.01, + "epoch": 0.0, "learning_rate": 2e-05, - "loss": 1.7623, + "loss": 2.3031, "step": 4 }, { - "epoch": 0.01, + "epoch": 0.0, "learning_rate": 2e-05, - "loss": 1.4946, + "loss": 2.1553, "step": 6 }, { - "epoch": 0.01, + "epoch": 0.0, "learning_rate": 2e-05, - "loss": 1.8882, + "loss": 1.9117, "step": 8 }, { - "epoch": 0.02, + "epoch": 0.0, "learning_rate": 2e-05, - "loss": 1.7562, + "loss": 2.3573, "step": 10 }, { - "epoch": 0.02, + "epoch": 0.01, "learning_rate": 2e-05, - "loss": 1.875, + "loss": 2.316, "step": 12 }, { - "epoch": 0.02, + "epoch": 0.01, "learning_rate": 2e-05, - "loss": 1.9855, + "loss": 2.1818, "step": 14 }, { - "epoch": 0.03, + "epoch": 0.01, "learning_rate": 2e-05, - "loss": 1.833, + "loss": 2.2639, "step": 16 }, { - "epoch": 0.03, + "epoch": 0.01, "learning_rate": 2e-05, - "loss": 1.7799, + "loss": 1.7835, "step": 18 }, { - "epoch": 0.03, + "epoch": 0.01, "learning_rate": 2e-05, - "loss": 1.9502, + "loss": 2.2783, "step": 20 }, { - "epoch": 0.04, + "epoch": 0.01, "learning_rate": 2e-05, - "loss": 1.8134, + "loss": 2.0463, "step": 22 }, { - "epoch": 0.04, + "epoch": 0.01, "learning_rate": 2e-05, - "loss": 1.8352, + "loss": 1.8931, "step": 24 }, { - "epoch": 0.04, + "epoch": 0.01, "learning_rate": 2e-05, - "loss": 1.9382, + "loss": 1.9732, "step": 26 }, { - "epoch": 0.05, + "epoch": 0.01, "learning_rate": 2e-05, - "loss": 2.2001, + "loss": 1.9932, "step": 28 }, { - "epoch": 0.05, + "epoch": 0.01, "learning_rate": 2e-05, - "loss": 1.9664, + "loss": 2.0808, "step": 30 }, { - "epoch": 0.05, + "epoch": 0.01, "learning_rate": 2e-05, - "loss": 2.0627, + "loss": 1.9768, "step": 32 }, { - "epoch": 0.06, + "epoch": 0.01, "learning_rate": 2e-05, - "loss": 2.0913, + "loss": 1.808, "step": 34 }, { - "epoch": 0.06, + "epoch": 0.02, "learning_rate": 2e-05, - "loss": 2.1183, + "loss": 1.7387, "step": 36 }, { - "epoch": 0.06, + "epoch": 0.02, "learning_rate": 2e-05, - "loss": 2.2824, + "loss": 1.9784, "step": 38 }, { - "epoch": 0.07, + "epoch": 0.02, "learning_rate": 2e-05, - "loss": 2.2368, + "loss": 1.8164, "step": 40 }, { - "epoch": 0.07, + "epoch": 0.02, "learning_rate": 2e-05, - "loss": 2.3417, + "loss": 1.7091, "step": 42 }, { - "epoch": 0.07, + "epoch": 0.02, "learning_rate": 2e-05, - "loss": 2.3669, + "loss": 1.7938, "step": 44 }, { - "epoch": 0.07, + "epoch": 0.02, "learning_rate": 2e-05, - "loss": 2.5686, + "loss": 1.973, "step": 46 }, { - "epoch": 0.08, + "epoch": 0.02, "learning_rate": 2e-05, - "loss": 2.6004, + "loss": 2.1455, "step": 48 }, { - "epoch": 0.08, + "epoch": 0.02, "learning_rate": 2e-05, - "loss": 2.7763, + "loss": 1.9081, "step": 50 }, { - "epoch": 0.08, + "epoch": 0.02, "learning_rate": 2e-05, - "loss": 1.7123, + "loss": 1.7108, "step": 52 }, { - "epoch": 0.09, + "epoch": 0.02, "learning_rate": 2e-05, - "loss": 1.5842, + "loss": 1.8766, "step": 54 }, { - "epoch": 0.09, + "epoch": 0.02, "learning_rate": 2e-05, - "loss": 1.634, + "loss": 1.8224, "step": 56 }, { - "epoch": 0.09, + "epoch": 0.03, "learning_rate": 2e-05, - "loss": 1.6568, + "loss": 1.6724, "step": 58 }, { - "epoch": 0.1, + "epoch": 0.03, "learning_rate": 2e-05, - "loss": 1.7682, + "loss": 1.9838, "step": 60 }, { - "epoch": 0.1, + "epoch": 0.03, "learning_rate": 2e-05, - "loss": 1.7746, + "loss": 1.5954, "step": 62 }, { - "epoch": 0.1, + "epoch": 0.03, "learning_rate": 2e-05, - "loss": 1.7256, + "loss": 1.6449, "step": 64 }, { - "epoch": 0.11, + "epoch": 0.03, "learning_rate": 2e-05, - "loss": 1.8528, + "loss": 1.6045, "step": 66 }, { - "epoch": 0.11, + "epoch": 0.03, "learning_rate": 2e-05, - "loss": 1.8566, + "loss": 1.7874, "step": 68 }, { - "epoch": 0.11, + "epoch": 0.03, "learning_rate": 2e-05, - "loss": 1.8693, + "loss": 1.5796, "step": 70 }, { - "epoch": 0.12, + "epoch": 0.03, "learning_rate": 2e-05, - "loss": 1.9164, + "loss": 1.7467, "step": 72 }, { - "epoch": 0.12, + "epoch": 0.03, "learning_rate": 2e-05, - "loss": 1.8215, + "loss": 1.6555, "step": 74 }, { - "epoch": 0.12, + "epoch": 0.03, "learning_rate": 2e-05, - "loss": 1.6364, + "loss": 1.6531, "step": 76 }, { - "epoch": 0.13, + "epoch": 0.03, "learning_rate": 2e-05, - "loss": 1.9358, + "loss": 1.7604, "step": 78 }, { - "epoch": 0.13, + "epoch": 0.04, "learning_rate": 2e-05, - "loss": 1.829, + "loss": 1.8428, "step": 80 }, { - "epoch": 0.13, + "epoch": 0.04, "learning_rate": 2e-05, - "loss": 2.0402, + "loss": 1.6431, "step": 82 }, { - "epoch": 0.14, + "epoch": 0.04, "learning_rate": 2e-05, - "loss": 2.046, + "loss": 1.6619, "step": 84 }, { - "epoch": 0.14, + "epoch": 0.04, "learning_rate": 2e-05, - "loss": 1.9946, + "loss": 1.5432, "step": 86 }, { - "epoch": 0.14, + "epoch": 0.04, "learning_rate": 2e-05, - "loss": 2.0116, + "loss": 1.6974, "step": 88 }, { - "epoch": 0.15, + "epoch": 0.04, "learning_rate": 2e-05, - "loss": 2.1454, + "loss": 1.5422, "step": 90 }, { - "epoch": 0.15, + "epoch": 0.04, "learning_rate": 2e-05, - "loss": 2.3368, + "loss": 1.6739, "step": 92 }, { - "epoch": 0.15, + "epoch": 0.04, "learning_rate": 2e-05, - "loss": 2.3731, + "loss": 1.5725, "step": 94 }, { - "epoch": 0.16, + "epoch": 0.04, "learning_rate": 2e-05, - "loss": 2.3366, + "loss": 1.6482, "step": 96 }, { - "epoch": 0.16, + "epoch": 0.04, "learning_rate": 2e-05, - "loss": 2.3437, + "loss": 1.7447, "step": 98 }, { - "epoch": 0.16, + "epoch": 0.04, "learning_rate": 2e-05, - "loss": 2.524, + "loss": 1.7923, "step": 100 }, { - "epoch": 0.17, + "epoch": 0.04, "learning_rate": 2e-05, - "loss": 1.5793, + "loss": 1.6704, "step": 102 }, { - "epoch": 0.17, + "epoch": 0.05, "learning_rate": 2e-05, - "loss": 1.6267, + "loss": 1.6213, "step": 104 }, { - "epoch": 0.17, + "epoch": 0.05, "learning_rate": 2e-05, - "loss": 1.6976, + "loss": 1.5271, "step": 106 }, { - "epoch": 0.18, + "epoch": 0.05, "learning_rate": 2e-05, - "loss": 1.7402, + "loss": 1.5259, "step": 108 }, { - "epoch": 0.18, + "epoch": 0.05, "learning_rate": 2e-05, - "loss": 1.6939, + "loss": 1.7935, "step": 110 }, { - "epoch": 0.18, + "epoch": 0.05, "learning_rate": 2e-05, - "loss": 1.8356, + "loss": 1.7862, "step": 112 }, { - "epoch": 0.19, + "epoch": 0.05, "learning_rate": 2e-05, - "loss": 1.8122, + "loss": 1.6882, "step": 114 }, { - "epoch": 0.19, + "epoch": 0.05, "learning_rate": 2e-05, - "loss": 1.6891, + "loss": 1.672, "step": 116 }, { - "epoch": 0.19, + "epoch": 0.05, "learning_rate": 2e-05, - "loss": 1.7671, + "loss": 1.4945, "step": 118 }, { - "epoch": 0.2, + "epoch": 0.05, "learning_rate": 2e-05, - "loss": 1.8497, + "loss": 1.7371, "step": 120 }, { - "epoch": 0.2, + "epoch": 0.05, "learning_rate": 2e-05, - "loss": 1.7755, + "loss": 1.6145, "step": 122 }, { - "epoch": 0.2, + "epoch": 0.05, "learning_rate": 2e-05, - "loss": 1.8383, + "loss": 1.5366, "step": 124 }, { - "epoch": 0.2, + "epoch": 0.06, "learning_rate": 2e-05, - "loss": 1.9323, + "loss": 1.5875, "step": 126 }, { - "epoch": 0.21, + "epoch": 0.06, "learning_rate": 2e-05, - "loss": 1.8231, + "loss": 1.5924, "step": 128 }, { - "epoch": 0.21, + "epoch": 0.06, "learning_rate": 2e-05, - "loss": 1.8705, + "loss": 1.7048, "step": 130 }, { - "epoch": 0.21, + "epoch": 0.06, "learning_rate": 2e-05, - "loss": 1.8914, + "loss": 1.5899, "step": 132 }, { - "epoch": 0.22, + "epoch": 0.06, "learning_rate": 2e-05, - "loss": 2.1717, + "loss": 1.7091, "step": 134 }, { - "epoch": 0.22, + "epoch": 0.06, "learning_rate": 2e-05, - "loss": 2.0413, + "loss": 1.6705, "step": 136 }, { - "epoch": 0.22, + "epoch": 0.06, "learning_rate": 2e-05, - "loss": 2.0281, + "loss": 1.7555, "step": 138 }, { - "epoch": 0.23, + "epoch": 0.06, "learning_rate": 2e-05, - "loss": 2.1997, + "loss": 1.7616, "step": 140 }, { - "epoch": 0.23, + "epoch": 0.06, "learning_rate": 2e-05, - "loss": 2.3263, + "loss": 1.6282, "step": 142 }, { - "epoch": 0.23, + "epoch": 0.06, "learning_rate": 2e-05, - "loss": 2.1469, + "loss": 1.6069, "step": 144 }, { - "epoch": 0.24, + "epoch": 0.06, "learning_rate": 2e-05, - "loss": 2.2804, + "loss": 1.5998, "step": 146 }, { - "epoch": 0.24, + "epoch": 0.07, "learning_rate": 2e-05, - "loss": 2.4742, + "loss": 1.6762, "step": 148 }, { - "epoch": 0.24, + "epoch": 0.07, "learning_rate": 2e-05, - "loss": 2.3649, + "loss": 1.7779, "step": 150 }, { - "epoch": 0.25, + "epoch": 0.07, "learning_rate": 2e-05, - "loss": 1.6274, + "loss": 1.7046, "step": 152 }, { - "epoch": 0.25, + "epoch": 0.07, "learning_rate": 2e-05, - "loss": 1.6698, + "loss": 1.6704, "step": 154 }, { - "epoch": 0.25, + "epoch": 0.07, "learning_rate": 2e-05, - "loss": 1.7127, + "loss": 1.5523, "step": 156 }, { - "epoch": 0.26, + "epoch": 0.07, "learning_rate": 2e-05, - "loss": 1.6559, + "loss": 1.6883, "step": 158 }, { - "epoch": 0.26, + "epoch": 0.07, "learning_rate": 2e-05, - "loss": 1.7203, + "loss": 1.5095, "step": 160 }, { - "epoch": 0.26, + "epoch": 0.07, "learning_rate": 2e-05, - "loss": 1.7946, + "loss": 1.5435, "step": 162 }, { - "epoch": 0.27, + "epoch": 0.07, "learning_rate": 2e-05, - "loss": 1.71, + "loss": 1.8038, "step": 164 }, { - "epoch": 0.27, + "epoch": 0.07, "learning_rate": 2e-05, - "loss": 1.8896, + "loss": 1.5684, "step": 166 }, { - "epoch": 0.27, + "epoch": 0.07, "learning_rate": 2e-05, - "loss": 1.9708, + "loss": 1.7597, "step": 168 }, { - "epoch": 0.28, + "epoch": 0.07, "learning_rate": 2e-05, - "loss": 1.8426, + "loss": 1.7696, "step": 170 }, { - "epoch": 0.28, + "epoch": 0.08, "learning_rate": 2e-05, - "loss": 1.7623, + "loss": 1.7651, "step": 172 }, { - "epoch": 0.28, + "epoch": 0.08, "learning_rate": 2e-05, - "loss": 1.8608, + "loss": 1.4832, "step": 174 }, { - "epoch": 0.29, + "epoch": 0.08, "learning_rate": 2e-05, - "loss": 1.8288, + "loss": 1.6022, "step": 176 }, { - "epoch": 0.29, + "epoch": 0.08, "learning_rate": 2e-05, - "loss": 1.8339, + "loss": 1.82, "step": 178 }, { - "epoch": 0.29, + "epoch": 0.08, "learning_rate": 2e-05, - "loss": 1.955, + "loss": 1.504, "step": 180 }, { - "epoch": 0.3, + "epoch": 0.08, "learning_rate": 2e-05, - "loss": 2.0129, + "loss": 1.6136, "step": 182 }, { - "epoch": 0.3, + "epoch": 0.08, "learning_rate": 2e-05, - "loss": 2.0187, + "loss": 1.5933, "step": 184 }, { - "epoch": 0.3, + "epoch": 0.08, "learning_rate": 2e-05, - "loss": 2.0102, + "loss": 1.6525, "step": 186 }, { - "epoch": 0.31, + "epoch": 0.08, "learning_rate": 2e-05, - "loss": 2.0275, + "loss": 1.6809, "step": 188 }, { - "epoch": 0.31, + "epoch": 0.08, "learning_rate": 2e-05, - "loss": 2.2052, + "loss": 1.4673, "step": 190 }, { - "epoch": 0.31, + "epoch": 0.08, "learning_rate": 2e-05, - "loss": 2.2022, + "loss": 1.4732, "step": 192 }, { - "epoch": 0.32, + "epoch": 0.09, "learning_rate": 2e-05, - "loss": 2.207, + "loss": 1.6401, "step": 194 }, { - "epoch": 0.32, + "epoch": 0.09, "learning_rate": 2e-05, - "loss": 2.2069, + "loss": 1.4686, "step": 196 }, { - "epoch": 0.32, + "epoch": 0.09, "learning_rate": 2e-05, - "loss": 2.4695, + "loss": 1.6673, "step": 198 }, { - "epoch": 0.33, + "epoch": 0.09, "learning_rate": 2e-05, - "loss": 2.3821, + "loss": 1.5662, "step": 200 }, { - "epoch": 0.33, + "epoch": 0.09, "learning_rate": 2e-05, - "loss": 1.5257, + "loss": 1.3947, "step": 202 }, { - "epoch": 0.33, + "epoch": 0.09, "learning_rate": 2e-05, - "loss": 1.6402, + "loss": 1.6261, "step": 204 }, { - "epoch": 0.33, + "epoch": 0.09, "learning_rate": 2e-05, - "loss": 1.6208, + "loss": 1.7449, "step": 206 }, { - "epoch": 0.34, + "epoch": 0.09, "learning_rate": 2e-05, - "loss": 1.7039, + "loss": 1.514, "step": 208 }, { - "epoch": 0.34, + "epoch": 0.09, "learning_rate": 2e-05, - "loss": 1.6589, + "loss": 1.5812, "step": 210 }, { - "epoch": 0.34, + "epoch": 0.09, "learning_rate": 2e-05, - "loss": 1.6637, + "loss": 1.5474, "step": 212 }, { - "epoch": 0.35, + "epoch": 0.09, "learning_rate": 2e-05, - "loss": 1.6445, + "loss": 1.8105, "step": 214 }, { - "epoch": 0.35, + "epoch": 0.1, "learning_rate": 2e-05, - "loss": 1.6848, + "loss": 1.8234, "step": 216 }, { - "epoch": 0.35, + "epoch": 0.1, "learning_rate": 2e-05, - "loss": 1.5876, + "loss": 1.6536, "step": 218 }, { - "epoch": 0.36, + "epoch": 0.1, "learning_rate": 2e-05, - "loss": 1.7515, + "loss": 1.5396, "step": 220 }, { - "epoch": 0.36, + "epoch": 0.1, "learning_rate": 2e-05, - "loss": 1.7731, + "loss": 1.6499, "step": 222 }, { - "epoch": 0.36, + "epoch": 0.1, "learning_rate": 2e-05, - "loss": 1.8234, + "loss": 1.6298, "step": 224 }, { - "epoch": 0.37, + "epoch": 0.1, "learning_rate": 2e-05, - "loss": 1.7628, + "loss": 1.6265, "step": 226 }, { - "epoch": 0.37, + "epoch": 0.1, "learning_rate": 2e-05, - "loss": 1.8525, + "loss": 1.6794, "step": 228 }, { - "epoch": 0.37, + "epoch": 0.1, "learning_rate": 2e-05, - "loss": 1.8992, + "loss": 1.6706, "step": 230 }, { - "epoch": 0.38, + "epoch": 0.1, "learning_rate": 2e-05, - "loss": 1.8982, + "loss": 1.6217, "step": 232 }, { - "epoch": 0.38, + "epoch": 0.1, "learning_rate": 2e-05, - "loss": 2.0687, + "loss": 1.5515, "step": 234 }, { - "epoch": 0.38, + "epoch": 0.1, "learning_rate": 2e-05, - "loss": 2.0344, + "loss": 1.7083, "step": 236 }, { - "epoch": 0.39, + "epoch": 0.1, "learning_rate": 2e-05, - "loss": 2.2035, + "loss": 1.7026, "step": 238 }, { - "epoch": 0.39, + "epoch": 0.11, "learning_rate": 2e-05, - "loss": 2.1326, + "loss": 1.4813, "step": 240 }, { - "epoch": 0.39, + "epoch": 0.11, "learning_rate": 2e-05, - "loss": 2.3343, + "loss": 1.6321, "step": 242 }, { - "epoch": 0.4, + "epoch": 0.11, "learning_rate": 2e-05, - "loss": 2.331, + "loss": 1.4753, "step": 244 }, { - "epoch": 0.4, + "epoch": 0.11, "learning_rate": 2e-05, - "loss": 2.1588, + "loss": 1.4435, "step": 246 }, { - "epoch": 0.4, + "epoch": 0.11, "learning_rate": 2e-05, - "loss": 2.4497, + "loss": 1.5495, "step": 248 }, { - "epoch": 0.41, + "epoch": 0.11, "learning_rate": 2e-05, - "loss": 2.3209, + "loss": 1.4777, "step": 250 }, { - "epoch": 0.41, + "epoch": 0.11, "learning_rate": 2e-05, - "loss": 1.4431, + "loss": 1.4638, "step": 252 }, { - "epoch": 0.41, + "epoch": 0.11, "learning_rate": 2e-05, - "loss": 1.5372, + "loss": 1.6587, "step": 254 }, { - "epoch": 0.42, + "epoch": 0.11, "learning_rate": 2e-05, - "loss": 1.5978, + "loss": 1.7546, "step": 256 }, { - "epoch": 0.42, + "epoch": 0.11, "learning_rate": 2e-05, - "loss": 1.7773, + "loss": 1.5319, "step": 258 }, { - "epoch": 0.42, + "epoch": 0.11, "learning_rate": 2e-05, - "loss": 1.8033, + "loss": 1.4681, "step": 260 }, { - "epoch": 0.43, + "epoch": 0.12, "learning_rate": 2e-05, - "loss": 1.7154, + "loss": 1.6264, "step": 262 }, { - "epoch": 0.43, + "epoch": 0.12, "learning_rate": 2e-05, - "loss": 1.7038, + "loss": 1.5565, "step": 264 }, { - "epoch": 0.43, + "epoch": 0.12, "learning_rate": 2e-05, - "loss": 1.9145, + "loss": 1.5509, "step": 266 }, { - "epoch": 0.44, + "epoch": 0.12, "learning_rate": 2e-05, - "loss": 1.7382, + "loss": 1.4856, "step": 268 }, { - "epoch": 0.44, + "epoch": 0.12, "learning_rate": 2e-05, - "loss": 1.712, + "loss": 1.6516, "step": 270 }, { - "epoch": 0.44, + "epoch": 0.12, "learning_rate": 2e-05, - "loss": 1.6205, + "loss": 1.6128, "step": 272 }, { - "epoch": 0.45, + "epoch": 0.12, "learning_rate": 2e-05, - "loss": 1.8052, + "loss": 1.763, "step": 274 }, { - "epoch": 0.45, + "epoch": 0.12, "learning_rate": 2e-05, - "loss": 1.7942, + "loss": 1.6703, "step": 276 }, { - "epoch": 0.45, + "epoch": 0.12, "learning_rate": 2e-05, - "loss": 1.941, + "loss": 1.6881, "step": 278 }, { - "epoch": 0.46, + "epoch": 0.12, "learning_rate": 2e-05, - "loss": 1.8929, + "loss": 1.49, "step": 280 }, { - "epoch": 0.46, + "epoch": 0.12, "learning_rate": 2e-05, - "loss": 1.9285, + "loss": 1.7967, "step": 282 }, { - "epoch": 0.46, + "epoch": 0.13, "learning_rate": 2e-05, - "loss": 2.0238, + "loss": 1.4738, "step": 284 }, { - "epoch": 0.46, + "epoch": 0.13, "learning_rate": 2e-05, - "loss": 2.0938, + "loss": 1.5275, "step": 286 }, { - "epoch": 0.47, + "epoch": 0.13, "learning_rate": 2e-05, - "loss": 2.0362, + "loss": 1.4755, "step": 288 }, { - "epoch": 0.47, + "epoch": 0.13, "learning_rate": 2e-05, - "loss": 2.2545, + "loss": 1.5535, "step": 290 }, { - "epoch": 0.47, + "epoch": 0.13, "learning_rate": 2e-05, - "loss": 2.1432, + "loss": 1.6888, "step": 292 }, { - "epoch": 0.48, + "epoch": 0.13, "learning_rate": 2e-05, - "loss": 2.3454, + "loss": 1.6605, "step": 294 }, { - "epoch": 0.48, + "epoch": 0.13, "learning_rate": 2e-05, - "loss": 2.359, + "loss": 1.2232, "step": 296 }, { - "epoch": 0.48, + "epoch": 0.13, "learning_rate": 2e-05, - "loss": 2.4653, + "loss": 1.5012, "step": 298 }, { - "epoch": 0.49, + "epoch": 0.13, "learning_rate": 2e-05, - "loss": 2.2918, + "loss": 1.5564, "step": 300 }, { - "epoch": 0.49, + "epoch": 0.13, "learning_rate": 2e-05, - "loss": 1.748, + "loss": 1.3169, "step": 302 }, { - "epoch": 0.49, + "epoch": 0.13, "learning_rate": 2e-05, - "loss": 1.6338, + "loss": 1.6143, "step": 304 }, { - "epoch": 0.5, + "epoch": 0.13, "learning_rate": 2e-05, - "loss": 1.5935, + "loss": 1.6366, "step": 306 }, { - "epoch": 0.5, + "epoch": 0.14, "learning_rate": 2e-05, - "loss": 1.6318, + "loss": 1.4345, "step": 308 }, { - "epoch": 0.5, + "epoch": 0.14, "learning_rate": 2e-05, - "loss": 1.7291, + "loss": 1.5439, "step": 310 }, { - "epoch": 0.51, + "epoch": 0.14, "learning_rate": 2e-05, - "loss": 1.8349, + "loss": 1.5227, "step": 312 }, { - "epoch": 0.51, + "epoch": 0.14, "learning_rate": 2e-05, - "loss": 1.6695, + "loss": 1.5976, "step": 314 }, { - "epoch": 0.51, + "epoch": 0.14, "learning_rate": 2e-05, - "loss": 1.8873, + "loss": 1.6695, "step": 316 }, { - "epoch": 0.52, + "epoch": 0.14, "learning_rate": 2e-05, - "loss": 1.7472, + "loss": 1.6449, "step": 318 }, { - "epoch": 0.52, + "epoch": 0.14, "learning_rate": 2e-05, - "loss": 1.8549, + "loss": 1.6323, "step": 320 }, { - "epoch": 0.52, + "epoch": 0.14, "learning_rate": 2e-05, - "loss": 1.7141, + "loss": 1.3631, "step": 322 }, { - "epoch": 0.53, + "epoch": 0.14, "learning_rate": 2e-05, - "loss": 1.7487, + "loss": 1.599, "step": 324 }, { - "epoch": 0.53, + "epoch": 0.14, "learning_rate": 2e-05, - "loss": 1.8681, + "loss": 1.6603, "step": 326 }, { - "epoch": 0.53, + "epoch": 0.14, "learning_rate": 2e-05, - "loss": 1.9445, + "loss": 1.5663, "step": 328 }, { - "epoch": 0.54, + "epoch": 0.15, "learning_rate": 2e-05, - "loss": 1.8244, + "loss": 1.4458, "step": 330 }, { - "epoch": 0.54, + "epoch": 0.15, "learning_rate": 2e-05, - "loss": 1.8444, + "loss": 1.4435, "step": 332 }, { - "epoch": 0.54, + "epoch": 0.15, "learning_rate": 2e-05, - "loss": 1.9026, + "loss": 1.4231, "step": 334 }, { - "epoch": 0.55, + "epoch": 0.15, "learning_rate": 2e-05, - "loss": 2.0311, + "loss": 1.6965, "step": 336 }, { - "epoch": 0.55, + "epoch": 0.15, "learning_rate": 2e-05, - "loss": 1.9602, + "loss": 1.7649, "step": 338 }, { - "epoch": 0.55, + "epoch": 0.15, "learning_rate": 2e-05, - "loss": 2.1369, + "loss": 1.5374, "step": 340 }, { - "epoch": 0.56, + "epoch": 0.15, "learning_rate": 2e-05, - "loss": 1.9471, + "loss": 1.4524, "step": 342 }, { - "epoch": 0.56, + "epoch": 0.15, "learning_rate": 2e-05, - "loss": 2.3028, + "loss": 1.4514, "step": 344 }, { - "epoch": 0.56, + "epoch": 0.15, "learning_rate": 2e-05, - "loss": 2.1903, + "loss": 1.6242, "step": 346 }, { - "epoch": 0.57, + "epoch": 0.15, "learning_rate": 2e-05, - "loss": 2.4402, + "loss": 1.3011, "step": 348 }, { - "epoch": 0.57, + "epoch": 0.15, "learning_rate": 2e-05, - "loss": 2.5361, + "loss": 1.5991, "step": 350 }, { - "epoch": 0.57, + "epoch": 0.16, "learning_rate": 2e-05, - "loss": 1.5716, + "loss": 1.5717, "step": 352 }, { - "epoch": 0.58, + "epoch": 0.16, "learning_rate": 2e-05, - "loss": 1.5262, + "loss": 1.4342, "step": 354 }, { - "epoch": 0.58, + "epoch": 0.16, "learning_rate": 2e-05, - "loss": 1.5327, + "loss": 1.5818, "step": 356 }, { - "epoch": 0.58, + "epoch": 0.16, "learning_rate": 2e-05, - "loss": 1.5762, + "loss": 1.3967, "step": 358 }, { - "epoch": 0.59, + "epoch": 0.16, "learning_rate": 2e-05, - "loss": 1.7237, + "loss": 1.4011, "step": 360 }, { - "epoch": 0.59, + "epoch": 0.16, "learning_rate": 2e-05, - "loss": 1.6061, + "loss": 1.5034, "step": 362 }, { - "epoch": 0.59, + "epoch": 0.16, "learning_rate": 2e-05, - "loss": 1.6987, + "loss": 1.5202, "step": 364 }, { - "epoch": 0.59, + "epoch": 0.16, "learning_rate": 2e-05, - "loss": 1.7565, + "loss": 1.4779, "step": 366 }, { - "epoch": 0.6, + "epoch": 0.16, "learning_rate": 2e-05, - "loss": 1.8116, + "loss": 1.6557, "step": 368 }, { - "epoch": 0.6, + "epoch": 0.16, "learning_rate": 2e-05, - "loss": 1.9457, + "loss": 1.6508, "step": 370 }, { - "epoch": 0.6, + "epoch": 0.16, "learning_rate": 2e-05, - "loss": 1.6663, + "loss": 1.506, "step": 372 }, { - "epoch": 0.61, + "epoch": 0.16, "learning_rate": 2e-05, - "loss": 1.8635, + "loss": 1.5586, "step": 374 }, { - "epoch": 0.61, + "epoch": 0.17, "learning_rate": 2e-05, - "loss": 1.7305, + "loss": 1.5296, "step": 376 }, { - "epoch": 0.61, + "epoch": 0.17, "learning_rate": 2e-05, - "loss": 1.8786, + "loss": 1.5015, "step": 378 }, { - "epoch": 0.62, + "epoch": 0.17, "learning_rate": 2e-05, - "loss": 1.87, + "loss": 1.589, "step": 380 }, { - "epoch": 0.62, + "epoch": 0.17, "learning_rate": 2e-05, - "loss": 1.8725, + "loss": 1.3286, "step": 382 }, { - "epoch": 0.62, + "epoch": 0.17, "learning_rate": 2e-05, - "loss": 1.9773, + "loss": 1.5073, "step": 384 }, { - "epoch": 0.63, + "epoch": 0.17, "learning_rate": 2e-05, - "loss": 2.0312, + "loss": 1.4456, "step": 386 }, { - "epoch": 0.63, + "epoch": 0.17, "learning_rate": 2e-05, - "loss": 2.0488, + "loss": 1.549, "step": 388 }, { - "epoch": 0.63, + "epoch": 0.17, "learning_rate": 2e-05, - "loss": 2.3269, + "loss": 1.4319, "step": 390 }, { - "epoch": 0.64, + "epoch": 0.17, "learning_rate": 2e-05, - "loss": 2.1795, + "loss": 1.5936, "step": 392 }, { - "epoch": 0.64, + "epoch": 0.17, "learning_rate": 2e-05, - "loss": 2.3712, + "loss": 1.2661, "step": 394 }, { - "epoch": 0.64, + "epoch": 0.17, "learning_rate": 2e-05, - "loss": 2.4111, + "loss": 1.5003, "step": 396 }, { - "epoch": 0.65, + "epoch": 0.18, "learning_rate": 2e-05, - "loss": 2.1541, + "loss": 1.5337, "step": 398 }, { - "epoch": 0.65, + "epoch": 0.18, "learning_rate": 2e-05, - "loss": 2.549, + "loss": 1.3789, "step": 400 - }, - { - "epoch": 0.65, - "learning_rate": 2e-05, - "loss": 1.6017, - "step": 402 - }, - { - "epoch": 0.66, - "learning_rate": 2e-05, - "loss": 1.5165, - "step": 404 - }, - { - "epoch": 0.66, - "learning_rate": 2e-05, - "loss": 1.682, - "step": 406 - }, - { - "epoch": 0.66, - "learning_rate": 2e-05, - "loss": 1.7208, - "step": 408 - }, - { - "epoch": 0.67, - "learning_rate": 2e-05, - "loss": 1.6538, - "step": 410 - }, - { - "epoch": 0.67, - "learning_rate": 2e-05, - "loss": 1.7341, - "step": 412 - }, - { - "epoch": 0.67, - "learning_rate": 2e-05, - "loss": 1.6098, - "step": 414 - }, - { - "epoch": 0.68, - "learning_rate": 2e-05, - "loss": 1.518, - "step": 416 - }, - { - "epoch": 0.68, - "learning_rate": 2e-05, - "loss": 1.7724, - "step": 418 - }, - { - "epoch": 0.68, - "learning_rate": 2e-05, - "loss": 1.9174, - "step": 420 - }, - { - "epoch": 0.69, - "learning_rate": 2e-05, - "loss": 1.8182, - "step": 422 - }, - { - "epoch": 0.69, - "learning_rate": 2e-05, - "loss": 1.7367, - "step": 424 - }, - { - "epoch": 0.69, - "learning_rate": 2e-05, - "loss": 1.643, - "step": 426 - }, - { - "epoch": 0.7, - "learning_rate": 2e-05, - "loss": 1.8868, - "step": 428 - }, - { - "epoch": 0.7, - "learning_rate": 2e-05, - "loss": 1.9231, - "step": 430 - }, - { - "epoch": 0.7, - "learning_rate": 2e-05, - "loss": 1.9085, - "step": 432 - }, - { - "epoch": 0.71, - "learning_rate": 2e-05, - "loss": 1.8751, - "step": 434 - }, - { - "epoch": 0.71, - "learning_rate": 2e-05, - "loss": 2.0751, - "step": 436 - }, - { - "epoch": 0.71, - "learning_rate": 2e-05, - "loss": 2.1407, - "step": 438 - }, - { - "epoch": 0.72, - "learning_rate": 2e-05, - "loss": 1.9238, - "step": 440 - }, - { - "epoch": 0.72, - "learning_rate": 2e-05, - "loss": 2.3091, - "step": 442 - }, - { - "epoch": 0.72, - "learning_rate": 2e-05, - "loss": 2.2479, - "step": 444 - }, - { - "epoch": 0.72, - "learning_rate": 2e-05, - "loss": 2.3011, - "step": 446 - }, - { - "epoch": 0.73, - "learning_rate": 2e-05, - "loss": 2.185, - "step": 448 - }, - { - "epoch": 0.73, - "learning_rate": 2e-05, - "loss": 2.2973, - "step": 450 - }, - { - "epoch": 0.73, - "learning_rate": 2e-05, - "loss": 1.7113, - "step": 452 - }, - { - "epoch": 0.74, - "learning_rate": 2e-05, - "loss": 1.5509, - "step": 454 - }, - { - "epoch": 0.74, - "learning_rate": 2e-05, - "loss": 1.6546, - "step": 456 - }, - { - "epoch": 0.74, - "learning_rate": 2e-05, - "loss": 1.6055, - "step": 458 - }, - { - "epoch": 0.75, - "learning_rate": 2e-05, - "loss": 1.7226, - "step": 460 - }, - { - "epoch": 0.75, - "learning_rate": 2e-05, - "loss": 1.7313, - "step": 462 - }, - { - "epoch": 0.75, - "learning_rate": 2e-05, - "loss": 1.7648, - "step": 464 - }, - { - "epoch": 0.76, - "learning_rate": 2e-05, - "loss": 1.6838, - "step": 466 - }, - { - "epoch": 0.76, - "learning_rate": 2e-05, - "loss": 1.7465, - "step": 468 - }, - { - "epoch": 0.76, - "learning_rate": 2e-05, - "loss": 1.7302, - "step": 470 - }, - { - "epoch": 0.77, - "learning_rate": 2e-05, - "loss": 1.7998, - "step": 472 - }, - { - "epoch": 0.77, - "learning_rate": 2e-05, - "loss": 1.8177, - "step": 474 - }, - { - "epoch": 0.77, - "learning_rate": 2e-05, - "loss": 1.6845, - "step": 476 - }, - { - "epoch": 0.78, - "learning_rate": 2e-05, - "loss": 1.7208, - "step": 478 - }, - { - "epoch": 0.78, - "learning_rate": 2e-05, - "loss": 1.9087, - "step": 480 - }, - { - "epoch": 0.78, - "learning_rate": 2e-05, - "loss": 1.7116, - "step": 482 - }, - { - "epoch": 0.79, - "learning_rate": 2e-05, - "loss": 2.0417, - "step": 484 - }, - { - "epoch": 0.79, - "learning_rate": 2e-05, - "loss": 1.9781, - "step": 486 - }, - { - "epoch": 0.79, - "learning_rate": 2e-05, - "loss": 2.1279, - "step": 488 - }, - { - "epoch": 0.8, - "learning_rate": 2e-05, - "loss": 1.9915, - "step": 490 - }, - { - "epoch": 0.8, - "learning_rate": 2e-05, - "loss": 2.1453, - "step": 492 - }, - { - "epoch": 0.8, - "learning_rate": 2e-05, - "loss": 2.1639, - "step": 494 - }, - { - "epoch": 0.81, - "learning_rate": 2e-05, - "loss": 2.2218, - "step": 496 - }, - { - "epoch": 0.81, - "learning_rate": 2e-05, - "loss": 2.2928, - "step": 498 - }, - { - "epoch": 0.81, - "learning_rate": 2e-05, - "loss": 2.1805, - "step": 500 - }, - { - "epoch": 0.82, - "learning_rate": 2e-05, - "loss": 1.6048, - "step": 502 - }, - { - "epoch": 0.82, - "learning_rate": 2e-05, - "loss": 1.6373, - "step": 504 - }, - { - "epoch": 0.82, - "learning_rate": 2e-05, - "loss": 1.6171, - "step": 506 - }, - { - "epoch": 0.83, - "learning_rate": 2e-05, - "loss": 1.6666, - "step": 508 - }, - { - "epoch": 0.83, - "learning_rate": 2e-05, - "loss": 1.5267, - "step": 510 - }, - { - "epoch": 0.83, - "learning_rate": 2e-05, - "loss": 1.8299, - "step": 512 - }, - { - "epoch": 0.84, - "learning_rate": 2e-05, - "loss": 1.6865, - "step": 514 - }, - { - "epoch": 0.84, - "learning_rate": 2e-05, - "loss": 1.899, - "step": 516 - }, - { - "epoch": 0.84, - "learning_rate": 2e-05, - "loss": 1.7581, - "step": 518 - }, - { - "epoch": 0.85, - "learning_rate": 2e-05, - "loss": 1.7595, - "step": 520 - }, - { - "epoch": 0.85, - "learning_rate": 2e-05, - "loss": 1.7733, - "step": 522 - }, - { - "epoch": 0.85, - "learning_rate": 2e-05, - "loss": 1.8707, - "step": 524 - }, - { - "epoch": 0.85, - "learning_rate": 2e-05, - "loss": 1.773, - "step": 526 - }, - { - "epoch": 0.86, - "learning_rate": 2e-05, - "loss": 1.8177, - "step": 528 - }, - { - "epoch": 0.86, - "learning_rate": 2e-05, - "loss": 1.8523, - "step": 530 - }, - { - "epoch": 0.86, - "learning_rate": 2e-05, - "loss": 1.9317, - "step": 532 - }, - { - "epoch": 0.87, - "learning_rate": 2e-05, - "loss": 2.0409, - "step": 534 - }, - { - "epoch": 0.87, - "learning_rate": 2e-05, - "loss": 2.059, - "step": 536 - }, - { - "epoch": 0.87, - "learning_rate": 2e-05, - "loss": 2.1391, - "step": 538 - }, - { - "epoch": 0.88, - "learning_rate": 2e-05, - "loss": 2.2914, - "step": 540 - }, - { - "epoch": 0.88, - "learning_rate": 2e-05, - "loss": 2.1879, - "step": 542 - }, - { - "epoch": 0.88, - "learning_rate": 2e-05, - "loss": 2.3531, - "step": 544 - }, - { - "epoch": 0.89, - "learning_rate": 2e-05, - "loss": 2.2852, - "step": 546 - }, - { - "epoch": 0.89, - "learning_rate": 2e-05, - "loss": 2.3946, - "step": 548 - }, - { - "epoch": 0.89, - "learning_rate": 2e-05, - "loss": 2.1933, - "step": 550 - }, - { - "epoch": 0.9, - "learning_rate": 2e-05, - "loss": 1.587, - "step": 552 - }, - { - "epoch": 0.9, - "learning_rate": 2e-05, - "loss": 1.5762, - "step": 554 - }, - { - "epoch": 0.9, - "learning_rate": 2e-05, - "loss": 1.4593, - "step": 556 - }, - { - "epoch": 0.91, - "learning_rate": 2e-05, - "loss": 1.6386, - "step": 558 - }, - { - "epoch": 0.91, - "learning_rate": 2e-05, - "loss": 1.759, - "step": 560 - }, - { - "epoch": 0.91, - "learning_rate": 2e-05, - "loss": 1.6945, - "step": 562 - }, - { - "epoch": 0.92, - "learning_rate": 2e-05, - "loss": 1.6703, - "step": 564 - }, - { - "epoch": 0.92, - "learning_rate": 2e-05, - "loss": 1.7655, - "step": 566 - }, - { - "epoch": 0.92, - "learning_rate": 2e-05, - "loss": 1.7459, - "step": 568 - }, - { - "epoch": 0.93, - "learning_rate": 2e-05, - "loss": 1.8065, - "step": 570 - }, - { - "epoch": 0.93, - "learning_rate": 2e-05, - "loss": 1.7765, - "step": 572 - }, - { - "epoch": 0.93, - "learning_rate": 2e-05, - "loss": 1.7079, - "step": 574 - }, - { - "epoch": 0.94, - "learning_rate": 2e-05, - "loss": 1.8376, - "step": 576 - }, - { - "epoch": 0.94, - "learning_rate": 2e-05, - "loss": 2.0, - "step": 578 - }, - { - "epoch": 0.94, - "learning_rate": 2e-05, - "loss": 1.876, - "step": 580 - }, - { - "epoch": 0.95, - "learning_rate": 2e-05, - "loss": 1.8461, - "step": 582 - }, - { - "epoch": 0.95, - "learning_rate": 2e-05, - "loss": 1.9589, - "step": 584 - }, - { - "epoch": 0.95, - "learning_rate": 2e-05, - "loss": 1.9262, - "step": 586 - }, - { - "epoch": 0.96, - "learning_rate": 2e-05, - "loss": 2.0984, - "step": 588 - }, - { - "epoch": 0.96, - "learning_rate": 2e-05, - "loss": 2.1432, - "step": 590 - }, - { - "epoch": 0.96, - "learning_rate": 2e-05, - "loss": 2.2204, - "step": 592 - }, - { - "epoch": 0.97, - "learning_rate": 2e-05, - "loss": 2.3121, - "step": 594 - }, - { - "epoch": 0.97, - "learning_rate": 2e-05, - "loss": 2.3432, - "step": 596 - }, - { - "epoch": 0.97, - "learning_rate": 2e-05, - "loss": 2.2153, - "step": 598 - }, - { - "epoch": 0.98, - "learning_rate": 2e-05, - "loss": 2.4357, - "step": 600 - }, - { - "epoch": 0.98, - "learning_rate": 2e-05, - "loss": 1.6654, - "step": 602 - }, - { - "epoch": 0.98, - "learning_rate": 2e-05, - "loss": 1.5651, - "step": 604 - }, - { - "epoch": 0.98, - "learning_rate": 2e-05, - "loss": 1.8015, - "step": 606 - }, - { - "epoch": 0.99, - "learning_rate": 2e-05, - "loss": 1.8798, - "step": 608 - }, - { - "epoch": 0.99, - "learning_rate": 2e-05, - "loss": 1.8118, - "step": 610 - }, - { - "epoch": 0.99, - "learning_rate": 2e-05, - "loss": 2.101, - "step": 612 - }, - { - "epoch": 1.0, - "learning_rate": 2e-05, - "loss": 2.2339, - "step": 614 - }, - { - "epoch": 1.0, - "learning_rate": 2e-05, - "loss": 2.0958, - "step": 616 - }, - { - "epoch": 1.0, - "learning_rate": 2e-05, - "loss": 1.5011, - "step": 618 - }, - { - "epoch": 1.01, - "learning_rate": 2e-05, - "loss": 1.6487, - "step": 620 - }, - { - "epoch": 1.01, - "learning_rate": 2e-05, - "loss": 1.663, - "step": 622 - }, - { - "epoch": 1.01, - "learning_rate": 2e-05, - "loss": 1.7526, - "step": 624 - }, - { - "epoch": 1.02, - "learning_rate": 2e-05, - "loss": 1.6576, - "step": 626 - }, - { - "epoch": 1.02, - "learning_rate": 2e-05, - "loss": 1.8096, - "step": 628 - }, - { - "epoch": 1.02, - "learning_rate": 2e-05, - "loss": 1.6533, - "step": 630 - }, - { - "epoch": 1.03, - "learning_rate": 2e-05, - "loss": 1.8164, - "step": 632 - }, - { - "epoch": 1.03, - "learning_rate": 2e-05, - "loss": 1.7376, - "step": 634 - }, - { - "epoch": 1.03, - "learning_rate": 2e-05, - "loss": 1.7395, - "step": 636 - }, - { - "epoch": 1.04, - "learning_rate": 2e-05, - "loss": 1.7583, - "step": 638 - }, - { - "epoch": 1.04, - "learning_rate": 2e-05, - "loss": 1.8188, - "step": 640 - }, - { - "epoch": 1.04, - "learning_rate": 2e-05, - "loss": 1.8144, - "step": 642 - }, - { - "epoch": 1.05, - "learning_rate": 2e-05, - "loss": 1.8275, - "step": 644 - }, - { - "epoch": 1.05, - "learning_rate": 2e-05, - "loss": 1.8898, - "step": 646 - }, - { - "epoch": 1.05, - "learning_rate": 2e-05, - "loss": 1.831, - "step": 648 - }, - { - "epoch": 1.06, - "learning_rate": 2e-05, - "loss": 2.017, - "step": 650 - }, - { - "epoch": 1.06, - "learning_rate": 2e-05, - "loss": 2.0152, - "step": 652 - }, - { - "epoch": 1.06, - "learning_rate": 2e-05, - "loss": 2.0229, - "step": 654 - }, - { - "epoch": 1.07, - "learning_rate": 2e-05, - "loss": 2.0969, - "step": 656 - }, - { - "epoch": 1.07, - "learning_rate": 2e-05, - "loss": 2.2473, - "step": 658 - }, - { - "epoch": 1.07, - "learning_rate": 2e-05, - "loss": 2.2124, - "step": 660 - }, - { - "epoch": 1.08, - "learning_rate": 2e-05, - "loss": 2.4132, - "step": 662 - }, - { - "epoch": 1.08, - "learning_rate": 2e-05, - "loss": 2.2386, - "step": 664 - }, - { - "epoch": 1.08, - "learning_rate": 2e-05, - "loss": 1.9457, - "step": 666 - }, - { - "epoch": 1.09, - "learning_rate": 2e-05, - "loss": 1.5253, - "step": 668 - }, - { - "epoch": 1.09, - "learning_rate": 2e-05, - "loss": 1.6223, - "step": 670 - }, - { - "epoch": 1.09, - "learning_rate": 2e-05, - "loss": 1.5866, - "step": 672 - }, - { - "epoch": 1.1, - "learning_rate": 2e-05, - "loss": 1.639, - "step": 674 - }, - { - "epoch": 1.1, - "learning_rate": 2e-05, - "loss": 1.7485, - "step": 676 - }, - { - "epoch": 1.1, - "learning_rate": 2e-05, - "loss": 1.7922, - "step": 678 - }, - { - "epoch": 1.11, - "learning_rate": 2e-05, - "loss": 1.6644, - "step": 680 - }, - { - "epoch": 1.11, - "learning_rate": 2e-05, - "loss": 1.7133, - "step": 682 - }, - { - "epoch": 1.11, - "learning_rate": 2e-05, - "loss": 1.7215, - "step": 684 - }, - { - "epoch": 1.11, - "learning_rate": 2e-05, - "loss": 1.6665, - "step": 686 - }, - { - "epoch": 1.12, - "learning_rate": 2e-05, - "loss": 1.809, - "step": 688 - }, - { - "epoch": 1.12, - "learning_rate": 2e-05, - "loss": 1.8063, - "step": 690 - }, - { - "epoch": 1.12, - "learning_rate": 2e-05, - "loss": 1.8331, - "step": 692 - }, - { - "epoch": 1.13, - "learning_rate": 2e-05, - "loss": 1.6631, - "step": 694 - }, - { - "epoch": 1.13, - "learning_rate": 2e-05, - "loss": 1.7219, - "step": 696 - }, - { - "epoch": 1.13, - "learning_rate": 2e-05, - "loss": 1.793, - "step": 698 - }, - { - "epoch": 1.14, - "learning_rate": 2e-05, - "loss": 1.9089, - "step": 700 - }, - { - "epoch": 1.14, - "learning_rate": 2e-05, - "loss": 2.0203, - "step": 702 - }, - { - "epoch": 1.14, - "learning_rate": 2e-05, - "loss": 2.0187, - "step": 704 - }, - { - "epoch": 1.15, - "learning_rate": 2e-05, - "loss": 2.2132, - "step": 706 - }, - { - "epoch": 1.15, - "learning_rate": 2e-05, - "loss": 2.3242, - "step": 708 - }, - { - "epoch": 1.15, - "learning_rate": 2e-05, - "loss": 2.3611, - "step": 710 - }, - { - "epoch": 1.16, - "learning_rate": 2e-05, - "loss": 2.1036, - "step": 712 - }, - { - "epoch": 1.16, - "learning_rate": 2e-05, - "loss": 2.1611, - "step": 714 - }, - { - "epoch": 1.16, - "learning_rate": 2e-05, - "loss": 2.084, - "step": 716 - }, - { - "epoch": 1.17, - "learning_rate": 2e-05, - "loss": 1.5422, - "step": 718 - }, - { - "epoch": 1.17, - "learning_rate": 2e-05, - "loss": 1.5357, - "step": 720 - }, - { - "epoch": 1.17, - "learning_rate": 2e-05, - "loss": 1.5656, - "step": 722 - }, - { - "epoch": 1.18, - "learning_rate": 2e-05, - "loss": 1.665, - "step": 724 - }, - { - "epoch": 1.18, - "learning_rate": 2e-05, - "loss": 1.6541, - "step": 726 - }, - { - "epoch": 1.18, - "learning_rate": 2e-05, - "loss": 1.7007, - "step": 728 - }, - { - "epoch": 1.19, - "learning_rate": 2e-05, - "loss": 1.6715, - "step": 730 - }, - { - "epoch": 1.19, - "learning_rate": 2e-05, - "loss": 1.7646, - "step": 732 - }, - { - "epoch": 1.19, - "learning_rate": 2e-05, - "loss": 1.7159, - "step": 734 - }, - { - "epoch": 1.2, - "learning_rate": 2e-05, - "loss": 1.8008, - "step": 736 - }, - { - "epoch": 1.2, - "learning_rate": 2e-05, - "loss": 1.7396, - "step": 738 - }, - { - "epoch": 1.2, - "learning_rate": 2e-05, - "loss": 1.5893, - "step": 740 - }, - { - "epoch": 1.21, - "learning_rate": 2e-05, - "loss": 1.7379, - "step": 742 - }, - { - "epoch": 1.21, - "learning_rate": 2e-05, - "loss": 1.9598, - "step": 744 - }, - { - "epoch": 1.21, - "learning_rate": 2e-05, - "loss": 1.767, - "step": 746 - }, - { - "epoch": 1.22, - "learning_rate": 2e-05, - "loss": 1.6946, - "step": 748 - }, - { - "epoch": 1.22, - "learning_rate": 2e-05, - "loss": 1.9439, - "step": 750 - }, - { - "epoch": 1.22, - "learning_rate": 2e-05, - "loss": 1.809, - "step": 752 - }, - { - "epoch": 1.23, - "learning_rate": 2e-05, - "loss": 1.9658, - "step": 754 - }, - { - "epoch": 1.23, - "learning_rate": 2e-05, - "loss": 2.3269, - "step": 756 - }, - { - "epoch": 1.23, - "learning_rate": 2e-05, - "loss": 2.1706, - "step": 758 - }, - { - "epoch": 1.24, - "learning_rate": 2e-05, - "loss": 2.1369, - "step": 760 - }, - { - "epoch": 1.24, - "learning_rate": 2e-05, - "loss": 2.2149, - "step": 762 - }, - { - "epoch": 1.24, - "learning_rate": 2e-05, - "loss": 2.2046, - "step": 764 - }, - { - "epoch": 1.24, - "learning_rate": 2e-05, - "loss": 2.0422, - "step": 766 - }, - { - "epoch": 1.25, - "learning_rate": 2e-05, - "loss": 1.5257, - "step": 768 - }, - { - "epoch": 1.25, - "learning_rate": 2e-05, - "loss": 1.5342, - "step": 770 - }, - { - "epoch": 1.25, - "learning_rate": 2e-05, - "loss": 1.6119, - "step": 772 - }, - { - "epoch": 1.26, - "learning_rate": 2e-05, - "loss": 1.4974, - "step": 774 - }, - { - "epoch": 1.26, - "learning_rate": 2e-05, - "loss": 1.6021, - "step": 776 - }, - { - "epoch": 1.26, - "learning_rate": 2e-05, - "loss": 1.7111, - "step": 778 - }, - { - "epoch": 1.27, - "learning_rate": 2e-05, - "loss": 1.7096, - "step": 780 - }, - { - "epoch": 1.27, - "learning_rate": 2e-05, - "loss": 1.7536, - "step": 782 - }, - { - "epoch": 1.27, - "learning_rate": 2e-05, - "loss": 1.7301, - "step": 784 - }, - { - "epoch": 1.28, - "learning_rate": 2e-05, - "loss": 1.7513, - "step": 786 - }, - { - "epoch": 1.28, - "learning_rate": 2e-05, - "loss": 1.6764, - "step": 788 - }, - { - "epoch": 1.28, - "learning_rate": 2e-05, - "loss": 1.6348, - "step": 790 - }, - { - "epoch": 1.29, - "learning_rate": 2e-05, - "loss": 1.7046, - "step": 792 - }, - { - "epoch": 1.29, - "learning_rate": 2e-05, - "loss": 1.8024, - "step": 794 - }, - { - "epoch": 1.29, - "learning_rate": 2e-05, - "loss": 1.8867, - "step": 796 - }, - { - "epoch": 1.3, - "learning_rate": 2e-05, - "loss": 1.8189, - "step": 798 - }, - { - "epoch": 1.3, - "learning_rate": 2e-05, - "loss": 2.0048, - "step": 800 - }, - { - "epoch": 1.3, - "learning_rate": 2e-05, - "loss": 1.7588, - "step": 802 - }, - { - "epoch": 1.31, - "learning_rate": 2e-05, - "loss": 1.9648, - "step": 804 - }, - { - "epoch": 1.31, - "learning_rate": 2e-05, - "loss": 2.1632, - "step": 806 - }, - { - "epoch": 1.31, - "learning_rate": 2e-05, - "loss": 2.1084, - "step": 808 - }, - { - "epoch": 1.32, - "learning_rate": 2e-05, - "loss": 2.1, - "step": 810 - }, - { - "epoch": 1.32, - "learning_rate": 2e-05, - "loss": 2.183, - "step": 812 - }, - { - "epoch": 1.32, - "learning_rate": 2e-05, - "loss": 2.2913, - "step": 814 - }, - { - "epoch": 1.33, - "learning_rate": 2e-05, - "loss": 1.8887, - "step": 816 - }, - { - "epoch": 1.33, - "learning_rate": 2e-05, - "loss": 1.578, - "step": 818 - }, - { - "epoch": 1.33, - "learning_rate": 2e-05, - "loss": 1.4916, - "step": 820 - }, - { - "epoch": 1.34, - "learning_rate": 2e-05, - "loss": 1.5374, - "step": 822 - }, - { - "epoch": 1.34, - "learning_rate": 2e-05, - "loss": 1.625, - "step": 824 - }, - { - "epoch": 1.34, - "learning_rate": 2e-05, - "loss": 1.7883, - "step": 826 - }, - { - "epoch": 1.35, - "learning_rate": 2e-05, - "loss": 1.7382, - "step": 828 - }, - { - "epoch": 1.35, - "learning_rate": 2e-05, - "loss": 1.6878, - "step": 830 - }, - { - "epoch": 1.35, - "learning_rate": 2e-05, - "loss": 1.5995, - "step": 832 - }, - { - "epoch": 1.36, - "learning_rate": 2e-05, - "loss": 1.8318, - "step": 834 - }, - { - "epoch": 1.36, - "learning_rate": 2e-05, - "loss": 1.7415, - "step": 836 - }, - { - "epoch": 1.36, - "learning_rate": 2e-05, - "loss": 1.8562, - "step": 838 - }, - { - "epoch": 1.37, - "learning_rate": 2e-05, - "loss": 1.7258, - "step": 840 - }, - { - "epoch": 1.37, - "learning_rate": 2e-05, - "loss": 1.8745, - "step": 842 - }, - { - "epoch": 1.37, - "learning_rate": 2e-05, - "loss": 1.6258, - "step": 844 - }, - { - "epoch": 1.37, - "learning_rate": 2e-05, - "loss": 1.8662, - "step": 846 - }, - { - "epoch": 1.38, - "learning_rate": 2e-05, - "loss": 1.8824, - "step": 848 - }, - { - "epoch": 1.38, - "learning_rate": 2e-05, - "loss": 1.8848, - "step": 850 - }, - { - "epoch": 1.38, - "learning_rate": 2e-05, - "loss": 2.0283, - "step": 852 - }, - { - "epoch": 1.39, - "learning_rate": 2e-05, - "loss": 1.9087, - "step": 854 - }, - { - "epoch": 1.39, - "learning_rate": 2e-05, - "loss": 2.1342, - "step": 856 - }, - { - "epoch": 1.39, - "learning_rate": 2e-05, - "loss": 2.3005, - "step": 858 - }, - { - "epoch": 1.4, - "learning_rate": 2e-05, - "loss": 2.4106, - "step": 860 - }, - { - "epoch": 1.4, - "learning_rate": 2e-05, - "loss": 2.4499, - "step": 862 - }, - { - "epoch": 1.4, - "learning_rate": 2e-05, - "loss": 2.3442, - "step": 864 - }, - { - "epoch": 1.41, - "learning_rate": 2e-05, - "loss": 2.0918, - "step": 866 - }, - { - "epoch": 1.41, - "learning_rate": 2e-05, - "loss": 1.6702, - "step": 868 - }, - { - "epoch": 1.41, - "learning_rate": 2e-05, - "loss": 1.4252, - "step": 870 - }, - { - "epoch": 1.42, - "learning_rate": 2e-05, - "loss": 1.4626, - "step": 872 - }, - { - "epoch": 1.42, - "learning_rate": 2e-05, - "loss": 1.5953, - "step": 874 - }, - { - "epoch": 1.42, - "learning_rate": 2e-05, - "loss": 1.7418, - "step": 876 - }, - { - "epoch": 1.43, - "learning_rate": 2e-05, - "loss": 1.7604, - "step": 878 - }, - { - "epoch": 1.43, - "learning_rate": 2e-05, - "loss": 1.6774, - "step": 880 - }, - { - "epoch": 1.43, - "learning_rate": 2e-05, - "loss": 1.6192, - "step": 882 - }, - { - "epoch": 1.44, - "learning_rate": 2e-05, - "loss": 1.7959, - "step": 884 - }, - { - "epoch": 1.44, - "learning_rate": 2e-05, - "loss": 1.691, - "step": 886 - }, - { - "epoch": 1.44, - "learning_rate": 2e-05, - "loss": 1.6482, - "step": 888 - }, - { - "epoch": 1.45, - "learning_rate": 2e-05, - "loss": 1.7933, - "step": 890 - }, - { - "epoch": 1.45, - "learning_rate": 2e-05, - "loss": 1.7287, - "step": 892 - }, - { - "epoch": 1.45, - "learning_rate": 2e-05, - "loss": 1.7073, - "step": 894 - }, - { - "epoch": 1.46, - "learning_rate": 2e-05, - "loss": 1.9, - "step": 896 - }, - { - "epoch": 1.46, - "learning_rate": 2e-05, - "loss": 1.8793, - "step": 898 - }, - { - "epoch": 1.46, - "learning_rate": 2e-05, - "loss": 1.7948, - "step": 900 - }, - { - "epoch": 1.47, - "learning_rate": 2e-05, - "loss": 1.9856, - "step": 902 - }, - { - "epoch": 1.47, - "learning_rate": 2e-05, - "loss": 2.0757, - "step": 904 - }, - { - "epoch": 1.47, - "learning_rate": 2e-05, - "loss": 2.0352, - "step": 906 - }, - { - "epoch": 1.48, - "learning_rate": 2e-05, - "loss": 2.2688, - "step": 908 - }, - { - "epoch": 1.48, - "learning_rate": 2e-05, - "loss": 2.041, - "step": 910 - }, - { - "epoch": 1.48, - "learning_rate": 2e-05, - "loss": 2.265, - "step": 912 - }, - { - "epoch": 1.49, - "learning_rate": 2e-05, - "loss": 2.1646, - "step": 914 - }, - { - "epoch": 1.49, - "learning_rate": 2e-05, - "loss": 1.8594, - "step": 916 - }, - { - "epoch": 1.49, - "learning_rate": 2e-05, - "loss": 1.5378, - "step": 918 - }, - { - "epoch": 1.5, - "learning_rate": 2e-05, - "loss": 1.6376, - "step": 920 - }, - { - "epoch": 1.5, - "learning_rate": 2e-05, - "loss": 1.522, - "step": 922 - }, - { - "epoch": 1.5, - "learning_rate": 2e-05, - "loss": 1.5827, - "step": 924 - }, - { - "epoch": 1.5, - "learning_rate": 2e-05, - "loss": 1.633, - "step": 926 - }, - { - "epoch": 1.51, - "learning_rate": 2e-05, - "loss": 1.6128, - "step": 928 - }, - { - "epoch": 1.51, - "learning_rate": 2e-05, - "loss": 1.7496, - "step": 930 - }, - { - "epoch": 1.51, - "learning_rate": 2e-05, - "loss": 1.7611, - "step": 932 - }, - { - "epoch": 1.52, - "learning_rate": 2e-05, - "loss": 1.7722, - "step": 934 - }, - { - "epoch": 1.52, - "learning_rate": 2e-05, - "loss": 1.8811, - "step": 936 - }, - { - "epoch": 1.52, - "learning_rate": 2e-05, - "loss": 1.8488, - "step": 938 - }, - { - "epoch": 1.53, - "learning_rate": 2e-05, - "loss": 1.6723, - "step": 940 - }, - { - "epoch": 1.53, - "learning_rate": 2e-05, - "loss": 1.6077, - "step": 942 - }, - { - "epoch": 1.53, - "learning_rate": 2e-05, - "loss": 1.7119, - "step": 944 - }, - { - "epoch": 1.54, - "learning_rate": 2e-05, - "loss": 1.9478, - "step": 946 - }, - { - "epoch": 1.54, - "learning_rate": 2e-05, - "loss": 1.9045, - "step": 948 - }, - { - "epoch": 1.54, - "learning_rate": 2e-05, - "loss": 1.8904, - "step": 950 - }, - { - "epoch": 1.55, - "learning_rate": 2e-05, - "loss": 2.1644, - "step": 952 - }, - { - "epoch": 1.55, - "learning_rate": 2e-05, - "loss": 2.1249, - "step": 954 - }, - { - "epoch": 1.55, - "learning_rate": 2e-05, - "loss": 1.9582, - "step": 956 - }, - { - "epoch": 1.56, - "learning_rate": 2e-05, - "loss": 1.9816, - "step": 958 - }, - { - "epoch": 1.56, - "learning_rate": 2e-05, - "loss": 2.2362, - "step": 960 - }, - { - "epoch": 1.56, - "learning_rate": 2e-05, - "loss": 2.1076, - "step": 962 - }, - { - "epoch": 1.57, - "learning_rate": 2e-05, - "loss": 2.2971, - "step": 964 - }, - { - "epoch": 1.57, - "learning_rate": 2e-05, - "loss": 2.007, - "step": 966 - }, - { - "epoch": 1.57, - "learning_rate": 2e-05, - "loss": 1.6316, - "step": 968 - }, - { - "epoch": 1.58, - "learning_rate": 2e-05, - "loss": 1.5545, - "step": 970 - }, - { - "epoch": 1.58, - "learning_rate": 2e-05, - "loss": 1.6186, - "step": 972 - }, - { - "epoch": 1.58, - "learning_rate": 2e-05, - "loss": 1.6096, - "step": 974 - }, - { - "epoch": 1.59, - "learning_rate": 2e-05, - "loss": 1.7449, - "step": 976 - }, - { - "epoch": 1.59, - "learning_rate": 2e-05, - "loss": 1.7391, - "step": 978 - }, - { - "epoch": 1.59, - "learning_rate": 2e-05, - "loss": 1.7077, - "step": 980 - }, - { - "epoch": 1.6, - "learning_rate": 2e-05, - "loss": 1.6408, - "step": 982 - }, - { - "epoch": 1.6, - "learning_rate": 2e-05, - "loss": 1.7383, - "step": 984 - }, - { - "epoch": 1.6, - "learning_rate": 2e-05, - "loss": 1.7688, - "step": 986 - }, - { - "epoch": 1.61, - "learning_rate": 2e-05, - "loss": 1.7425, - "step": 988 - }, - { - "epoch": 1.61, - "learning_rate": 2e-05, - "loss": 1.6938, - "step": 990 - }, - { - "epoch": 1.61, - "learning_rate": 2e-05, - "loss": 1.7232, - "step": 992 - }, - { - "epoch": 1.62, - "learning_rate": 2e-05, - "loss": 1.7924, - "step": 994 - }, - { - "epoch": 1.62, - "learning_rate": 2e-05, - "loss": 1.8015, - "step": 996 - }, - { - "epoch": 1.62, - "learning_rate": 2e-05, - "loss": 2.0395, - "step": 998 - }, - { - "epoch": 1.63, - "learning_rate": 2e-05, - "loss": 1.946, - "step": 1000 - }, - { - "epoch": 1.63, - "learning_rate": 2e-05, - "loss": 1.9046, - "step": 1002 - }, - { - "epoch": 1.63, - "learning_rate": 2e-05, - "loss": 2.0198, - "step": 1004 - }, - { - "epoch": 1.63, - "learning_rate": 2e-05, - "loss": 2.1411, - "step": 1006 - }, - { - "epoch": 1.64, - "learning_rate": 2e-05, - "loss": 1.8638, - "step": 1008 - }, - { - "epoch": 1.64, - "learning_rate": 2e-05, - "loss": 2.3288, - "step": 1010 - }, - { - "epoch": 1.64, - "learning_rate": 2e-05, - "loss": 2.1452, - "step": 1012 - }, - { - "epoch": 1.65, - "learning_rate": 2e-05, - "loss": 2.4029, - "step": 1014 - }, - { - "epoch": 1.65, - "learning_rate": 2e-05, - "loss": 1.721, - "step": 1016 - }, - { - "epoch": 1.65, - "learning_rate": 2e-05, - "loss": 1.7016, - "step": 1018 - }, - { - "epoch": 1.66, - "learning_rate": 2e-05, - "loss": 1.6516, - "step": 1020 - }, - { - "epoch": 1.66, - "learning_rate": 2e-05, - "loss": 1.5379, - "step": 1022 - }, - { - "epoch": 1.66, - "learning_rate": 2e-05, - "loss": 1.5188, - "step": 1024 - }, - { - "epoch": 1.67, - "learning_rate": 2e-05, - "loss": 1.6925, - "step": 1026 - }, - { - "epoch": 1.67, - "learning_rate": 2e-05, - "loss": 1.6326, - "step": 1028 - }, - { - "epoch": 1.67, - "learning_rate": 2e-05, - "loss": 1.7466, - "step": 1030 - }, - { - "epoch": 1.68, - "learning_rate": 2e-05, - "loss": 1.7166, - "step": 1032 - }, - { - "epoch": 1.68, - "learning_rate": 2e-05, - "loss": 1.8074, - "step": 1034 - }, - { - "epoch": 1.68, - "learning_rate": 2e-05, - "loss": 1.7404, - "step": 1036 - }, - { - "epoch": 1.69, - "learning_rate": 2e-05, - "loss": 1.6528, - "step": 1038 - }, - { - "epoch": 1.69, - "learning_rate": 2e-05, - "loss": 1.8054, - "step": 1040 - }, - { - "epoch": 1.69, - "learning_rate": 2e-05, - "loss": 1.9392, - "step": 1042 - }, - { - "epoch": 1.7, - "learning_rate": 2e-05, - "loss": 1.8748, - "step": 1044 - }, - { - "epoch": 1.7, - "learning_rate": 2e-05, - "loss": 1.9289, - "step": 1046 - }, - { - "epoch": 1.7, - "learning_rate": 2e-05, - "loss": 1.9047, - "step": 1048 - }, - { - "epoch": 1.71, - "learning_rate": 2e-05, - "loss": 2.0054, - "step": 1050 - }, - { - "epoch": 1.71, - "learning_rate": 2e-05, - "loss": 1.9701, - "step": 1052 - }, - { - "epoch": 1.71, - "learning_rate": 2e-05, - "loss": 2.0893, - "step": 1054 - }, - { - "epoch": 1.72, - "learning_rate": 2e-05, - "loss": 2.073, - "step": 1056 - }, - { - "epoch": 1.72, - "learning_rate": 2e-05, - "loss": 2.2494, - "step": 1058 - }, - { - "epoch": 1.72, - "learning_rate": 2e-05, - "loss": 2.3391, - "step": 1060 - }, - { - "epoch": 1.73, - "learning_rate": 2e-05, - "loss": 2.39, - "step": 1062 - }, - { - "epoch": 1.73, - "learning_rate": 2e-05, - "loss": 2.2983, - "step": 1064 - }, - { - "epoch": 1.73, - "learning_rate": 2e-05, - "loss": 1.9709, - "step": 1066 - }, - { - "epoch": 1.74, - "learning_rate": 2e-05, - "loss": 1.7431, - "step": 1068 - }, - { - "epoch": 1.74, - "learning_rate": 2e-05, - "loss": 1.5658, - "step": 1070 - }, - { - "epoch": 1.74, - "learning_rate": 2e-05, - "loss": 1.8086, - "step": 1072 - }, - { - "epoch": 1.75, - "learning_rate": 2e-05, - "loss": 1.6591, - "step": 1074 - }, - { - "epoch": 1.75, - "learning_rate": 2e-05, - "loss": 1.6473, - "step": 1076 - }, - { - "epoch": 1.75, - "learning_rate": 2e-05, - "loss": 1.6038, - "step": 1078 - }, - { - "epoch": 1.76, - "learning_rate": 2e-05, - "loss": 1.7007, - "step": 1080 - }, - { - "epoch": 1.76, - "learning_rate": 2e-05, - "loss": 1.7281, - "step": 1082 - }, - { - "epoch": 1.76, - "learning_rate": 2e-05, - "loss": 1.6404, - "step": 1084 - }, - { - "epoch": 1.76, - "learning_rate": 2e-05, - "loss": 1.7932, - "step": 1086 - }, - { - "epoch": 1.77, - "learning_rate": 2e-05, - "loss": 1.7749, - "step": 1088 - }, - { - "epoch": 1.77, - "learning_rate": 2e-05, - "loss": 1.7351, - "step": 1090 - }, - { - "epoch": 1.77, - "learning_rate": 2e-05, - "loss": 1.6254, - "step": 1092 - }, - { - "epoch": 1.78, - "learning_rate": 2e-05, - "loss": 1.9305, - "step": 1094 - }, - { - "epoch": 1.78, - "learning_rate": 2e-05, - "loss": 1.8313, - "step": 1096 - }, - { - "epoch": 1.78, - "learning_rate": 2e-05, - "loss": 1.8954, - "step": 1098 - }, - { - "epoch": 1.79, - "learning_rate": 2e-05, - "loss": 1.9984, - "step": 1100 - }, - { - "epoch": 1.79, - "learning_rate": 2e-05, - "loss": 2.0387, - "step": 1102 - }, - { - "epoch": 1.79, - "learning_rate": 2e-05, - "loss": 2.0381, - "step": 1104 - }, - { - "epoch": 1.8, - "learning_rate": 2e-05, - "loss": 2.248, - "step": 1106 - }, - { - "epoch": 1.8, - "learning_rate": 2e-05, - "loss": 2.094, - "step": 1108 - }, - { - "epoch": 1.8, - "learning_rate": 2e-05, - "loss": 2.0483, - "step": 1110 - }, - { - "epoch": 1.81, - "learning_rate": 2e-05, - "loss": 2.2801, - "step": 1112 - }, - { - "epoch": 1.81, - "learning_rate": 2e-05, - "loss": 2.1668, - "step": 1114 - }, - { - "epoch": 1.81, - "learning_rate": 2e-05, - "loss": 1.9295, - "step": 1116 - }, - { - "epoch": 1.82, - "learning_rate": 2e-05, - "loss": 1.5339, - "step": 1118 - }, - { - "epoch": 1.82, - "learning_rate": 2e-05, - "loss": 1.4249, - "step": 1120 - }, - { - "epoch": 1.82, - "learning_rate": 2e-05, - "loss": 1.6543, - "step": 1122 - }, - { - "epoch": 1.83, - "learning_rate": 2e-05, - "loss": 1.5864, - "step": 1124 - }, - { - "epoch": 1.83, - "learning_rate": 2e-05, - "loss": 1.728, - "step": 1126 - }, - { - "epoch": 1.83, - "learning_rate": 2e-05, - "loss": 1.6257, - "step": 1128 - }, - { - "epoch": 1.84, - "learning_rate": 2e-05, - "loss": 1.7116, - "step": 1130 - }, - { - "epoch": 1.84, - "learning_rate": 2e-05, - "loss": 1.7008, - "step": 1132 - }, - { - "epoch": 1.84, - "learning_rate": 2e-05, - "loss": 1.8602, - "step": 1134 - }, - { - "epoch": 1.85, - "learning_rate": 2e-05, - "loss": 1.6803, - "step": 1136 - }, - { - "epoch": 1.85, - "learning_rate": 2e-05, - "loss": 1.5637, - "step": 1138 - }, - { - "epoch": 1.85, - "learning_rate": 2e-05, - "loss": 1.8221, - "step": 1140 - }, - { - "epoch": 1.86, - "learning_rate": 2e-05, - "loss": 1.9582, - "step": 1142 - }, - { - "epoch": 1.86, - "learning_rate": 2e-05, - "loss": 1.9902, - "step": 1144 - }, - { - "epoch": 1.86, - "learning_rate": 2e-05, - "loss": 1.9478, - "step": 1146 - }, - { - "epoch": 1.87, - "learning_rate": 2e-05, - "loss": 2.0016, - "step": 1148 - }, - { - "epoch": 1.87, - "learning_rate": 2e-05, - "loss": 1.9449, - "step": 1150 - }, - { - "epoch": 1.87, - "learning_rate": 2e-05, - "loss": 1.9599, - "step": 1152 - }, - { - "epoch": 1.88, - "learning_rate": 2e-05, - "loss": 2.1458, - "step": 1154 - }, - { - "epoch": 1.88, - "learning_rate": 2e-05, - "loss": 2.0979, - "step": 1156 - }, - { - "epoch": 1.88, - "learning_rate": 2e-05, - "loss": 2.1437, - "step": 1158 - }, - { - "epoch": 1.89, - "learning_rate": 2e-05, - "loss": 2.1176, - "step": 1160 - }, - { - "epoch": 1.89, - "learning_rate": 2e-05, - "loss": 2.0171, - "step": 1162 - }, - { - "epoch": 1.89, - "learning_rate": 2e-05, - "loss": 2.1096, - "step": 1164 - }, - { - "epoch": 1.89, - "learning_rate": 2e-05, - "loss": 1.9555, - "step": 1166 - }, - { - "epoch": 1.9, - "learning_rate": 2e-05, - "loss": 1.5505, - "step": 1168 - }, - { - "epoch": 1.9, - "learning_rate": 2e-05, - "loss": 1.5644, - "step": 1170 - }, - { - "epoch": 1.9, - "learning_rate": 2e-05, - "loss": 1.6082, - "step": 1172 - }, - { - "epoch": 1.91, - "learning_rate": 2e-05, - "loss": 1.675, - "step": 1174 - }, - { - "epoch": 1.91, - "learning_rate": 2e-05, - "loss": 1.7006, - "step": 1176 - }, - { - "epoch": 1.91, - "learning_rate": 2e-05, - "loss": 1.6374, - "step": 1178 - }, - { - "epoch": 1.92, - "learning_rate": 2e-05, - "loss": 1.5624, - "step": 1180 - }, - { - "epoch": 1.92, - "learning_rate": 2e-05, - "loss": 1.875, - "step": 1182 - }, - { - "epoch": 1.92, - "learning_rate": 2e-05, - "loss": 1.7416, - "step": 1184 - }, - { - "epoch": 1.93, - "learning_rate": 2e-05, - "loss": 1.7585, - "step": 1186 - }, - { - "epoch": 1.93, - "learning_rate": 2e-05, - "loss": 1.6764, - "step": 1188 - }, - { - "epoch": 1.93, - "learning_rate": 2e-05, - "loss": 1.7482, - "step": 1190 - }, - { - "epoch": 1.94, - "learning_rate": 2e-05, - "loss": 1.7605, - "step": 1192 - }, - { - "epoch": 1.94, - "learning_rate": 2e-05, - "loss": 1.8099, - "step": 1194 - }, - { - "epoch": 1.94, - "learning_rate": 2e-05, - "loss": 1.7902, - "step": 1196 - }, - { - "epoch": 1.95, - "learning_rate": 2e-05, - "loss": 1.7466, - "step": 1198 - }, - { - "epoch": 1.95, - "learning_rate": 2e-05, - "loss": 1.8851, - "step": 1200 - }, - { - "epoch": 1.95, - "learning_rate": 2e-05, - "loss": 1.8968, - "step": 1202 - }, - { - "epoch": 1.96, - "learning_rate": 2e-05, - "loss": 2.1639, - "step": 1204 - }, - { - "epoch": 1.96, - "learning_rate": 2e-05, - "loss": 2.1004, - "step": 1206 - }, - { - "epoch": 1.96, - "learning_rate": 2e-05, - "loss": 2.068, - "step": 1208 - }, - { - "epoch": 1.97, - "learning_rate": 2e-05, - "loss": 2.1009, - "step": 1210 - }, - { - "epoch": 1.97, - "learning_rate": 2e-05, - "loss": 2.2122, - "step": 1212 - }, - { - "epoch": 1.97, - "learning_rate": 2e-05, - "loss": 2.2953, - "step": 1214 - }, - { - "epoch": 1.98, - "learning_rate": 2e-05, - "loss": 1.9851, - "step": 1216 - }, - { - "epoch": 1.98, - "learning_rate": 2e-05, - "loss": 1.5092, - "step": 1218 - }, - { - "epoch": 1.98, - "learning_rate": 2e-05, - "loss": 1.58, - "step": 1220 - }, - { - "epoch": 1.99, - "learning_rate": 2e-05, - "loss": 1.6222, - "step": 1222 - }, - { - "epoch": 1.99, - "learning_rate": 2e-05, - "loss": 1.7341, - "step": 1224 - }, - { - "epoch": 1.99, - "learning_rate": 2e-05, - "loss": 1.9206, - "step": 1226 - }, - { - "epoch": 2.0, - "learning_rate": 2e-05, - "loss": 1.9424, - "step": 1228 - }, - { - "epoch": 2.0, - "learning_rate": 2e-05, - "loss": 2.1776, - "step": 1230 - }, - { - "epoch": 2.0, - "learning_rate": 2e-05, - "loss": 2.021, - "step": 1232 - }, - { - "epoch": 2.01, - "learning_rate": 2e-05, - "loss": 1.5074, - "step": 1234 - }, - { - "epoch": 2.01, - "learning_rate": 2e-05, - "loss": 1.441, - "step": 1236 - }, - { - "epoch": 2.01, - "learning_rate": 2e-05, - "loss": 1.619, - "step": 1238 - }, - { - "epoch": 2.02, - "learning_rate": 2e-05, - "loss": 1.6916, - "step": 1240 - }, - { - "epoch": 2.02, - "learning_rate": 2e-05, - "loss": 1.5746, - "step": 1242 - }, - { - "epoch": 2.02, - "learning_rate": 2e-05, - "loss": 1.7335, - "step": 1244 - }, - { - "epoch": 2.02, - "learning_rate": 2e-05, - "loss": 1.7544, - "step": 1246 - }, - { - "epoch": 2.03, - "learning_rate": 2e-05, - "loss": 1.6698, - "step": 1248 - }, - { - "epoch": 2.03, - "learning_rate": 2e-05, - "loss": 1.6689, - "step": 1250 - }, - { - "epoch": 2.03, - "learning_rate": 2e-05, - "loss": 1.8378, - "step": 1252 - }, - { - "epoch": 2.04, - "learning_rate": 2e-05, - "loss": 1.807, - "step": 1254 - }, - { - "epoch": 2.04, - "learning_rate": 2e-05, - "loss": 1.6988, - "step": 1256 - }, - { - "epoch": 2.04, - "learning_rate": 2e-05, - "loss": 1.5926, - "step": 1258 - }, - { - "epoch": 2.05, - "learning_rate": 2e-05, - "loss": 1.7638, - "step": 1260 - }, - { - "epoch": 2.05, - "learning_rate": 2e-05, - "loss": 1.7585, - "step": 1262 - }, - { - "epoch": 2.05, - "learning_rate": 2e-05, - "loss": 1.969, - "step": 1264 - }, - { - "epoch": 2.06, - "learning_rate": 2e-05, - "loss": 2.0372, - "step": 1266 - }, - { - "epoch": 2.06, - "learning_rate": 2e-05, - "loss": 1.8681, - "step": 1268 - }, - { - "epoch": 2.06, - "learning_rate": 2e-05, - "loss": 1.9413, - "step": 1270 - }, - { - "epoch": 2.07, - "learning_rate": 2e-05, - "loss": 2.0071, - "step": 1272 - }, - { - "epoch": 2.07, - "learning_rate": 2e-05, - "loss": 2.0781, - "step": 1274 - }, - { - "epoch": 2.07, - "learning_rate": 2e-05, - "loss": 2.2626, - "step": 1276 - }, - { - "epoch": 2.08, - "learning_rate": 2e-05, - "loss": 2.3026, - "step": 1278 - }, - { - "epoch": 2.08, - "learning_rate": 2e-05, - "loss": 2.0699, - "step": 1280 - }, - { - "epoch": 2.08, - "learning_rate": 2e-05, - "loss": 1.6375, - "step": 1282 - }, - { - "epoch": 2.09, - "learning_rate": 2e-05, - "loss": 1.5935, - "step": 1284 - }, - { - "epoch": 2.09, - "learning_rate": 2e-05, - "loss": 1.6579, - "step": 1286 - }, - { - "epoch": 2.09, - "learning_rate": 2e-05, - "loss": 1.5962, - "step": 1288 - }, - { - "epoch": 2.1, - "learning_rate": 2e-05, - "loss": 1.5371, - "step": 1290 - }, - { - "epoch": 2.1, - "learning_rate": 2e-05, - "loss": 1.6297, - "step": 1292 - }, - { - "epoch": 2.1, - "learning_rate": 2e-05, - "loss": 1.5664, - "step": 1294 - }, - { - "epoch": 2.11, - "learning_rate": 2e-05, - "loss": 1.6135, - "step": 1296 - }, - { - "epoch": 2.11, - "learning_rate": 2e-05, - "loss": 1.7271, - "step": 1298 - }, - { - "epoch": 2.11, - "learning_rate": 2e-05, - "loss": 1.7224, - "step": 1300 - }, - { - "epoch": 2.12, - "learning_rate": 2e-05, - "loss": 1.6938, - "step": 1302 - }, - { - "epoch": 2.12, - "learning_rate": 2e-05, - "loss": 1.6161, - "step": 1304 - }, - { - "epoch": 2.12, - "learning_rate": 2e-05, - "loss": 1.7213, - "step": 1306 - }, - { - "epoch": 2.13, - "learning_rate": 2e-05, - "loss": 1.7919, - "step": 1308 - }, - { - "epoch": 2.13, - "learning_rate": 2e-05, - "loss": 1.6923, - "step": 1310 - }, - { - "epoch": 2.13, - "learning_rate": 2e-05, - "loss": 1.9948, - "step": 1312 - }, - { - "epoch": 2.14, - "learning_rate": 2e-05, - "loss": 1.8132, - "step": 1314 - }, - { - "epoch": 2.14, - "learning_rate": 2e-05, - "loss": 1.9876, - "step": 1316 - }, - { - "epoch": 2.14, - "learning_rate": 2e-05, - "loss": 1.9332, - "step": 1318 - }, - { - "epoch": 2.15, - "learning_rate": 2e-05, - "loss": 2.0376, - "step": 1320 - }, - { - "epoch": 2.15, - "learning_rate": 2e-05, - "loss": 2.1336, - "step": 1322 - }, - { - "epoch": 2.15, - "learning_rate": 2e-05, - "loss": 2.1492, - "step": 1324 - }, - { - "epoch": 2.15, - "learning_rate": 2e-05, - "loss": 2.2404, - "step": 1326 - }, - { - "epoch": 2.16, - "learning_rate": 2e-05, - "loss": 2.0328, - "step": 1328 - }, - { - "epoch": 2.16, - "learning_rate": 2e-05, - "loss": 2.2208, - "step": 1330 - }, - { - "epoch": 2.16, - "learning_rate": 2e-05, - "loss": 1.6264, - "step": 1332 - }, - { - "epoch": 2.17, - "learning_rate": 2e-05, - "loss": 1.6469, - "step": 1334 - }, - { - "epoch": 2.17, - "learning_rate": 2e-05, - "loss": 1.4819, - "step": 1336 - }, - { - "epoch": 2.17, - "learning_rate": 2e-05, - "loss": 1.6929, - "step": 1338 - }, - { - "epoch": 2.18, - "learning_rate": 2e-05, - "loss": 1.5354, - "step": 1340 - }, - { - "epoch": 2.18, - "learning_rate": 2e-05, - "loss": 1.5995, - "step": 1342 - }, - { - "epoch": 2.18, - "learning_rate": 2e-05, - "loss": 1.7638, - "step": 1344 - }, - { - "epoch": 2.19, - "learning_rate": 2e-05, - "loss": 1.7488, - "step": 1346 - }, - { - "epoch": 2.19, - "learning_rate": 2e-05, - "loss": 1.8151, - "step": 1348 - }, - { - "epoch": 2.19, - "learning_rate": 2e-05, - "loss": 1.7435, - "step": 1350 - }, - { - "epoch": 2.2, - "learning_rate": 2e-05, - "loss": 1.7724, - "step": 1352 - }, - { - "epoch": 2.2, - "learning_rate": 2e-05, - "loss": 1.7307, - "step": 1354 - }, - { - "epoch": 2.2, - "learning_rate": 2e-05, - "loss": 1.6689, - "step": 1356 - }, - { - "epoch": 2.21, - "learning_rate": 2e-05, - "loss": 1.8689, - "step": 1358 - }, - { - "epoch": 2.21, - "learning_rate": 2e-05, - "loss": 1.6998, - "step": 1360 - }, - { - "epoch": 2.21, - "learning_rate": 2e-05, - "loss": 2.0046, - "step": 1362 - }, - { - "epoch": 2.22, - "learning_rate": 2e-05, - "loss": 1.7219, - "step": 1364 - }, - { - "epoch": 2.22, - "learning_rate": 2e-05, - "loss": 1.8812, - "step": 1366 - }, - { - "epoch": 2.22, - "learning_rate": 2e-05, - "loss": 1.8674, - "step": 1368 - }, - { - "epoch": 2.23, - "learning_rate": 2e-05, - "loss": 2.0153, - "step": 1370 - }, - { - "epoch": 2.23, - "learning_rate": 2e-05, - "loss": 2.1036, - "step": 1372 - }, - { - "epoch": 2.23, - "learning_rate": 2e-05, - "loss": 2.2218, - "step": 1374 - }, - { - "epoch": 2.24, - "learning_rate": 2e-05, - "loss": 2.2705, - "step": 1376 - }, - { - "epoch": 2.24, - "learning_rate": 2e-05, - "loss": 2.1251, - "step": 1378 - }, - { - "epoch": 2.24, - "learning_rate": 2e-05, - "loss": 2.27, - "step": 1380 - }, - { - "epoch": 2.25, - "learning_rate": 2e-05, - "loss": 1.7102, - "step": 1382 - }, - { - "epoch": 2.25, - "learning_rate": 2e-05, - "loss": 1.5466, - "step": 1384 - }, - { - "epoch": 2.25, - "learning_rate": 2e-05, - "loss": 1.7316, - "step": 1386 - }, - { - "epoch": 2.26, - "learning_rate": 2e-05, - "loss": 1.5031, - "step": 1388 - }, - { - "epoch": 2.26, - "learning_rate": 2e-05, - "loss": 1.608, - "step": 1390 - }, - { - "epoch": 2.26, - "learning_rate": 2e-05, - "loss": 1.5033, - "step": 1392 - }, - { - "epoch": 2.27, - "learning_rate": 2e-05, - "loss": 1.733, - "step": 1394 - }, - { - "epoch": 2.27, - "learning_rate": 2e-05, - "loss": 1.7289, - "step": 1396 - }, - { - "epoch": 2.27, - "learning_rate": 2e-05, - "loss": 1.7305, - "step": 1398 - }, - { - "epoch": 2.28, - "learning_rate": 2e-05, - "loss": 1.7743, - "step": 1400 - }, - { - "epoch": 2.28, - "learning_rate": 2e-05, - "loss": 1.8423, - "step": 1402 - }, - { - "epoch": 2.28, - "learning_rate": 2e-05, - "loss": 1.6997, - "step": 1404 - }, - { - "epoch": 2.28, - "learning_rate": 2e-05, - "loss": 1.8342, - "step": 1406 - }, - { - "epoch": 2.29, - "learning_rate": 2e-05, - "loss": 1.6843, - "step": 1408 - }, - { - "epoch": 2.29, - "learning_rate": 2e-05, - "loss": 1.8158, - "step": 1410 - }, - { - "epoch": 2.29, - "learning_rate": 2e-05, - "loss": 1.7417, - "step": 1412 - }, - { - "epoch": 2.3, - "learning_rate": 2e-05, - "loss": 1.8145, - "step": 1414 - }, - { - "epoch": 2.3, - "learning_rate": 2e-05, - "loss": 1.9312, - "step": 1416 - }, - { - "epoch": 2.3, - "learning_rate": 2e-05, - "loss": 1.9542, - "step": 1418 - }, - { - "epoch": 2.31, - "learning_rate": 2e-05, - "loss": 2.0767, - "step": 1420 - }, - { - "epoch": 2.31, - "learning_rate": 2e-05, - "loss": 1.9053, - "step": 1422 - }, - { - "epoch": 2.31, - "learning_rate": 2e-05, - "loss": 2.4192, - "step": 1424 - }, - { - "epoch": 2.32, - "learning_rate": 2e-05, - "loss": 2.1551, - "step": 1426 - }, - { - "epoch": 2.32, - "learning_rate": 2e-05, - "loss": 2.1658, - "step": 1428 - }, - { - "epoch": 2.32, - "learning_rate": 2e-05, - "loss": 2.1308, - "step": 1430 - }, - { - "epoch": 2.33, - "learning_rate": 2e-05, - "loss": 1.6135, - "step": 1432 - }, - { - "epoch": 2.33, - "learning_rate": 2e-05, - "loss": 1.5897, - "step": 1434 - }, - { - "epoch": 2.33, - "learning_rate": 2e-05, - "loss": 1.4971, - "step": 1436 - }, - { - "epoch": 2.34, - "learning_rate": 2e-05, - "loss": 1.5434, - "step": 1438 - }, - { - "epoch": 2.34, - "learning_rate": 2e-05, - "loss": 1.6953, - "step": 1440 - }, - { - "epoch": 2.34, - "learning_rate": 2e-05, - "loss": 1.6321, - "step": 1442 - }, - { - "epoch": 2.35, - "learning_rate": 2e-05, - "loss": 1.572, - "step": 1444 - }, - { - "epoch": 2.35, - "learning_rate": 2e-05, - "loss": 1.5595, - "step": 1446 - }, - { - "epoch": 2.35, - "learning_rate": 2e-05, - "loss": 1.8133, - "step": 1448 - }, - { - "epoch": 2.36, - "learning_rate": 2e-05, - "loss": 1.641, - "step": 1450 - }, - { - "epoch": 2.36, - "learning_rate": 2e-05, - "loss": 1.6171, - "step": 1452 - }, - { - "epoch": 2.36, - "learning_rate": 2e-05, - "loss": 1.6589, - "step": 1454 - }, - { - "epoch": 2.37, - "learning_rate": 2e-05, - "loss": 1.786, - "step": 1456 - }, - { - "epoch": 2.37, - "learning_rate": 2e-05, - "loss": 1.7184, - "step": 1458 - }, - { - "epoch": 2.37, - "learning_rate": 2e-05, - "loss": 1.847, - "step": 1460 - }, - { - "epoch": 2.38, - "learning_rate": 2e-05, - "loss": 1.7423, - "step": 1462 - }, - { - "epoch": 2.38, - "learning_rate": 2e-05, - "loss": 2.0235, - "step": 1464 - }, - { - "epoch": 2.38, - "learning_rate": 2e-05, - "loss": 1.9874, - "step": 1466 - }, - { - "epoch": 2.39, - "learning_rate": 2e-05, - "loss": 1.9473, - "step": 1468 - }, - { - "epoch": 2.39, - "learning_rate": 2e-05, - "loss": 2.0052, - "step": 1470 - }, - { - "epoch": 2.39, - "learning_rate": 2e-05, - "loss": 1.9972, - "step": 1472 - }, - { - "epoch": 2.4, - "learning_rate": 2e-05, - "loss": 2.0354, - "step": 1474 - }, - { - "epoch": 2.4, - "learning_rate": 2e-05, - "loss": 2.1694, - "step": 1476 - }, - { - "epoch": 2.4, - "learning_rate": 2e-05, - "loss": 2.0919, - "step": 1478 - }, - { - "epoch": 2.41, - "learning_rate": 2e-05, - "loss": 2.1576, - "step": 1480 - }, - { - "epoch": 2.41, - "learning_rate": 2e-05, - "loss": 1.6458, - "step": 1482 - }, - { - "epoch": 2.41, - "learning_rate": 2e-05, - "loss": 1.5015, - "step": 1484 - }, - { - "epoch": 2.41, - "learning_rate": 2e-05, - "loss": 1.5359, - "step": 1486 - }, - { - "epoch": 2.42, - "learning_rate": 2e-05, - "loss": 1.7065, - "step": 1488 - }, - { - "epoch": 2.42, - "learning_rate": 2e-05, - "loss": 1.6893, - "step": 1490 - }, - { - "epoch": 2.42, - "learning_rate": 2e-05, - "loss": 1.7842, - "step": 1492 - }, - { - "epoch": 2.43, - "learning_rate": 2e-05, - "loss": 1.7229, - "step": 1494 - }, - { - "epoch": 2.43, - "learning_rate": 2e-05, - "loss": 1.6822, - "step": 1496 - }, - { - "epoch": 2.43, - "learning_rate": 2e-05, - "loss": 1.664, - "step": 1498 - }, - { - "epoch": 2.44, - "learning_rate": 2e-05, - "loss": 1.7587, - "step": 1500 - }, - { - "epoch": 2.44, - "learning_rate": 2e-05, - "loss": 1.8268, - "step": 1502 - }, - { - "epoch": 2.44, - "learning_rate": 2e-05, - "loss": 1.6934, - "step": 1504 - }, - { - "epoch": 2.45, - "learning_rate": 2e-05, - "loss": 1.8765, - "step": 1506 - }, - { - "epoch": 2.45, - "learning_rate": 2e-05, - "loss": 1.845, - "step": 1508 - }, - { - "epoch": 2.45, - "learning_rate": 2e-05, - "loss": 1.8817, - "step": 1510 - }, - { - "epoch": 2.46, - "learning_rate": 2e-05, - "loss": 1.727, - "step": 1512 - }, - { - "epoch": 2.46, - "learning_rate": 2e-05, - "loss": 1.848, - "step": 1514 - }, - { - "epoch": 2.46, - "learning_rate": 2e-05, - "loss": 2.0766, - "step": 1516 - }, - { - "epoch": 2.47, - "learning_rate": 2e-05, - "loss": 2.0203, - "step": 1518 - }, - { - "epoch": 2.47, - "learning_rate": 2e-05, - "loss": 2.0457, - "step": 1520 - }, - { - "epoch": 2.47, - "learning_rate": 2e-05, - "loss": 2.1893, - "step": 1522 - }, - { - "epoch": 2.48, - "learning_rate": 2e-05, - "loss": 1.9883, - "step": 1524 - }, - { - "epoch": 2.48, - "learning_rate": 2e-05, - "loss": 2.0896, - "step": 1526 - }, - { - "epoch": 2.48, - "learning_rate": 2e-05, - "loss": 2.1288, - "step": 1528 - }, - { - "epoch": 2.49, - "learning_rate": 2e-05, - "loss": 2.2434, - "step": 1530 - }, - { - "epoch": 2.49, - "learning_rate": 2e-05, - "loss": 1.8621, - "step": 1532 - }, - { - "epoch": 2.49, - "learning_rate": 2e-05, - "loss": 1.5902, - "step": 1534 - }, - { - "epoch": 2.5, - "learning_rate": 2e-05, - "loss": 1.6382, - "step": 1536 - }, - { - "epoch": 2.5, - "learning_rate": 2e-05, - "loss": 1.5176, - "step": 1538 - }, - { - "epoch": 2.5, - "learning_rate": 2e-05, - "loss": 1.7048, - "step": 1540 - }, - { - "epoch": 2.51, - "learning_rate": 2e-05, - "loss": 1.7238, - "step": 1542 - }, - { - "epoch": 2.51, - "learning_rate": 2e-05, - "loss": 1.7027, - "step": 1544 - }, - { - "epoch": 2.51, - "learning_rate": 2e-05, - "loss": 1.6627, - "step": 1546 - }, - { - "epoch": 2.52, - "learning_rate": 2e-05, - "loss": 1.6683, - "step": 1548 - }, - { - "epoch": 2.52, - "learning_rate": 2e-05, - "loss": 1.732, - "step": 1550 - }, - { - "epoch": 2.52, - "learning_rate": 2e-05, - "loss": 1.7039, - "step": 1552 - }, - { - "epoch": 2.53, - "learning_rate": 2e-05, - "loss": 1.5855, - "step": 1554 - }, - { - "epoch": 2.53, - "learning_rate": 2e-05, - "loss": 1.6969, - "step": 1556 - }, - { - "epoch": 2.53, - "learning_rate": 2e-05, - "loss": 1.7362, - "step": 1558 - }, - { - "epoch": 2.54, - "learning_rate": 2e-05, - "loss": 1.6448, - "step": 1560 - }, - { - "epoch": 2.54, - "learning_rate": 2e-05, - "loss": 1.7654, - "step": 1562 - }, - { - "epoch": 2.54, - "learning_rate": 2e-05, - "loss": 1.7655, - "step": 1564 - }, - { - "epoch": 2.54, - "learning_rate": 2e-05, - "loss": 1.8978, - "step": 1566 - }, - { - "epoch": 2.55, - "learning_rate": 2e-05, - "loss": 1.9586, - "step": 1568 - }, - { - "epoch": 2.55, - "learning_rate": 2e-05, - "loss": 2.0546, - "step": 1570 - }, - { - "epoch": 2.55, - "learning_rate": 2e-05, - "loss": 1.9625, - "step": 1572 - }, - { - "epoch": 2.56, - "learning_rate": 2e-05, - "loss": 2.0044, - "step": 1574 - }, - { - "epoch": 2.56, - "learning_rate": 2e-05, - "loss": 2.0529, - "step": 1576 - }, - { - "epoch": 2.56, - "learning_rate": 2e-05, - "loss": 2.1048, - "step": 1578 - }, - { - "epoch": 2.57, - "learning_rate": 2e-05, - "loss": 2.0548, - "step": 1580 - }, - { - "epoch": 2.57, - "learning_rate": 2e-05, - "loss": 1.7534, - "step": 1582 - }, - { - "epoch": 2.57, - "learning_rate": 2e-05, - "loss": 1.4283, - "step": 1584 - }, - { - "epoch": 2.58, - "learning_rate": 2e-05, - "loss": 1.6125, - "step": 1586 - }, - { - "epoch": 2.58, - "learning_rate": 2e-05, - "loss": 1.5568, - "step": 1588 - }, - { - "epoch": 2.58, - "learning_rate": 2e-05, - "loss": 1.6719, - "step": 1590 - }, - { - "epoch": 2.59, - "learning_rate": 2e-05, - "loss": 1.6142, - "step": 1592 - }, - { - "epoch": 2.59, - "learning_rate": 2e-05, - "loss": 1.6434, - "step": 1594 - }, - { - "epoch": 2.59, - "learning_rate": 2e-05, - "loss": 1.7194, - "step": 1596 - }, - { - "epoch": 2.6, - "learning_rate": 2e-05, - "loss": 1.7541, - "step": 1598 - }, - { - "epoch": 2.6, - "learning_rate": 2e-05, - "loss": 1.6681, - "step": 1600 - }, - { - "epoch": 2.6, - "learning_rate": 2e-05, - "loss": 1.81, - "step": 1602 - }, - { - "epoch": 2.61, - "learning_rate": 2e-05, - "loss": 1.6948, - "step": 1604 - }, - { - "epoch": 2.61, - "learning_rate": 2e-05, - "loss": 1.7533, - "step": 1606 - }, - { - "epoch": 2.61, - "learning_rate": 2e-05, - "loss": 1.8256, - "step": 1608 - }, - { - "epoch": 2.62, - "learning_rate": 2e-05, - "loss": 1.8959, - "step": 1610 - }, - { - "epoch": 2.62, - "learning_rate": 2e-05, - "loss": 1.7526, - "step": 1612 - }, - { - "epoch": 2.62, - "learning_rate": 2e-05, - "loss": 1.8277, - "step": 1614 - }, - { - "epoch": 2.63, - "learning_rate": 2e-05, - "loss": 1.9229, - "step": 1616 - }, - { - "epoch": 2.63, - "learning_rate": 2e-05, - "loss": 1.9832, - "step": 1618 - }, - { - "epoch": 2.63, - "learning_rate": 2e-05, - "loss": 1.903, - "step": 1620 - }, - { - "epoch": 2.64, - "learning_rate": 2e-05, - "loss": 2.1626, - "step": 1622 - }, - { - "epoch": 2.64, - "learning_rate": 2e-05, - "loss": 2.2053, - "step": 1624 - }, - { - "epoch": 2.64, - "learning_rate": 2e-05, - "loss": 2.176, - "step": 1626 - }, - { - "epoch": 2.65, - "learning_rate": 2e-05, - "loss": 2.1653, - "step": 1628 - }, - { - "epoch": 2.65, - "learning_rate": 2e-05, - "loss": 2.179, - "step": 1630 - }, - { - "epoch": 2.65, - "learning_rate": 2e-05, - "loss": 1.7573, - "step": 1632 - }, - { - "epoch": 2.66, - "learning_rate": 2e-05, - "loss": 1.5932, - "step": 1634 - }, - { - "epoch": 2.66, - "learning_rate": 2e-05, - "loss": 1.5588, - "step": 1636 - }, - { - "epoch": 2.66, - "learning_rate": 2e-05, - "loss": 1.6959, - "step": 1638 - }, - { - "epoch": 2.67, - "learning_rate": 2e-05, - "loss": 1.598, - "step": 1640 - }, - { - "epoch": 2.67, - "learning_rate": 2e-05, - "loss": 1.748, - "step": 1642 - }, - { - "epoch": 2.67, - "learning_rate": 2e-05, - "loss": 1.7074, - "step": 1644 - }, - { - "epoch": 2.67, - "learning_rate": 2e-05, - "loss": 1.8215, - "step": 1646 - }, - { - "epoch": 2.68, - "learning_rate": 2e-05, - "loss": 1.6872, - "step": 1648 - }, - { - "epoch": 2.68, - "learning_rate": 2e-05, - "loss": 1.734, - "step": 1650 - }, - { - "epoch": 2.68, - "learning_rate": 2e-05, - "loss": 1.6693, - "step": 1652 - }, - { - "epoch": 2.69, - "learning_rate": 2e-05, - "loss": 1.7714, - "step": 1654 - }, - { - "epoch": 2.69, - "learning_rate": 2e-05, - "loss": 1.7195, - "step": 1656 - }, - { - "epoch": 2.69, - "learning_rate": 2e-05, - "loss": 1.7976, - "step": 1658 - }, - { - "epoch": 2.7, - "learning_rate": 2e-05, - "loss": 1.8392, - "step": 1660 - }, - { - "epoch": 2.7, - "learning_rate": 2e-05, - "loss": 1.7095, - "step": 1662 - }, - { - "epoch": 2.7, - "learning_rate": 2e-05, - "loss": 1.9426, - "step": 1664 - }, - { - "epoch": 2.71, - "learning_rate": 2e-05, - "loss": 1.9583, - "step": 1666 - }, - { - "epoch": 2.71, - "learning_rate": 2e-05, - "loss": 2.028, - "step": 1668 - }, - { - "epoch": 2.71, - "learning_rate": 2e-05, - "loss": 2.1083, - "step": 1670 - }, - { - "epoch": 2.72, - "learning_rate": 2e-05, - "loss": 2.1001, - "step": 1672 - }, - { - "epoch": 2.72, - "learning_rate": 2e-05, - "loss": 2.2984, - "step": 1674 - }, - { - "epoch": 2.72, - "learning_rate": 2e-05, - "loss": 2.0889, - "step": 1676 - }, - { - "epoch": 2.73, - "learning_rate": 2e-05, - "loss": 2.3576, - "step": 1678 - }, - { - "epoch": 2.73, - "learning_rate": 2e-05, - "loss": 2.0517, - "step": 1680 - }, - { - "epoch": 2.73, - "learning_rate": 2e-05, - "loss": 1.6826, - "step": 1682 - }, - { - "epoch": 2.74, - "learning_rate": 2e-05, - "loss": 1.5038, - "step": 1684 - }, - { - "epoch": 2.74, - "learning_rate": 2e-05, - "loss": 1.6221, - "step": 1686 - }, - { - "epoch": 2.74, - "learning_rate": 2e-05, - "loss": 1.5329, - "step": 1688 - }, - { - "epoch": 2.75, - "learning_rate": 2e-05, - "loss": 1.6428, - "step": 1690 - }, - { - "epoch": 2.75, - "learning_rate": 2e-05, - "loss": 1.6739, - "step": 1692 - }, - { - "epoch": 2.75, - "learning_rate": 2e-05, - "loss": 1.6846, - "step": 1694 - }, - { - "epoch": 2.76, - "learning_rate": 2e-05, - "loss": 1.7064, - "step": 1696 - }, - { - "epoch": 2.76, - "learning_rate": 2e-05, - "loss": 1.6818, - "step": 1698 - }, - { - "epoch": 2.76, - "learning_rate": 2e-05, - "loss": 1.7676, - "step": 1700 - }, - { - "epoch": 2.77, - "learning_rate": 2e-05, - "loss": 1.6708, - "step": 1702 - }, - { - "epoch": 2.77, - "learning_rate": 2e-05, - "loss": 1.7152, - "step": 1704 - }, - { - "epoch": 2.77, - "learning_rate": 2e-05, - "loss": 1.4785, - "step": 1706 - }, - { - "epoch": 2.78, - "learning_rate": 2e-05, - "loss": 1.9122, - "step": 1708 - }, - { - "epoch": 2.78, - "learning_rate": 2e-05, - "loss": 1.9274, - "step": 1710 - }, - { - "epoch": 2.78, - "learning_rate": 2e-05, - "loss": 1.982, - "step": 1712 - }, - { - "epoch": 2.79, - "learning_rate": 2e-05, - "loss": 1.9373, - "step": 1714 - }, - { - "epoch": 2.79, - "learning_rate": 2e-05, - "loss": 1.8395, - "step": 1716 - }, - { - "epoch": 2.79, - "learning_rate": 2e-05, - "loss": 1.8692, - "step": 1718 - }, - { - "epoch": 2.8, - "learning_rate": 2e-05, - "loss": 2.1762, - "step": 1720 - }, - { - "epoch": 2.8, - "learning_rate": 2e-05, - "loss": 2.0499, - "step": 1722 - }, - { - "epoch": 2.8, - "learning_rate": 2e-05, - "loss": 2.1598, - "step": 1724 - }, - { - "epoch": 2.8, - "learning_rate": 2e-05, - "loss": 2.2524, - "step": 1726 - }, - { - "epoch": 2.81, - "learning_rate": 2e-05, - "loss": 2.2295, - "step": 1728 - }, - { - "epoch": 2.81, - "learning_rate": 2e-05, - "loss": 2.4519, - "step": 1730 - }, - { - "epoch": 2.81, - "learning_rate": 2e-05, - "loss": 1.7711, - "step": 1732 - }, - { - "epoch": 2.82, - "learning_rate": 2e-05, - "loss": 1.5325, - "step": 1734 - }, - { - "epoch": 2.82, - "learning_rate": 2e-05, - "loss": 1.6951, - "step": 1736 - }, - { - "epoch": 2.82, - "learning_rate": 2e-05, - "loss": 1.5366, - "step": 1738 - }, - { - "epoch": 2.83, - "learning_rate": 2e-05, - "loss": 1.5947, - "step": 1740 - }, - { - "epoch": 2.83, - "learning_rate": 2e-05, - "loss": 1.6359, - "step": 1742 - }, - { - "epoch": 2.83, - "learning_rate": 2e-05, - "loss": 1.6945, - "step": 1744 - }, - { - "epoch": 2.84, - "learning_rate": 2e-05, - "loss": 1.7055, - "step": 1746 - }, - { - "epoch": 2.84, - "learning_rate": 2e-05, - "loss": 1.5715, - "step": 1748 - }, - { - "epoch": 2.84, - "learning_rate": 2e-05, - "loss": 1.6131, - "step": 1750 - }, - { - "epoch": 2.85, - "learning_rate": 2e-05, - "loss": 1.7353, - "step": 1752 - }, - { - "epoch": 2.85, - "learning_rate": 2e-05, - "loss": 1.6901, - "step": 1754 - }, - { - "epoch": 2.85, - "learning_rate": 2e-05, - "loss": 1.6287, - "step": 1756 - }, - { - "epoch": 2.86, - "learning_rate": 2e-05, - "loss": 1.7596, - "step": 1758 - }, - { - "epoch": 2.86, - "learning_rate": 2e-05, - "loss": 1.6615, - "step": 1760 - }, - { - "epoch": 2.86, - "learning_rate": 2e-05, - "loss": 1.7761, - "step": 1762 - }, - { - "epoch": 2.87, - "learning_rate": 2e-05, - "loss": 1.7353, - "step": 1764 - }, - { - "epoch": 2.87, - "learning_rate": 2e-05, - "loss": 2.0409, - "step": 1766 - }, - { - "epoch": 2.87, - "learning_rate": 2e-05, - "loss": 1.8417, - "step": 1768 - }, - { - "epoch": 2.88, - "learning_rate": 2e-05, - "loss": 1.9822, - "step": 1770 - }, - { - "epoch": 2.88, - "learning_rate": 2e-05, - "loss": 1.935, - "step": 1772 - }, - { - "epoch": 2.88, - "learning_rate": 2e-05, - "loss": 2.191, - "step": 1774 - }, - { - "epoch": 2.89, - "learning_rate": 2e-05, - "loss": 2.1619, - "step": 1776 - }, - { - "epoch": 2.89, - "learning_rate": 2e-05, - "loss": 2.179, - "step": 1778 - }, - { - "epoch": 2.89, - "learning_rate": 2e-05, - "loss": 2.4226, - "step": 1780 - }, - { - "epoch": 2.9, - "learning_rate": 2e-05, - "loss": 1.752, - "step": 1782 - }, - { - "epoch": 2.9, - "learning_rate": 2e-05, - "loss": 1.4133, - "step": 1784 - }, - { - "epoch": 2.9, - "learning_rate": 2e-05, - "loss": 1.3559, - "step": 1786 - }, - { - "epoch": 2.91, - "learning_rate": 2e-05, - "loss": 1.4966, - "step": 1788 - }, - { - "epoch": 2.91, - "learning_rate": 2e-05, - "loss": 1.5845, - "step": 1790 - }, - { - "epoch": 2.91, - "learning_rate": 2e-05, - "loss": 1.6532, - "step": 1792 - }, - { - "epoch": 2.92, - "learning_rate": 2e-05, - "loss": 1.6712, - "step": 1794 - }, - { - "epoch": 2.92, - "learning_rate": 2e-05, - "loss": 1.7321, - "step": 1796 - }, - { - "epoch": 2.92, - "learning_rate": 2e-05, - "loss": 1.4819, - "step": 1798 - }, - { - "epoch": 2.93, - "learning_rate": 2e-05, - "loss": 1.7174, - "step": 1800 } ], - "max_steps": 1845, + "logging_steps": 2, + "max_steps": 6807, "num_train_epochs": 3, - "total_flos": 3.908152132295885e+16, + "save_steps": 50, + "total_flos": 7760442301267968.0, "trial_name": null, "trial_params": null }