diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,136 +1,5416 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.36272953978689637, - "global_step": 200, + "epoch": 2.9250457038391224, + "global_step": 1800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ + { + "epoch": 0.0, + "learning_rate": 2e-05, + "loss": 1.6531, + "step": 2 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 1.7623, + "step": 4 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 1.4946, + "step": 6 + }, + { + "epoch": 0.01, + "learning_rate": 2e-05, + "loss": 1.8882, + "step": 8 + }, { "epoch": 0.02, "learning_rate": 2e-05, - "loss": 1.9524, + "loss": 1.7562, "step": 10 }, { - "epoch": 0.04, + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 1.875, + "step": 12 + }, + { + "epoch": 0.02, + "learning_rate": 2e-05, + "loss": 1.9855, + "step": 14 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 1.833, + "step": 16 + }, + { + "epoch": 0.03, "learning_rate": 2e-05, - "loss": 1.9249, + "loss": 1.7799, + "step": 18 + }, + { + "epoch": 0.03, + "learning_rate": 2e-05, + "loss": 1.9502, "step": 20 }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 1.8134, + "step": 22 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 1.8352, + "step": 24 + }, + { + "epoch": 0.04, + "learning_rate": 2e-05, + "loss": 1.9382, + "step": 26 + }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 2.2001, + "step": 28 + }, { "epoch": 0.05, "learning_rate": 2e-05, - "loss": 1.9685, + "loss": 1.9664, "step": 30 }, + { + "epoch": 0.05, + "learning_rate": 2e-05, + "loss": 2.0627, + "step": 32 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 2.0913, + "step": 34 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 2.1183, + "step": 36 + }, + { + "epoch": 0.06, + "learning_rate": 2e-05, + "loss": 2.2824, + "step": 38 + }, { "epoch": 0.07, "learning_rate": 2e-05, - "loss": 2.1624, + "loss": 2.2368, "step": 40 }, { - "epoch": 0.09, + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 2.3417, + "step": 42 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 2.3669, + "step": 44 + }, + { + "epoch": 0.07, + "learning_rate": 2e-05, + "loss": 2.5686, + "step": 46 + }, + { + "epoch": 0.08, + "learning_rate": 2e-05, + "loss": 2.6004, + "step": 48 + }, + { + "epoch": 0.08, "learning_rate": 2e-05, - "loss": 2.3455, + "loss": 2.7763, "step": 50 }, { - "epoch": 0.11, + "epoch": 0.08, + "learning_rate": 2e-05, + "loss": 1.7123, + "step": 52 + }, + { + "epoch": 0.09, + "learning_rate": 2e-05, + "loss": 1.5842, + "step": 54 + }, + { + "epoch": 0.09, + "learning_rate": 2e-05, + "loss": 1.634, + "step": 56 + }, + { + "epoch": 0.09, + "learning_rate": 2e-05, + "loss": 1.6568, + "step": 58 + }, + { + "epoch": 0.1, "learning_rate": 2e-05, - "loss": 1.7288, + "loss": 1.7682, "step": 60 }, { - "epoch": 0.13, + "epoch": 0.1, "learning_rate": 2e-05, - "loss": 1.712, + "loss": 1.7746, + "step": 62 + }, + { + "epoch": 0.1, + "learning_rate": 2e-05, + "loss": 1.7256, + "step": 64 + }, + { + "epoch": 0.11, + "learning_rate": 2e-05, + "loss": 1.8528, + "step": 66 + }, + { + "epoch": 0.11, + "learning_rate": 2e-05, + "loss": 1.8566, + "step": 68 + }, + { + "epoch": 0.11, + "learning_rate": 2e-05, + "loss": 1.8693, "step": 70 }, { - "epoch": 0.15, + "epoch": 0.12, + "learning_rate": 2e-05, + "loss": 1.9164, + "step": 72 + }, + { + "epoch": 0.12, + "learning_rate": 2e-05, + "loss": 1.8215, + "step": 74 + }, + { + "epoch": 0.12, "learning_rate": 2e-05, - "loss": 1.8443, + "loss": 1.6364, + "step": 76 + }, + { + "epoch": 0.13, + "learning_rate": 2e-05, + "loss": 1.9358, + "step": 78 + }, + { + "epoch": 0.13, + "learning_rate": 2e-05, + "loss": 1.829, "step": 80 }, { - "epoch": 0.16, + "epoch": 0.13, + "learning_rate": 2e-05, + "loss": 2.0402, + "step": 82 + }, + { + "epoch": 0.14, + "learning_rate": 2e-05, + "loss": 2.046, + "step": 84 + }, + { + "epoch": 0.14, "learning_rate": 2e-05, - "loss": 1.9731, + "loss": 1.9946, + "step": 86 + }, + { + "epoch": 0.14, + "learning_rate": 2e-05, + "loss": 2.0116, + "step": 88 + }, + { + "epoch": 0.15, + "learning_rate": 2e-05, + "loss": 2.1454, "step": 90 }, { - "epoch": 0.18, + "epoch": 0.15, + "learning_rate": 2e-05, + "loss": 2.3368, + "step": 92 + }, + { + "epoch": 0.15, + "learning_rate": 2e-05, + "loss": 2.3731, + "step": 94 + }, + { + "epoch": 0.16, + "learning_rate": 2e-05, + "loss": 2.3366, + "step": 96 + }, + { + "epoch": 0.16, + "learning_rate": 2e-05, + "loss": 2.3437, + "step": 98 + }, + { + "epoch": 0.16, "learning_rate": 2e-05, - "loss": 2.1967, + "loss": 2.524, "step": 100 }, { - "epoch": 0.2, + "epoch": 0.17, + "learning_rate": 2e-05, + "loss": 1.5793, + "step": 102 + }, + { + "epoch": 0.17, + "learning_rate": 2e-05, + "loss": 1.6267, + "step": 104 + }, + { + "epoch": 0.17, "learning_rate": 2e-05, - "loss": 1.6612, + "loss": 1.6976, + "step": 106 + }, + { + "epoch": 0.18, + "learning_rate": 2e-05, + "loss": 1.7402, + "step": 108 + }, + { + "epoch": 0.18, + "learning_rate": 2e-05, + "loss": 1.6939, "step": 110 }, { - "epoch": 0.22, + "epoch": 0.18, + "learning_rate": 2e-05, + "loss": 1.8356, + "step": 112 + }, + { + "epoch": 0.19, + "learning_rate": 2e-05, + "loss": 1.8122, + "step": 114 + }, + { + "epoch": 0.19, "learning_rate": 2e-05, - "loss": 1.6775, + "loss": 1.6891, + "step": 116 + }, + { + "epoch": 0.19, + "learning_rate": 2e-05, + "loss": 1.7671, + "step": 118 + }, + { + "epoch": 0.2, + "learning_rate": 2e-05, + "loss": 1.8497, "step": 120 }, { - "epoch": 0.24, + "epoch": 0.2, + "learning_rate": 2e-05, + "loss": 1.7755, + "step": 122 + }, + { + "epoch": 0.2, + "learning_rate": 2e-05, + "loss": 1.8383, + "step": 124 + }, + { + "epoch": 0.2, + "learning_rate": 2e-05, + "loss": 1.9323, + "step": 126 + }, + { + "epoch": 0.21, "learning_rate": 2e-05, - "loss": 1.7389, + "loss": 1.8231, + "step": 128 + }, + { + "epoch": 0.21, + "learning_rate": 2e-05, + "loss": 1.8705, "step": 130 }, { - "epoch": 0.25, + "epoch": 0.21, + "learning_rate": 2e-05, + "loss": 1.8914, + "step": 132 + }, + { + "epoch": 0.22, + "learning_rate": 2e-05, + "loss": 2.1717, + "step": 134 + }, + { + "epoch": 0.22, "learning_rate": 2e-05, - "loss": 1.9169, + "loss": 2.0413, + "step": 136 + }, + { + "epoch": 0.22, + "learning_rate": 2e-05, + "loss": 2.0281, + "step": 138 + }, + { + "epoch": 0.23, + "learning_rate": 2e-05, + "loss": 2.1997, "step": 140 }, { - "epoch": 0.27, + "epoch": 0.23, + "learning_rate": 2e-05, + "loss": 2.3263, + "step": 142 + }, + { + "epoch": 0.23, + "learning_rate": 2e-05, + "loss": 2.1469, + "step": 144 + }, + { + "epoch": 0.24, + "learning_rate": 2e-05, + "loss": 2.2804, + "step": 146 + }, + { + "epoch": 0.24, + "learning_rate": 2e-05, + "loss": 2.4742, + "step": 148 + }, + { + "epoch": 0.24, "learning_rate": 2e-05, - "loss": 2.2075, + "loss": 2.3649, "step": 150 }, { - "epoch": 0.29, + "epoch": 0.25, "learning_rate": 2e-05, - "loss": 1.6245, - "step": 160 + "loss": 1.6274, + "step": 152 }, { - "epoch": 0.31, + "epoch": 0.25, "learning_rate": 2e-05, - "loss": 1.6433, - "step": 170 + "loss": 1.6698, + "step": 154 }, { - "epoch": 0.33, + "epoch": 0.25, "learning_rate": 2e-05, - "loss": 1.7877, - "step": 180 + "loss": 1.7127, + "step": 156 }, { - "epoch": 0.34, + "epoch": 0.26, "learning_rate": 2e-05, - "loss": 1.9041, - "step": 190 + "loss": 1.6559, + "step": 158 }, { - "epoch": 0.36, + "epoch": 0.26, "learning_rate": 2e-05, - "loss": 2.1649, - "step": 200 + "loss": 1.7203, + "step": 160 + }, + { + "epoch": 0.26, + "learning_rate": 2e-05, + "loss": 1.7946, + "step": 162 + }, + { + "epoch": 0.27, + "learning_rate": 2e-05, + "loss": 1.71, + "step": 164 + }, + { + "epoch": 0.27, + "learning_rate": 2e-05, + "loss": 1.8896, + "step": 166 + }, + { + "epoch": 0.27, + "learning_rate": 2e-05, + "loss": 1.9708, + "step": 168 + }, + { + "epoch": 0.28, + "learning_rate": 2e-05, + "loss": 1.8426, + "step": 170 + }, + { + "epoch": 0.28, + "learning_rate": 2e-05, + "loss": 1.7623, + "step": 172 + }, + { + "epoch": 0.28, + "learning_rate": 2e-05, + "loss": 1.8608, + "step": 174 + }, + { + "epoch": 0.29, + "learning_rate": 2e-05, + "loss": 1.8288, + "step": 176 + }, + { + "epoch": 0.29, + "learning_rate": 2e-05, + "loss": 1.8339, + "step": 178 + }, + { + "epoch": 0.29, + "learning_rate": 2e-05, + "loss": 1.955, + "step": 180 + }, + { + "epoch": 0.3, + "learning_rate": 2e-05, + "loss": 2.0129, + "step": 182 + }, + { + "epoch": 0.3, + "learning_rate": 2e-05, + "loss": 2.0187, + "step": 184 + }, + { + "epoch": 0.3, + "learning_rate": 2e-05, + "loss": 2.0102, + "step": 186 + }, + { + "epoch": 0.31, + "learning_rate": 2e-05, + "loss": 2.0275, + "step": 188 + }, + { + "epoch": 0.31, + "learning_rate": 2e-05, + "loss": 2.2052, + "step": 190 + }, + { + "epoch": 0.31, + "learning_rate": 2e-05, + "loss": 2.2022, + "step": 192 + }, + { + "epoch": 0.32, + "learning_rate": 2e-05, + "loss": 2.207, + "step": 194 + }, + { + "epoch": 0.32, + "learning_rate": 2e-05, + "loss": 2.2069, + "step": 196 + }, + { + "epoch": 0.32, + "learning_rate": 2e-05, + "loss": 2.4695, + "step": 198 + }, + { + "epoch": 0.33, + "learning_rate": 2e-05, + "loss": 2.3821, + "step": 200 + }, + { + "epoch": 0.33, + "learning_rate": 2e-05, + "loss": 1.5257, + "step": 202 + }, + { + "epoch": 0.33, + "learning_rate": 2e-05, + "loss": 1.6402, + "step": 204 + }, + { + "epoch": 0.33, + "learning_rate": 2e-05, + "loss": 1.6208, + "step": 206 + }, + { + "epoch": 0.34, + "learning_rate": 2e-05, + "loss": 1.7039, + "step": 208 + }, + { + "epoch": 0.34, + "learning_rate": 2e-05, + "loss": 1.6589, + "step": 210 + }, + { + "epoch": 0.34, + "learning_rate": 2e-05, + "loss": 1.6637, + "step": 212 + }, + { + "epoch": 0.35, + "learning_rate": 2e-05, + "loss": 1.6445, + "step": 214 + }, + { + "epoch": 0.35, + "learning_rate": 2e-05, + "loss": 1.6848, + "step": 216 + }, + { + "epoch": 0.35, + "learning_rate": 2e-05, + "loss": 1.5876, + "step": 218 + }, + { + "epoch": 0.36, + "learning_rate": 2e-05, + "loss": 1.7515, + "step": 220 + }, + { + "epoch": 0.36, + "learning_rate": 2e-05, + "loss": 1.7731, + "step": 222 + }, + { + "epoch": 0.36, + "learning_rate": 2e-05, + "loss": 1.8234, + "step": 224 + }, + { + "epoch": 0.37, + "learning_rate": 2e-05, + "loss": 1.7628, + "step": 226 + }, + { + "epoch": 0.37, + "learning_rate": 2e-05, + "loss": 1.8525, + "step": 228 + }, + { + "epoch": 0.37, + "learning_rate": 2e-05, + "loss": 1.8992, + "step": 230 + }, + { + "epoch": 0.38, + "learning_rate": 2e-05, + "loss": 1.8982, + "step": 232 + }, + { + "epoch": 0.38, + "learning_rate": 2e-05, + "loss": 2.0687, + "step": 234 + }, + { + "epoch": 0.38, + "learning_rate": 2e-05, + "loss": 2.0344, + "step": 236 + }, + { + "epoch": 0.39, + "learning_rate": 2e-05, + "loss": 2.2035, + "step": 238 + }, + { + "epoch": 0.39, + "learning_rate": 2e-05, + "loss": 2.1326, + "step": 240 + }, + { + "epoch": 0.39, + "learning_rate": 2e-05, + "loss": 2.3343, + "step": 242 + }, + { + "epoch": 0.4, + "learning_rate": 2e-05, + "loss": 2.331, + "step": 244 + }, + { + "epoch": 0.4, + "learning_rate": 2e-05, + "loss": 2.1588, + "step": 246 + }, + { + "epoch": 0.4, + "learning_rate": 2e-05, + "loss": 2.4497, + "step": 248 + }, + { + "epoch": 0.41, + "learning_rate": 2e-05, + "loss": 2.3209, + "step": 250 + }, + { + "epoch": 0.41, + "learning_rate": 2e-05, + "loss": 1.4431, + "step": 252 + }, + { + "epoch": 0.41, + "learning_rate": 2e-05, + "loss": 1.5372, + "step": 254 + }, + { + "epoch": 0.42, + "learning_rate": 2e-05, + "loss": 1.5978, + "step": 256 + }, + { + "epoch": 0.42, + "learning_rate": 2e-05, + "loss": 1.7773, + "step": 258 + }, + { + "epoch": 0.42, + "learning_rate": 2e-05, + "loss": 1.8033, + "step": 260 + }, + { + "epoch": 0.43, + "learning_rate": 2e-05, + "loss": 1.7154, + "step": 262 + }, + { + "epoch": 0.43, + "learning_rate": 2e-05, + "loss": 1.7038, + "step": 264 + }, + { + "epoch": 0.43, + "learning_rate": 2e-05, + "loss": 1.9145, + "step": 266 + }, + { + "epoch": 0.44, + "learning_rate": 2e-05, + "loss": 1.7382, + "step": 268 + }, + { + "epoch": 0.44, + "learning_rate": 2e-05, + "loss": 1.712, + "step": 270 + }, + { + "epoch": 0.44, + "learning_rate": 2e-05, + "loss": 1.6205, + "step": 272 + }, + { + "epoch": 0.45, + "learning_rate": 2e-05, + "loss": 1.8052, + "step": 274 + }, + { + "epoch": 0.45, + "learning_rate": 2e-05, + "loss": 1.7942, + "step": 276 + }, + { + "epoch": 0.45, + "learning_rate": 2e-05, + "loss": 1.941, + "step": 278 + }, + { + "epoch": 0.46, + "learning_rate": 2e-05, + "loss": 1.8929, + "step": 280 + }, + { + "epoch": 0.46, + "learning_rate": 2e-05, + "loss": 1.9285, + "step": 282 + }, + { + "epoch": 0.46, + "learning_rate": 2e-05, + "loss": 2.0238, + "step": 284 + }, + { + "epoch": 0.46, + "learning_rate": 2e-05, + "loss": 2.0938, + "step": 286 + }, + { + "epoch": 0.47, + "learning_rate": 2e-05, + "loss": 2.0362, + "step": 288 + }, + { + "epoch": 0.47, + "learning_rate": 2e-05, + "loss": 2.2545, + "step": 290 + }, + { + "epoch": 0.47, + "learning_rate": 2e-05, + "loss": 2.1432, + "step": 292 + }, + { + "epoch": 0.48, + "learning_rate": 2e-05, + "loss": 2.3454, + "step": 294 + }, + { + "epoch": 0.48, + "learning_rate": 2e-05, + "loss": 2.359, + "step": 296 + }, + { + "epoch": 0.48, + "learning_rate": 2e-05, + "loss": 2.4653, + "step": 298 + }, + { + "epoch": 0.49, + "learning_rate": 2e-05, + "loss": 2.2918, + "step": 300 + }, + { + "epoch": 0.49, + "learning_rate": 2e-05, + "loss": 1.748, + "step": 302 + }, + { + "epoch": 0.49, + "learning_rate": 2e-05, + "loss": 1.6338, + "step": 304 + }, + { + "epoch": 0.5, + "learning_rate": 2e-05, + "loss": 1.5935, + "step": 306 + }, + { + "epoch": 0.5, + "learning_rate": 2e-05, + "loss": 1.6318, + "step": 308 + }, + { + "epoch": 0.5, + "learning_rate": 2e-05, + "loss": 1.7291, + "step": 310 + }, + { + "epoch": 0.51, + "learning_rate": 2e-05, + "loss": 1.8349, + "step": 312 + }, + { + "epoch": 0.51, + "learning_rate": 2e-05, + "loss": 1.6695, + "step": 314 + }, + { + "epoch": 0.51, + "learning_rate": 2e-05, + "loss": 1.8873, + "step": 316 + }, + { + "epoch": 0.52, + "learning_rate": 2e-05, + "loss": 1.7472, + "step": 318 + }, + { + "epoch": 0.52, + "learning_rate": 2e-05, + "loss": 1.8549, + "step": 320 + }, + { + "epoch": 0.52, + "learning_rate": 2e-05, + "loss": 1.7141, + "step": 322 + }, + { + "epoch": 0.53, + "learning_rate": 2e-05, + "loss": 1.7487, + "step": 324 + }, + { + "epoch": 0.53, + "learning_rate": 2e-05, + "loss": 1.8681, + "step": 326 + }, + { + "epoch": 0.53, + "learning_rate": 2e-05, + "loss": 1.9445, + "step": 328 + }, + { + "epoch": 0.54, + "learning_rate": 2e-05, + "loss": 1.8244, + "step": 330 + }, + { + "epoch": 0.54, + "learning_rate": 2e-05, + "loss": 1.8444, + "step": 332 + }, + { + "epoch": 0.54, + "learning_rate": 2e-05, + "loss": 1.9026, + "step": 334 + }, + { + "epoch": 0.55, + "learning_rate": 2e-05, + "loss": 2.0311, + "step": 336 + }, + { + "epoch": 0.55, + "learning_rate": 2e-05, + "loss": 1.9602, + "step": 338 + }, + { + "epoch": 0.55, + "learning_rate": 2e-05, + "loss": 2.1369, + "step": 340 + }, + { + "epoch": 0.56, + "learning_rate": 2e-05, + "loss": 1.9471, + "step": 342 + }, + { + "epoch": 0.56, + "learning_rate": 2e-05, + "loss": 2.3028, + "step": 344 + }, + { + "epoch": 0.56, + "learning_rate": 2e-05, + "loss": 2.1903, + "step": 346 + }, + { + "epoch": 0.57, + "learning_rate": 2e-05, + "loss": 2.4402, + "step": 348 + }, + { + "epoch": 0.57, + "learning_rate": 2e-05, + "loss": 2.5361, + "step": 350 + }, + { + "epoch": 0.57, + "learning_rate": 2e-05, + "loss": 1.5716, + "step": 352 + }, + { + "epoch": 0.58, + "learning_rate": 2e-05, + "loss": 1.5262, + "step": 354 + }, + { + "epoch": 0.58, + "learning_rate": 2e-05, + "loss": 1.5327, + "step": 356 + }, + { + "epoch": 0.58, + "learning_rate": 2e-05, + "loss": 1.5762, + "step": 358 + }, + { + "epoch": 0.59, + "learning_rate": 2e-05, + "loss": 1.7237, + "step": 360 + }, + { + "epoch": 0.59, + "learning_rate": 2e-05, + "loss": 1.6061, + "step": 362 + }, + { + "epoch": 0.59, + "learning_rate": 2e-05, + "loss": 1.6987, + "step": 364 + }, + { + "epoch": 0.59, + "learning_rate": 2e-05, + "loss": 1.7565, + "step": 366 + }, + { + "epoch": 0.6, + "learning_rate": 2e-05, + "loss": 1.8116, + "step": 368 + }, + { + "epoch": 0.6, + "learning_rate": 2e-05, + "loss": 1.9457, + "step": 370 + }, + { + "epoch": 0.6, + "learning_rate": 2e-05, + "loss": 1.6663, + "step": 372 + }, + { + "epoch": 0.61, + "learning_rate": 2e-05, + "loss": 1.8635, + "step": 374 + }, + { + "epoch": 0.61, + "learning_rate": 2e-05, + "loss": 1.7305, + "step": 376 + }, + { + "epoch": 0.61, + "learning_rate": 2e-05, + "loss": 1.8786, + "step": 378 + }, + { + "epoch": 0.62, + "learning_rate": 2e-05, + "loss": 1.87, + "step": 380 + }, + { + "epoch": 0.62, + "learning_rate": 2e-05, + "loss": 1.8725, + "step": 382 + }, + { + "epoch": 0.62, + "learning_rate": 2e-05, + "loss": 1.9773, + "step": 384 + }, + { + "epoch": 0.63, + "learning_rate": 2e-05, + "loss": 2.0312, + "step": 386 + }, + { + "epoch": 0.63, + "learning_rate": 2e-05, + "loss": 2.0488, + "step": 388 + }, + { + "epoch": 0.63, + "learning_rate": 2e-05, + "loss": 2.3269, + "step": 390 + }, + { + "epoch": 0.64, + "learning_rate": 2e-05, + "loss": 2.1795, + "step": 392 + }, + { + "epoch": 0.64, + "learning_rate": 2e-05, + "loss": 2.3712, + "step": 394 + }, + { + "epoch": 0.64, + "learning_rate": 2e-05, + "loss": 2.4111, + "step": 396 + }, + { + "epoch": 0.65, + "learning_rate": 2e-05, + "loss": 2.1541, + "step": 398 + }, + { + "epoch": 0.65, + "learning_rate": 2e-05, + "loss": 2.549, + "step": 400 + }, + { + "epoch": 0.65, + "learning_rate": 2e-05, + "loss": 1.6017, + "step": 402 + }, + { + "epoch": 0.66, + "learning_rate": 2e-05, + "loss": 1.5165, + "step": 404 + }, + { + "epoch": 0.66, + "learning_rate": 2e-05, + "loss": 1.682, + "step": 406 + }, + { + "epoch": 0.66, + "learning_rate": 2e-05, + "loss": 1.7208, + "step": 408 + }, + { + "epoch": 0.67, + "learning_rate": 2e-05, + "loss": 1.6538, + "step": 410 + }, + { + "epoch": 0.67, + "learning_rate": 2e-05, + "loss": 1.7341, + "step": 412 + }, + { + "epoch": 0.67, + "learning_rate": 2e-05, + "loss": 1.6098, + "step": 414 + }, + { + "epoch": 0.68, + "learning_rate": 2e-05, + "loss": 1.518, + "step": 416 + }, + { + "epoch": 0.68, + "learning_rate": 2e-05, + "loss": 1.7724, + "step": 418 + }, + { + "epoch": 0.68, + "learning_rate": 2e-05, + "loss": 1.9174, + "step": 420 + }, + { + "epoch": 0.69, + "learning_rate": 2e-05, + "loss": 1.8182, + "step": 422 + }, + { + "epoch": 0.69, + "learning_rate": 2e-05, + "loss": 1.7367, + "step": 424 + }, + { + "epoch": 0.69, + "learning_rate": 2e-05, + "loss": 1.643, + "step": 426 + }, + { + "epoch": 0.7, + "learning_rate": 2e-05, + "loss": 1.8868, + "step": 428 + }, + { + "epoch": 0.7, + "learning_rate": 2e-05, + "loss": 1.9231, + "step": 430 + }, + { + "epoch": 0.7, + "learning_rate": 2e-05, + "loss": 1.9085, + "step": 432 + }, + { + "epoch": 0.71, + "learning_rate": 2e-05, + "loss": 1.8751, + "step": 434 + }, + { + "epoch": 0.71, + "learning_rate": 2e-05, + "loss": 2.0751, + "step": 436 + }, + { + "epoch": 0.71, + "learning_rate": 2e-05, + "loss": 2.1407, + "step": 438 + }, + { + "epoch": 0.72, + "learning_rate": 2e-05, + "loss": 1.9238, + "step": 440 + }, + { + "epoch": 0.72, + "learning_rate": 2e-05, + "loss": 2.3091, + "step": 442 + }, + { + "epoch": 0.72, + "learning_rate": 2e-05, + "loss": 2.2479, + "step": 444 + }, + { + "epoch": 0.72, + "learning_rate": 2e-05, + "loss": 2.3011, + "step": 446 + }, + { + "epoch": 0.73, + "learning_rate": 2e-05, + "loss": 2.185, + "step": 448 + }, + { + "epoch": 0.73, + "learning_rate": 2e-05, + "loss": 2.2973, + "step": 450 + }, + { + "epoch": 0.73, + "learning_rate": 2e-05, + "loss": 1.7113, + "step": 452 + }, + { + "epoch": 0.74, + "learning_rate": 2e-05, + "loss": 1.5509, + "step": 454 + }, + { + "epoch": 0.74, + "learning_rate": 2e-05, + "loss": 1.6546, + "step": 456 + }, + { + "epoch": 0.74, + "learning_rate": 2e-05, + "loss": 1.6055, + "step": 458 + }, + { + "epoch": 0.75, + "learning_rate": 2e-05, + "loss": 1.7226, + "step": 460 + }, + { + "epoch": 0.75, + "learning_rate": 2e-05, + "loss": 1.7313, + "step": 462 + }, + { + "epoch": 0.75, + "learning_rate": 2e-05, + "loss": 1.7648, + "step": 464 + }, + { + "epoch": 0.76, + "learning_rate": 2e-05, + "loss": 1.6838, + "step": 466 + }, + { + "epoch": 0.76, + "learning_rate": 2e-05, + "loss": 1.7465, + "step": 468 + }, + { + "epoch": 0.76, + "learning_rate": 2e-05, + "loss": 1.7302, + "step": 470 + }, + { + "epoch": 0.77, + "learning_rate": 2e-05, + "loss": 1.7998, + "step": 472 + }, + { + "epoch": 0.77, + "learning_rate": 2e-05, + "loss": 1.8177, + "step": 474 + }, + { + "epoch": 0.77, + "learning_rate": 2e-05, + "loss": 1.6845, + "step": 476 + }, + { + "epoch": 0.78, + "learning_rate": 2e-05, + "loss": 1.7208, + "step": 478 + }, + { + "epoch": 0.78, + "learning_rate": 2e-05, + "loss": 1.9087, + "step": 480 + }, + { + "epoch": 0.78, + "learning_rate": 2e-05, + "loss": 1.7116, + "step": 482 + }, + { + "epoch": 0.79, + "learning_rate": 2e-05, + "loss": 2.0417, + "step": 484 + }, + { + "epoch": 0.79, + "learning_rate": 2e-05, + "loss": 1.9781, + "step": 486 + }, + { + "epoch": 0.79, + "learning_rate": 2e-05, + "loss": 2.1279, + "step": 488 + }, + { + "epoch": 0.8, + "learning_rate": 2e-05, + "loss": 1.9915, + "step": 490 + }, + { + "epoch": 0.8, + "learning_rate": 2e-05, + "loss": 2.1453, + "step": 492 + }, + { + "epoch": 0.8, + "learning_rate": 2e-05, + "loss": 2.1639, + "step": 494 + }, + { + "epoch": 0.81, + "learning_rate": 2e-05, + "loss": 2.2218, + "step": 496 + }, + { + "epoch": 0.81, + "learning_rate": 2e-05, + "loss": 2.2928, + "step": 498 + }, + { + "epoch": 0.81, + "learning_rate": 2e-05, + "loss": 2.1805, + "step": 500 + }, + { + "epoch": 0.82, + "learning_rate": 2e-05, + "loss": 1.6048, + "step": 502 + }, + { + "epoch": 0.82, + "learning_rate": 2e-05, + "loss": 1.6373, + "step": 504 + }, + { + "epoch": 0.82, + "learning_rate": 2e-05, + "loss": 1.6171, + "step": 506 + }, + { + "epoch": 0.83, + "learning_rate": 2e-05, + "loss": 1.6666, + "step": 508 + }, + { + "epoch": 0.83, + "learning_rate": 2e-05, + "loss": 1.5267, + "step": 510 + }, + { + "epoch": 0.83, + "learning_rate": 2e-05, + "loss": 1.8299, + "step": 512 + }, + { + "epoch": 0.84, + "learning_rate": 2e-05, + "loss": 1.6865, + "step": 514 + }, + { + "epoch": 0.84, + "learning_rate": 2e-05, + "loss": 1.899, + "step": 516 + }, + { + "epoch": 0.84, + "learning_rate": 2e-05, + "loss": 1.7581, + "step": 518 + }, + { + "epoch": 0.85, + "learning_rate": 2e-05, + "loss": 1.7595, + "step": 520 + }, + { + "epoch": 0.85, + "learning_rate": 2e-05, + "loss": 1.7733, + "step": 522 + }, + { + "epoch": 0.85, + "learning_rate": 2e-05, + "loss": 1.8707, + "step": 524 + }, + { + "epoch": 0.85, + "learning_rate": 2e-05, + "loss": 1.773, + "step": 526 + }, + { + "epoch": 0.86, + "learning_rate": 2e-05, + "loss": 1.8177, + "step": 528 + }, + { + "epoch": 0.86, + "learning_rate": 2e-05, + "loss": 1.8523, + "step": 530 + }, + { + "epoch": 0.86, + "learning_rate": 2e-05, + "loss": 1.9317, + "step": 532 + }, + { + "epoch": 0.87, + "learning_rate": 2e-05, + "loss": 2.0409, + "step": 534 + }, + { + "epoch": 0.87, + "learning_rate": 2e-05, + "loss": 2.059, + "step": 536 + }, + { + "epoch": 0.87, + "learning_rate": 2e-05, + "loss": 2.1391, + "step": 538 + }, + { + "epoch": 0.88, + "learning_rate": 2e-05, + "loss": 2.2914, + "step": 540 + }, + { + "epoch": 0.88, + "learning_rate": 2e-05, + "loss": 2.1879, + "step": 542 + }, + { + "epoch": 0.88, + "learning_rate": 2e-05, + "loss": 2.3531, + "step": 544 + }, + { + "epoch": 0.89, + "learning_rate": 2e-05, + "loss": 2.2852, + "step": 546 + }, + { + "epoch": 0.89, + "learning_rate": 2e-05, + "loss": 2.3946, + "step": 548 + }, + { + "epoch": 0.89, + "learning_rate": 2e-05, + "loss": 2.1933, + "step": 550 + }, + { + "epoch": 0.9, + "learning_rate": 2e-05, + "loss": 1.587, + "step": 552 + }, + { + "epoch": 0.9, + "learning_rate": 2e-05, + "loss": 1.5762, + "step": 554 + }, + { + "epoch": 0.9, + "learning_rate": 2e-05, + "loss": 1.4593, + "step": 556 + }, + { + "epoch": 0.91, + "learning_rate": 2e-05, + "loss": 1.6386, + "step": 558 + }, + { + "epoch": 0.91, + "learning_rate": 2e-05, + "loss": 1.759, + "step": 560 + }, + { + "epoch": 0.91, + "learning_rate": 2e-05, + "loss": 1.6945, + "step": 562 + }, + { + "epoch": 0.92, + "learning_rate": 2e-05, + "loss": 1.6703, + "step": 564 + }, + { + "epoch": 0.92, + "learning_rate": 2e-05, + "loss": 1.7655, + "step": 566 + }, + { + "epoch": 0.92, + "learning_rate": 2e-05, + "loss": 1.7459, + "step": 568 + }, + { + "epoch": 0.93, + "learning_rate": 2e-05, + "loss": 1.8065, + "step": 570 + }, + { + "epoch": 0.93, + "learning_rate": 2e-05, + "loss": 1.7765, + "step": 572 + }, + { + "epoch": 0.93, + "learning_rate": 2e-05, + "loss": 1.7079, + "step": 574 + }, + { + "epoch": 0.94, + "learning_rate": 2e-05, + "loss": 1.8376, + "step": 576 + }, + { + "epoch": 0.94, + "learning_rate": 2e-05, + "loss": 2.0, + "step": 578 + }, + { + "epoch": 0.94, + "learning_rate": 2e-05, + "loss": 1.876, + "step": 580 + }, + { + "epoch": 0.95, + "learning_rate": 2e-05, + "loss": 1.8461, + "step": 582 + }, + { + "epoch": 0.95, + "learning_rate": 2e-05, + "loss": 1.9589, + "step": 584 + }, + { + "epoch": 0.95, + "learning_rate": 2e-05, + "loss": 1.9262, + "step": 586 + }, + { + "epoch": 0.96, + "learning_rate": 2e-05, + "loss": 2.0984, + "step": 588 + }, + { + "epoch": 0.96, + "learning_rate": 2e-05, + "loss": 2.1432, + "step": 590 + }, + { + "epoch": 0.96, + "learning_rate": 2e-05, + "loss": 2.2204, + "step": 592 + }, + { + "epoch": 0.97, + "learning_rate": 2e-05, + "loss": 2.3121, + "step": 594 + }, + { + "epoch": 0.97, + "learning_rate": 2e-05, + "loss": 2.3432, + "step": 596 + }, + { + "epoch": 0.97, + "learning_rate": 2e-05, + "loss": 2.2153, + "step": 598 + }, + { + "epoch": 0.98, + "learning_rate": 2e-05, + "loss": 2.4357, + "step": 600 + }, + { + "epoch": 0.98, + "learning_rate": 2e-05, + "loss": 1.6654, + "step": 602 + }, + { + "epoch": 0.98, + "learning_rate": 2e-05, + "loss": 1.5651, + "step": 604 + }, + { + "epoch": 0.98, + "learning_rate": 2e-05, + "loss": 1.8015, + "step": 606 + }, + { + "epoch": 0.99, + "learning_rate": 2e-05, + "loss": 1.8798, + "step": 608 + }, + { + "epoch": 0.99, + "learning_rate": 2e-05, + "loss": 1.8118, + "step": 610 + }, + { + "epoch": 0.99, + "learning_rate": 2e-05, + "loss": 2.101, + "step": 612 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 2.2339, + "step": 614 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 2.0958, + "step": 616 + }, + { + "epoch": 1.0, + "learning_rate": 2e-05, + "loss": 1.5011, + "step": 618 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 1.6487, + "step": 620 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 1.663, + "step": 622 + }, + { + "epoch": 1.01, + "learning_rate": 2e-05, + "loss": 1.7526, + "step": 624 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 1.6576, + "step": 626 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 1.8096, + "step": 628 + }, + { + "epoch": 1.02, + "learning_rate": 2e-05, + "loss": 1.6533, + "step": 630 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 1.8164, + "step": 632 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 1.7376, + "step": 634 + }, + { + "epoch": 1.03, + "learning_rate": 2e-05, + "loss": 1.7395, + "step": 636 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 1.7583, + "step": 638 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 1.8188, + "step": 640 + }, + { + "epoch": 1.04, + "learning_rate": 2e-05, + "loss": 1.8144, + "step": 642 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 1.8275, + "step": 644 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 1.8898, + "step": 646 + }, + { + "epoch": 1.05, + "learning_rate": 2e-05, + "loss": 1.831, + "step": 648 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 2.017, + "step": 650 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 2.0152, + "step": 652 + }, + { + "epoch": 1.06, + "learning_rate": 2e-05, + "loss": 2.0229, + "step": 654 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 2.0969, + "step": 656 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 2.2473, + "step": 658 + }, + { + "epoch": 1.07, + "learning_rate": 2e-05, + "loss": 2.2124, + "step": 660 + }, + { + "epoch": 1.08, + "learning_rate": 2e-05, + "loss": 2.4132, + "step": 662 + }, + { + "epoch": 1.08, + "learning_rate": 2e-05, + "loss": 2.2386, + "step": 664 + }, + { + "epoch": 1.08, + "learning_rate": 2e-05, + "loss": 1.9457, + "step": 666 + }, + { + "epoch": 1.09, + "learning_rate": 2e-05, + "loss": 1.5253, + "step": 668 + }, + { + "epoch": 1.09, + "learning_rate": 2e-05, + "loss": 1.6223, + "step": 670 + }, + { + "epoch": 1.09, + "learning_rate": 2e-05, + "loss": 1.5866, + "step": 672 + }, + { + "epoch": 1.1, + "learning_rate": 2e-05, + "loss": 1.639, + "step": 674 + }, + { + "epoch": 1.1, + "learning_rate": 2e-05, + "loss": 1.7485, + "step": 676 + }, + { + "epoch": 1.1, + "learning_rate": 2e-05, + "loss": 1.7922, + "step": 678 + }, + { + "epoch": 1.11, + "learning_rate": 2e-05, + "loss": 1.6644, + "step": 680 + }, + { + "epoch": 1.11, + "learning_rate": 2e-05, + "loss": 1.7133, + "step": 682 + }, + { + "epoch": 1.11, + "learning_rate": 2e-05, + "loss": 1.7215, + "step": 684 + }, + { + "epoch": 1.11, + "learning_rate": 2e-05, + "loss": 1.6665, + "step": 686 + }, + { + "epoch": 1.12, + "learning_rate": 2e-05, + "loss": 1.809, + "step": 688 + }, + { + "epoch": 1.12, + "learning_rate": 2e-05, + "loss": 1.8063, + "step": 690 + }, + { + "epoch": 1.12, + "learning_rate": 2e-05, + "loss": 1.8331, + "step": 692 + }, + { + "epoch": 1.13, + "learning_rate": 2e-05, + "loss": 1.6631, + "step": 694 + }, + { + "epoch": 1.13, + "learning_rate": 2e-05, + "loss": 1.7219, + "step": 696 + }, + { + "epoch": 1.13, + "learning_rate": 2e-05, + "loss": 1.793, + "step": 698 + }, + { + "epoch": 1.14, + "learning_rate": 2e-05, + "loss": 1.9089, + "step": 700 + }, + { + "epoch": 1.14, + "learning_rate": 2e-05, + "loss": 2.0203, + "step": 702 + }, + { + "epoch": 1.14, + "learning_rate": 2e-05, + "loss": 2.0187, + "step": 704 + }, + { + "epoch": 1.15, + "learning_rate": 2e-05, + "loss": 2.2132, + "step": 706 + }, + { + "epoch": 1.15, + "learning_rate": 2e-05, + "loss": 2.3242, + "step": 708 + }, + { + "epoch": 1.15, + "learning_rate": 2e-05, + "loss": 2.3611, + "step": 710 + }, + { + "epoch": 1.16, + "learning_rate": 2e-05, + "loss": 2.1036, + "step": 712 + }, + { + "epoch": 1.16, + "learning_rate": 2e-05, + "loss": 2.1611, + "step": 714 + }, + { + "epoch": 1.16, + "learning_rate": 2e-05, + "loss": 2.084, + "step": 716 + }, + { + "epoch": 1.17, + "learning_rate": 2e-05, + "loss": 1.5422, + "step": 718 + }, + { + "epoch": 1.17, + "learning_rate": 2e-05, + "loss": 1.5357, + "step": 720 + }, + { + "epoch": 1.17, + "learning_rate": 2e-05, + "loss": 1.5656, + "step": 722 + }, + { + "epoch": 1.18, + "learning_rate": 2e-05, + "loss": 1.665, + "step": 724 + }, + { + "epoch": 1.18, + "learning_rate": 2e-05, + "loss": 1.6541, + "step": 726 + }, + { + "epoch": 1.18, + "learning_rate": 2e-05, + "loss": 1.7007, + "step": 728 + }, + { + "epoch": 1.19, + "learning_rate": 2e-05, + "loss": 1.6715, + "step": 730 + }, + { + "epoch": 1.19, + "learning_rate": 2e-05, + "loss": 1.7646, + "step": 732 + }, + { + "epoch": 1.19, + "learning_rate": 2e-05, + "loss": 1.7159, + "step": 734 + }, + { + "epoch": 1.2, + "learning_rate": 2e-05, + "loss": 1.8008, + "step": 736 + }, + { + "epoch": 1.2, + "learning_rate": 2e-05, + "loss": 1.7396, + "step": 738 + }, + { + "epoch": 1.2, + "learning_rate": 2e-05, + "loss": 1.5893, + "step": 740 + }, + { + "epoch": 1.21, + "learning_rate": 2e-05, + "loss": 1.7379, + "step": 742 + }, + { + "epoch": 1.21, + "learning_rate": 2e-05, + "loss": 1.9598, + "step": 744 + }, + { + "epoch": 1.21, + "learning_rate": 2e-05, + "loss": 1.767, + "step": 746 + }, + { + "epoch": 1.22, + "learning_rate": 2e-05, + "loss": 1.6946, + "step": 748 + }, + { + "epoch": 1.22, + "learning_rate": 2e-05, + "loss": 1.9439, + "step": 750 + }, + { + "epoch": 1.22, + "learning_rate": 2e-05, + "loss": 1.809, + "step": 752 + }, + { + "epoch": 1.23, + "learning_rate": 2e-05, + "loss": 1.9658, + "step": 754 + }, + { + "epoch": 1.23, + "learning_rate": 2e-05, + "loss": 2.3269, + "step": 756 + }, + { + "epoch": 1.23, + "learning_rate": 2e-05, + "loss": 2.1706, + "step": 758 + }, + { + "epoch": 1.24, + "learning_rate": 2e-05, + "loss": 2.1369, + "step": 760 + }, + { + "epoch": 1.24, + "learning_rate": 2e-05, + "loss": 2.2149, + "step": 762 + }, + { + "epoch": 1.24, + "learning_rate": 2e-05, + "loss": 2.2046, + "step": 764 + }, + { + "epoch": 1.24, + "learning_rate": 2e-05, + "loss": 2.0422, + "step": 766 + }, + { + "epoch": 1.25, + "learning_rate": 2e-05, + "loss": 1.5257, + "step": 768 + }, + { + "epoch": 1.25, + "learning_rate": 2e-05, + "loss": 1.5342, + "step": 770 + }, + { + "epoch": 1.25, + "learning_rate": 2e-05, + "loss": 1.6119, + "step": 772 + }, + { + "epoch": 1.26, + "learning_rate": 2e-05, + "loss": 1.4974, + "step": 774 + }, + { + "epoch": 1.26, + "learning_rate": 2e-05, + "loss": 1.6021, + "step": 776 + }, + { + "epoch": 1.26, + "learning_rate": 2e-05, + "loss": 1.7111, + "step": 778 + }, + { + "epoch": 1.27, + "learning_rate": 2e-05, + "loss": 1.7096, + "step": 780 + }, + { + "epoch": 1.27, + "learning_rate": 2e-05, + "loss": 1.7536, + "step": 782 + }, + { + "epoch": 1.27, + "learning_rate": 2e-05, + "loss": 1.7301, + "step": 784 + }, + { + "epoch": 1.28, + "learning_rate": 2e-05, + "loss": 1.7513, + "step": 786 + }, + { + "epoch": 1.28, + "learning_rate": 2e-05, + "loss": 1.6764, + "step": 788 + }, + { + "epoch": 1.28, + "learning_rate": 2e-05, + "loss": 1.6348, + "step": 790 + }, + { + "epoch": 1.29, + "learning_rate": 2e-05, + "loss": 1.7046, + "step": 792 + }, + { + "epoch": 1.29, + "learning_rate": 2e-05, + "loss": 1.8024, + "step": 794 + }, + { + "epoch": 1.29, + "learning_rate": 2e-05, + "loss": 1.8867, + "step": 796 + }, + { + "epoch": 1.3, + "learning_rate": 2e-05, + "loss": 1.8189, + "step": 798 + }, + { + "epoch": 1.3, + "learning_rate": 2e-05, + "loss": 2.0048, + "step": 800 + }, + { + "epoch": 1.3, + "learning_rate": 2e-05, + "loss": 1.7588, + "step": 802 + }, + { + "epoch": 1.31, + "learning_rate": 2e-05, + "loss": 1.9648, + "step": 804 + }, + { + "epoch": 1.31, + "learning_rate": 2e-05, + "loss": 2.1632, + "step": 806 + }, + { + "epoch": 1.31, + "learning_rate": 2e-05, + "loss": 2.1084, + "step": 808 + }, + { + "epoch": 1.32, + "learning_rate": 2e-05, + "loss": 2.1, + "step": 810 + }, + { + "epoch": 1.32, + "learning_rate": 2e-05, + "loss": 2.183, + "step": 812 + }, + { + "epoch": 1.32, + "learning_rate": 2e-05, + "loss": 2.2913, + "step": 814 + }, + { + "epoch": 1.33, + "learning_rate": 2e-05, + "loss": 1.8887, + "step": 816 + }, + { + "epoch": 1.33, + "learning_rate": 2e-05, + "loss": 1.578, + "step": 818 + }, + { + "epoch": 1.33, + "learning_rate": 2e-05, + "loss": 1.4916, + "step": 820 + }, + { + "epoch": 1.34, + "learning_rate": 2e-05, + "loss": 1.5374, + "step": 822 + }, + { + "epoch": 1.34, + "learning_rate": 2e-05, + "loss": 1.625, + "step": 824 + }, + { + "epoch": 1.34, + "learning_rate": 2e-05, + "loss": 1.7883, + "step": 826 + }, + { + "epoch": 1.35, + "learning_rate": 2e-05, + "loss": 1.7382, + "step": 828 + }, + { + "epoch": 1.35, + "learning_rate": 2e-05, + "loss": 1.6878, + "step": 830 + }, + { + "epoch": 1.35, + "learning_rate": 2e-05, + "loss": 1.5995, + "step": 832 + }, + { + "epoch": 1.36, + "learning_rate": 2e-05, + "loss": 1.8318, + "step": 834 + }, + { + "epoch": 1.36, + "learning_rate": 2e-05, + "loss": 1.7415, + "step": 836 + }, + { + "epoch": 1.36, + "learning_rate": 2e-05, + "loss": 1.8562, + "step": 838 + }, + { + "epoch": 1.37, + "learning_rate": 2e-05, + "loss": 1.7258, + "step": 840 + }, + { + "epoch": 1.37, + "learning_rate": 2e-05, + "loss": 1.8745, + "step": 842 + }, + { + "epoch": 1.37, + "learning_rate": 2e-05, + "loss": 1.6258, + "step": 844 + }, + { + "epoch": 1.37, + "learning_rate": 2e-05, + "loss": 1.8662, + "step": 846 + }, + { + "epoch": 1.38, + "learning_rate": 2e-05, + "loss": 1.8824, + "step": 848 + }, + { + "epoch": 1.38, + "learning_rate": 2e-05, + "loss": 1.8848, + "step": 850 + }, + { + "epoch": 1.38, + "learning_rate": 2e-05, + "loss": 2.0283, + "step": 852 + }, + { + "epoch": 1.39, + "learning_rate": 2e-05, + "loss": 1.9087, + "step": 854 + }, + { + "epoch": 1.39, + "learning_rate": 2e-05, + "loss": 2.1342, + "step": 856 + }, + { + "epoch": 1.39, + "learning_rate": 2e-05, + "loss": 2.3005, + "step": 858 + }, + { + "epoch": 1.4, + "learning_rate": 2e-05, + "loss": 2.4106, + "step": 860 + }, + { + "epoch": 1.4, + "learning_rate": 2e-05, + "loss": 2.4499, + "step": 862 + }, + { + "epoch": 1.4, + "learning_rate": 2e-05, + "loss": 2.3442, + "step": 864 + }, + { + "epoch": 1.41, + "learning_rate": 2e-05, + "loss": 2.0918, + "step": 866 + }, + { + "epoch": 1.41, + "learning_rate": 2e-05, + "loss": 1.6702, + "step": 868 + }, + { + "epoch": 1.41, + "learning_rate": 2e-05, + "loss": 1.4252, + "step": 870 + }, + { + "epoch": 1.42, + "learning_rate": 2e-05, + "loss": 1.4626, + "step": 872 + }, + { + "epoch": 1.42, + "learning_rate": 2e-05, + "loss": 1.5953, + "step": 874 + }, + { + "epoch": 1.42, + "learning_rate": 2e-05, + "loss": 1.7418, + "step": 876 + }, + { + "epoch": 1.43, + "learning_rate": 2e-05, + "loss": 1.7604, + "step": 878 + }, + { + "epoch": 1.43, + "learning_rate": 2e-05, + "loss": 1.6774, + "step": 880 + }, + { + "epoch": 1.43, + "learning_rate": 2e-05, + "loss": 1.6192, + "step": 882 + }, + { + "epoch": 1.44, + "learning_rate": 2e-05, + "loss": 1.7959, + "step": 884 + }, + { + "epoch": 1.44, + "learning_rate": 2e-05, + "loss": 1.691, + "step": 886 + }, + { + "epoch": 1.44, + "learning_rate": 2e-05, + "loss": 1.6482, + "step": 888 + }, + { + "epoch": 1.45, + "learning_rate": 2e-05, + "loss": 1.7933, + "step": 890 + }, + { + "epoch": 1.45, + "learning_rate": 2e-05, + "loss": 1.7287, + "step": 892 + }, + { + "epoch": 1.45, + "learning_rate": 2e-05, + "loss": 1.7073, + "step": 894 + }, + { + "epoch": 1.46, + "learning_rate": 2e-05, + "loss": 1.9, + "step": 896 + }, + { + "epoch": 1.46, + "learning_rate": 2e-05, + "loss": 1.8793, + "step": 898 + }, + { + "epoch": 1.46, + "learning_rate": 2e-05, + "loss": 1.7948, + "step": 900 + }, + { + "epoch": 1.47, + "learning_rate": 2e-05, + "loss": 1.9856, + "step": 902 + }, + { + "epoch": 1.47, + "learning_rate": 2e-05, + "loss": 2.0757, + "step": 904 + }, + { + "epoch": 1.47, + "learning_rate": 2e-05, + "loss": 2.0352, + "step": 906 + }, + { + "epoch": 1.48, + "learning_rate": 2e-05, + "loss": 2.2688, + "step": 908 + }, + { + "epoch": 1.48, + "learning_rate": 2e-05, + "loss": 2.041, + "step": 910 + }, + { + "epoch": 1.48, + "learning_rate": 2e-05, + "loss": 2.265, + "step": 912 + }, + { + "epoch": 1.49, + "learning_rate": 2e-05, + "loss": 2.1646, + "step": 914 + }, + { + "epoch": 1.49, + "learning_rate": 2e-05, + "loss": 1.8594, + "step": 916 + }, + { + "epoch": 1.49, + "learning_rate": 2e-05, + "loss": 1.5378, + "step": 918 + }, + { + "epoch": 1.5, + "learning_rate": 2e-05, + "loss": 1.6376, + "step": 920 + }, + { + "epoch": 1.5, + "learning_rate": 2e-05, + "loss": 1.522, + "step": 922 + }, + { + "epoch": 1.5, + "learning_rate": 2e-05, + "loss": 1.5827, + "step": 924 + }, + { + "epoch": 1.5, + "learning_rate": 2e-05, + "loss": 1.633, + "step": 926 + }, + { + "epoch": 1.51, + "learning_rate": 2e-05, + "loss": 1.6128, + "step": 928 + }, + { + "epoch": 1.51, + "learning_rate": 2e-05, + "loss": 1.7496, + "step": 930 + }, + { + "epoch": 1.51, + "learning_rate": 2e-05, + "loss": 1.7611, + "step": 932 + }, + { + "epoch": 1.52, + "learning_rate": 2e-05, + "loss": 1.7722, + "step": 934 + }, + { + "epoch": 1.52, + "learning_rate": 2e-05, + "loss": 1.8811, + "step": 936 + }, + { + "epoch": 1.52, + "learning_rate": 2e-05, + "loss": 1.8488, + "step": 938 + }, + { + "epoch": 1.53, + "learning_rate": 2e-05, + "loss": 1.6723, + "step": 940 + }, + { + "epoch": 1.53, + "learning_rate": 2e-05, + "loss": 1.6077, + "step": 942 + }, + { + "epoch": 1.53, + "learning_rate": 2e-05, + "loss": 1.7119, + "step": 944 + }, + { + "epoch": 1.54, + "learning_rate": 2e-05, + "loss": 1.9478, + "step": 946 + }, + { + "epoch": 1.54, + "learning_rate": 2e-05, + "loss": 1.9045, + "step": 948 + }, + { + "epoch": 1.54, + "learning_rate": 2e-05, + "loss": 1.8904, + "step": 950 + }, + { + "epoch": 1.55, + "learning_rate": 2e-05, + "loss": 2.1644, + "step": 952 + }, + { + "epoch": 1.55, + "learning_rate": 2e-05, + "loss": 2.1249, + "step": 954 + }, + { + "epoch": 1.55, + "learning_rate": 2e-05, + "loss": 1.9582, + "step": 956 + }, + { + "epoch": 1.56, + "learning_rate": 2e-05, + "loss": 1.9816, + "step": 958 + }, + { + "epoch": 1.56, + "learning_rate": 2e-05, + "loss": 2.2362, + "step": 960 + }, + { + "epoch": 1.56, + "learning_rate": 2e-05, + "loss": 2.1076, + "step": 962 + }, + { + "epoch": 1.57, + "learning_rate": 2e-05, + "loss": 2.2971, + "step": 964 + }, + { + "epoch": 1.57, + "learning_rate": 2e-05, + "loss": 2.007, + "step": 966 + }, + { + "epoch": 1.57, + "learning_rate": 2e-05, + "loss": 1.6316, + "step": 968 + }, + { + "epoch": 1.58, + "learning_rate": 2e-05, + "loss": 1.5545, + "step": 970 + }, + { + "epoch": 1.58, + "learning_rate": 2e-05, + "loss": 1.6186, + "step": 972 + }, + { + "epoch": 1.58, + "learning_rate": 2e-05, + "loss": 1.6096, + "step": 974 + }, + { + "epoch": 1.59, + "learning_rate": 2e-05, + "loss": 1.7449, + "step": 976 + }, + { + "epoch": 1.59, + "learning_rate": 2e-05, + "loss": 1.7391, + "step": 978 + }, + { + "epoch": 1.59, + "learning_rate": 2e-05, + "loss": 1.7077, + "step": 980 + }, + { + "epoch": 1.6, + "learning_rate": 2e-05, + "loss": 1.6408, + "step": 982 + }, + { + "epoch": 1.6, + "learning_rate": 2e-05, + "loss": 1.7383, + "step": 984 + }, + { + "epoch": 1.6, + "learning_rate": 2e-05, + "loss": 1.7688, + "step": 986 + }, + { + "epoch": 1.61, + "learning_rate": 2e-05, + "loss": 1.7425, + "step": 988 + }, + { + "epoch": 1.61, + "learning_rate": 2e-05, + "loss": 1.6938, + "step": 990 + }, + { + "epoch": 1.61, + "learning_rate": 2e-05, + "loss": 1.7232, + "step": 992 + }, + { + "epoch": 1.62, + "learning_rate": 2e-05, + "loss": 1.7924, + "step": 994 + }, + { + "epoch": 1.62, + "learning_rate": 2e-05, + "loss": 1.8015, + "step": 996 + }, + { + "epoch": 1.62, + "learning_rate": 2e-05, + "loss": 2.0395, + "step": 998 + }, + { + "epoch": 1.63, + "learning_rate": 2e-05, + "loss": 1.946, + "step": 1000 + }, + { + "epoch": 1.63, + "learning_rate": 2e-05, + "loss": 1.9046, + "step": 1002 + }, + { + "epoch": 1.63, + "learning_rate": 2e-05, + "loss": 2.0198, + "step": 1004 + }, + { + "epoch": 1.63, + "learning_rate": 2e-05, + "loss": 2.1411, + "step": 1006 + }, + { + "epoch": 1.64, + "learning_rate": 2e-05, + "loss": 1.8638, + "step": 1008 + }, + { + "epoch": 1.64, + "learning_rate": 2e-05, + "loss": 2.3288, + "step": 1010 + }, + { + "epoch": 1.64, + "learning_rate": 2e-05, + "loss": 2.1452, + "step": 1012 + }, + { + "epoch": 1.65, + "learning_rate": 2e-05, + "loss": 2.4029, + "step": 1014 + }, + { + "epoch": 1.65, + "learning_rate": 2e-05, + "loss": 1.721, + "step": 1016 + }, + { + "epoch": 1.65, + "learning_rate": 2e-05, + "loss": 1.7016, + "step": 1018 + }, + { + "epoch": 1.66, + "learning_rate": 2e-05, + "loss": 1.6516, + "step": 1020 + }, + { + "epoch": 1.66, + "learning_rate": 2e-05, + "loss": 1.5379, + "step": 1022 + }, + { + "epoch": 1.66, + "learning_rate": 2e-05, + "loss": 1.5188, + "step": 1024 + }, + { + "epoch": 1.67, + "learning_rate": 2e-05, + "loss": 1.6925, + "step": 1026 + }, + { + "epoch": 1.67, + "learning_rate": 2e-05, + "loss": 1.6326, + "step": 1028 + }, + { + "epoch": 1.67, + "learning_rate": 2e-05, + "loss": 1.7466, + "step": 1030 + }, + { + "epoch": 1.68, + "learning_rate": 2e-05, + "loss": 1.7166, + "step": 1032 + }, + { + "epoch": 1.68, + "learning_rate": 2e-05, + "loss": 1.8074, + "step": 1034 + }, + { + "epoch": 1.68, + "learning_rate": 2e-05, + "loss": 1.7404, + "step": 1036 + }, + { + "epoch": 1.69, + "learning_rate": 2e-05, + "loss": 1.6528, + "step": 1038 + }, + { + "epoch": 1.69, + "learning_rate": 2e-05, + "loss": 1.8054, + "step": 1040 + }, + { + "epoch": 1.69, + "learning_rate": 2e-05, + "loss": 1.9392, + "step": 1042 + }, + { + "epoch": 1.7, + "learning_rate": 2e-05, + "loss": 1.8748, + "step": 1044 + }, + { + "epoch": 1.7, + "learning_rate": 2e-05, + "loss": 1.9289, + "step": 1046 + }, + { + "epoch": 1.7, + "learning_rate": 2e-05, + "loss": 1.9047, + "step": 1048 + }, + { + "epoch": 1.71, + "learning_rate": 2e-05, + "loss": 2.0054, + "step": 1050 + }, + { + "epoch": 1.71, + "learning_rate": 2e-05, + "loss": 1.9701, + "step": 1052 + }, + { + "epoch": 1.71, + "learning_rate": 2e-05, + "loss": 2.0893, + "step": 1054 + }, + { + "epoch": 1.72, + "learning_rate": 2e-05, + "loss": 2.073, + "step": 1056 + }, + { + "epoch": 1.72, + "learning_rate": 2e-05, + "loss": 2.2494, + "step": 1058 + }, + { + "epoch": 1.72, + "learning_rate": 2e-05, + "loss": 2.3391, + "step": 1060 + }, + { + "epoch": 1.73, + "learning_rate": 2e-05, + "loss": 2.39, + "step": 1062 + }, + { + "epoch": 1.73, + "learning_rate": 2e-05, + "loss": 2.2983, + "step": 1064 + }, + { + "epoch": 1.73, + "learning_rate": 2e-05, + "loss": 1.9709, + "step": 1066 + }, + { + "epoch": 1.74, + "learning_rate": 2e-05, + "loss": 1.7431, + "step": 1068 + }, + { + "epoch": 1.74, + "learning_rate": 2e-05, + "loss": 1.5658, + "step": 1070 + }, + { + "epoch": 1.74, + "learning_rate": 2e-05, + "loss": 1.8086, + "step": 1072 + }, + { + "epoch": 1.75, + "learning_rate": 2e-05, + "loss": 1.6591, + "step": 1074 + }, + { + "epoch": 1.75, + "learning_rate": 2e-05, + "loss": 1.6473, + "step": 1076 + }, + { + "epoch": 1.75, + "learning_rate": 2e-05, + "loss": 1.6038, + "step": 1078 + }, + { + "epoch": 1.76, + "learning_rate": 2e-05, + "loss": 1.7007, + "step": 1080 + }, + { + "epoch": 1.76, + "learning_rate": 2e-05, + "loss": 1.7281, + "step": 1082 + }, + { + "epoch": 1.76, + "learning_rate": 2e-05, + "loss": 1.6404, + "step": 1084 + }, + { + "epoch": 1.76, + "learning_rate": 2e-05, + "loss": 1.7932, + "step": 1086 + }, + { + "epoch": 1.77, + "learning_rate": 2e-05, + "loss": 1.7749, + "step": 1088 + }, + { + "epoch": 1.77, + "learning_rate": 2e-05, + "loss": 1.7351, + "step": 1090 + }, + { + "epoch": 1.77, + "learning_rate": 2e-05, + "loss": 1.6254, + "step": 1092 + }, + { + "epoch": 1.78, + "learning_rate": 2e-05, + "loss": 1.9305, + "step": 1094 + }, + { + "epoch": 1.78, + "learning_rate": 2e-05, + "loss": 1.8313, + "step": 1096 + }, + { + "epoch": 1.78, + "learning_rate": 2e-05, + "loss": 1.8954, + "step": 1098 + }, + { + "epoch": 1.79, + "learning_rate": 2e-05, + "loss": 1.9984, + "step": 1100 + }, + { + "epoch": 1.79, + "learning_rate": 2e-05, + "loss": 2.0387, + "step": 1102 + }, + { + "epoch": 1.79, + "learning_rate": 2e-05, + "loss": 2.0381, + "step": 1104 + }, + { + "epoch": 1.8, + "learning_rate": 2e-05, + "loss": 2.248, + "step": 1106 + }, + { + "epoch": 1.8, + "learning_rate": 2e-05, + "loss": 2.094, + "step": 1108 + }, + { + "epoch": 1.8, + "learning_rate": 2e-05, + "loss": 2.0483, + "step": 1110 + }, + { + "epoch": 1.81, + "learning_rate": 2e-05, + "loss": 2.2801, + "step": 1112 + }, + { + "epoch": 1.81, + "learning_rate": 2e-05, + "loss": 2.1668, + "step": 1114 + }, + { + "epoch": 1.81, + "learning_rate": 2e-05, + "loss": 1.9295, + "step": 1116 + }, + { + "epoch": 1.82, + "learning_rate": 2e-05, + "loss": 1.5339, + "step": 1118 + }, + { + "epoch": 1.82, + "learning_rate": 2e-05, + "loss": 1.4249, + "step": 1120 + }, + { + "epoch": 1.82, + "learning_rate": 2e-05, + "loss": 1.6543, + "step": 1122 + }, + { + "epoch": 1.83, + "learning_rate": 2e-05, + "loss": 1.5864, + "step": 1124 + }, + { + "epoch": 1.83, + "learning_rate": 2e-05, + "loss": 1.728, + "step": 1126 + }, + { + "epoch": 1.83, + "learning_rate": 2e-05, + "loss": 1.6257, + "step": 1128 + }, + { + "epoch": 1.84, + "learning_rate": 2e-05, + "loss": 1.7116, + "step": 1130 + }, + { + "epoch": 1.84, + "learning_rate": 2e-05, + "loss": 1.7008, + "step": 1132 + }, + { + "epoch": 1.84, + "learning_rate": 2e-05, + "loss": 1.8602, + "step": 1134 + }, + { + "epoch": 1.85, + "learning_rate": 2e-05, + "loss": 1.6803, + "step": 1136 + }, + { + "epoch": 1.85, + "learning_rate": 2e-05, + "loss": 1.5637, + "step": 1138 + }, + { + "epoch": 1.85, + "learning_rate": 2e-05, + "loss": 1.8221, + "step": 1140 + }, + { + "epoch": 1.86, + "learning_rate": 2e-05, + "loss": 1.9582, + "step": 1142 + }, + { + "epoch": 1.86, + "learning_rate": 2e-05, + "loss": 1.9902, + "step": 1144 + }, + { + "epoch": 1.86, + "learning_rate": 2e-05, + "loss": 1.9478, + "step": 1146 + }, + { + "epoch": 1.87, + "learning_rate": 2e-05, + "loss": 2.0016, + "step": 1148 + }, + { + "epoch": 1.87, + "learning_rate": 2e-05, + "loss": 1.9449, + "step": 1150 + }, + { + "epoch": 1.87, + "learning_rate": 2e-05, + "loss": 1.9599, + "step": 1152 + }, + { + "epoch": 1.88, + "learning_rate": 2e-05, + "loss": 2.1458, + "step": 1154 + }, + { + "epoch": 1.88, + "learning_rate": 2e-05, + "loss": 2.0979, + "step": 1156 + }, + { + "epoch": 1.88, + "learning_rate": 2e-05, + "loss": 2.1437, + "step": 1158 + }, + { + "epoch": 1.89, + "learning_rate": 2e-05, + "loss": 2.1176, + "step": 1160 + }, + { + "epoch": 1.89, + "learning_rate": 2e-05, + "loss": 2.0171, + "step": 1162 + }, + { + "epoch": 1.89, + "learning_rate": 2e-05, + "loss": 2.1096, + "step": 1164 + }, + { + "epoch": 1.89, + "learning_rate": 2e-05, + "loss": 1.9555, + "step": 1166 + }, + { + "epoch": 1.9, + "learning_rate": 2e-05, + "loss": 1.5505, + "step": 1168 + }, + { + "epoch": 1.9, + "learning_rate": 2e-05, + "loss": 1.5644, + "step": 1170 + }, + { + "epoch": 1.9, + "learning_rate": 2e-05, + "loss": 1.6082, + "step": 1172 + }, + { + "epoch": 1.91, + "learning_rate": 2e-05, + "loss": 1.675, + "step": 1174 + }, + { + "epoch": 1.91, + "learning_rate": 2e-05, + "loss": 1.7006, + "step": 1176 + }, + { + "epoch": 1.91, + "learning_rate": 2e-05, + "loss": 1.6374, + "step": 1178 + }, + { + "epoch": 1.92, + "learning_rate": 2e-05, + "loss": 1.5624, + "step": 1180 + }, + { + "epoch": 1.92, + "learning_rate": 2e-05, + "loss": 1.875, + "step": 1182 + }, + { + "epoch": 1.92, + "learning_rate": 2e-05, + "loss": 1.7416, + "step": 1184 + }, + { + "epoch": 1.93, + "learning_rate": 2e-05, + "loss": 1.7585, + "step": 1186 + }, + { + "epoch": 1.93, + "learning_rate": 2e-05, + "loss": 1.6764, + "step": 1188 + }, + { + "epoch": 1.93, + "learning_rate": 2e-05, + "loss": 1.7482, + "step": 1190 + }, + { + "epoch": 1.94, + "learning_rate": 2e-05, + "loss": 1.7605, + "step": 1192 + }, + { + "epoch": 1.94, + "learning_rate": 2e-05, + "loss": 1.8099, + "step": 1194 + }, + { + "epoch": 1.94, + "learning_rate": 2e-05, + "loss": 1.7902, + "step": 1196 + }, + { + "epoch": 1.95, + "learning_rate": 2e-05, + "loss": 1.7466, + "step": 1198 + }, + { + "epoch": 1.95, + "learning_rate": 2e-05, + "loss": 1.8851, + "step": 1200 + }, + { + "epoch": 1.95, + "learning_rate": 2e-05, + "loss": 1.8968, + "step": 1202 + }, + { + "epoch": 1.96, + "learning_rate": 2e-05, + "loss": 2.1639, + "step": 1204 + }, + { + "epoch": 1.96, + "learning_rate": 2e-05, + "loss": 2.1004, + "step": 1206 + }, + { + "epoch": 1.96, + "learning_rate": 2e-05, + "loss": 2.068, + "step": 1208 + }, + { + "epoch": 1.97, + "learning_rate": 2e-05, + "loss": 2.1009, + "step": 1210 + }, + { + "epoch": 1.97, + "learning_rate": 2e-05, + "loss": 2.2122, + "step": 1212 + }, + { + "epoch": 1.97, + "learning_rate": 2e-05, + "loss": 2.2953, + "step": 1214 + }, + { + "epoch": 1.98, + "learning_rate": 2e-05, + "loss": 1.9851, + "step": 1216 + }, + { + "epoch": 1.98, + "learning_rate": 2e-05, + "loss": 1.5092, + "step": 1218 + }, + { + "epoch": 1.98, + "learning_rate": 2e-05, + "loss": 1.58, + "step": 1220 + }, + { + "epoch": 1.99, + "learning_rate": 2e-05, + "loss": 1.6222, + "step": 1222 + }, + { + "epoch": 1.99, + "learning_rate": 2e-05, + "loss": 1.7341, + "step": 1224 + }, + { + "epoch": 1.99, + "learning_rate": 2e-05, + "loss": 1.9206, + "step": 1226 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 1.9424, + "step": 1228 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 2.1776, + "step": 1230 + }, + { + "epoch": 2.0, + "learning_rate": 2e-05, + "loss": 2.021, + "step": 1232 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 1.5074, + "step": 1234 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 1.441, + "step": 1236 + }, + { + "epoch": 2.01, + "learning_rate": 2e-05, + "loss": 1.619, + "step": 1238 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 1.6916, + "step": 1240 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 1.5746, + "step": 1242 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 1.7335, + "step": 1244 + }, + { + "epoch": 2.02, + "learning_rate": 2e-05, + "loss": 1.7544, + "step": 1246 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 1.6698, + "step": 1248 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 1.6689, + "step": 1250 + }, + { + "epoch": 2.03, + "learning_rate": 2e-05, + "loss": 1.8378, + "step": 1252 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 1.807, + "step": 1254 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 1.6988, + "step": 1256 + }, + { + "epoch": 2.04, + "learning_rate": 2e-05, + "loss": 1.5926, + "step": 1258 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 1.7638, + "step": 1260 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 1.7585, + "step": 1262 + }, + { + "epoch": 2.05, + "learning_rate": 2e-05, + "loss": 1.969, + "step": 1264 + }, + { + "epoch": 2.06, + "learning_rate": 2e-05, + "loss": 2.0372, + "step": 1266 + }, + { + "epoch": 2.06, + "learning_rate": 2e-05, + "loss": 1.8681, + "step": 1268 + }, + { + "epoch": 2.06, + "learning_rate": 2e-05, + "loss": 1.9413, + "step": 1270 + }, + { + "epoch": 2.07, + "learning_rate": 2e-05, + "loss": 2.0071, + "step": 1272 + }, + { + "epoch": 2.07, + "learning_rate": 2e-05, + "loss": 2.0781, + "step": 1274 + }, + { + "epoch": 2.07, + "learning_rate": 2e-05, + "loss": 2.2626, + "step": 1276 + }, + { + "epoch": 2.08, + "learning_rate": 2e-05, + "loss": 2.3026, + "step": 1278 + }, + { + "epoch": 2.08, + "learning_rate": 2e-05, + "loss": 2.0699, + "step": 1280 + }, + { + "epoch": 2.08, + "learning_rate": 2e-05, + "loss": 1.6375, + "step": 1282 + }, + { + "epoch": 2.09, + "learning_rate": 2e-05, + "loss": 1.5935, + "step": 1284 + }, + { + "epoch": 2.09, + "learning_rate": 2e-05, + "loss": 1.6579, + "step": 1286 + }, + { + "epoch": 2.09, + "learning_rate": 2e-05, + "loss": 1.5962, + "step": 1288 + }, + { + "epoch": 2.1, + "learning_rate": 2e-05, + "loss": 1.5371, + "step": 1290 + }, + { + "epoch": 2.1, + "learning_rate": 2e-05, + "loss": 1.6297, + "step": 1292 + }, + { + "epoch": 2.1, + "learning_rate": 2e-05, + "loss": 1.5664, + "step": 1294 + }, + { + "epoch": 2.11, + "learning_rate": 2e-05, + "loss": 1.6135, + "step": 1296 + }, + { + "epoch": 2.11, + "learning_rate": 2e-05, + "loss": 1.7271, + "step": 1298 + }, + { + "epoch": 2.11, + "learning_rate": 2e-05, + "loss": 1.7224, + "step": 1300 + }, + { + "epoch": 2.12, + "learning_rate": 2e-05, + "loss": 1.6938, + "step": 1302 + }, + { + "epoch": 2.12, + "learning_rate": 2e-05, + "loss": 1.6161, + "step": 1304 + }, + { + "epoch": 2.12, + "learning_rate": 2e-05, + "loss": 1.7213, + "step": 1306 + }, + { + "epoch": 2.13, + "learning_rate": 2e-05, + "loss": 1.7919, + "step": 1308 + }, + { + "epoch": 2.13, + "learning_rate": 2e-05, + "loss": 1.6923, + "step": 1310 + }, + { + "epoch": 2.13, + "learning_rate": 2e-05, + "loss": 1.9948, + "step": 1312 + }, + { + "epoch": 2.14, + "learning_rate": 2e-05, + "loss": 1.8132, + "step": 1314 + }, + { + "epoch": 2.14, + "learning_rate": 2e-05, + "loss": 1.9876, + "step": 1316 + }, + { + "epoch": 2.14, + "learning_rate": 2e-05, + "loss": 1.9332, + "step": 1318 + }, + { + "epoch": 2.15, + "learning_rate": 2e-05, + "loss": 2.0376, + "step": 1320 + }, + { + "epoch": 2.15, + "learning_rate": 2e-05, + "loss": 2.1336, + "step": 1322 + }, + { + "epoch": 2.15, + "learning_rate": 2e-05, + "loss": 2.1492, + "step": 1324 + }, + { + "epoch": 2.15, + "learning_rate": 2e-05, + "loss": 2.2404, + "step": 1326 + }, + { + "epoch": 2.16, + "learning_rate": 2e-05, + "loss": 2.0328, + "step": 1328 + }, + { + "epoch": 2.16, + "learning_rate": 2e-05, + "loss": 2.2208, + "step": 1330 + }, + { + "epoch": 2.16, + "learning_rate": 2e-05, + "loss": 1.6264, + "step": 1332 + }, + { + "epoch": 2.17, + "learning_rate": 2e-05, + "loss": 1.6469, + "step": 1334 + }, + { + "epoch": 2.17, + "learning_rate": 2e-05, + "loss": 1.4819, + "step": 1336 + }, + { + "epoch": 2.17, + "learning_rate": 2e-05, + "loss": 1.6929, + "step": 1338 + }, + { + "epoch": 2.18, + "learning_rate": 2e-05, + "loss": 1.5354, + "step": 1340 + }, + { + "epoch": 2.18, + "learning_rate": 2e-05, + "loss": 1.5995, + "step": 1342 + }, + { + "epoch": 2.18, + "learning_rate": 2e-05, + "loss": 1.7638, + "step": 1344 + }, + { + "epoch": 2.19, + "learning_rate": 2e-05, + "loss": 1.7488, + "step": 1346 + }, + { + "epoch": 2.19, + "learning_rate": 2e-05, + "loss": 1.8151, + "step": 1348 + }, + { + "epoch": 2.19, + "learning_rate": 2e-05, + "loss": 1.7435, + "step": 1350 + }, + { + "epoch": 2.2, + "learning_rate": 2e-05, + "loss": 1.7724, + "step": 1352 + }, + { + "epoch": 2.2, + "learning_rate": 2e-05, + "loss": 1.7307, + "step": 1354 + }, + { + "epoch": 2.2, + "learning_rate": 2e-05, + "loss": 1.6689, + "step": 1356 + }, + { + "epoch": 2.21, + "learning_rate": 2e-05, + "loss": 1.8689, + "step": 1358 + }, + { + "epoch": 2.21, + "learning_rate": 2e-05, + "loss": 1.6998, + "step": 1360 + }, + { + "epoch": 2.21, + "learning_rate": 2e-05, + "loss": 2.0046, + "step": 1362 + }, + { + "epoch": 2.22, + "learning_rate": 2e-05, + "loss": 1.7219, + "step": 1364 + }, + { + "epoch": 2.22, + "learning_rate": 2e-05, + "loss": 1.8812, + "step": 1366 + }, + { + "epoch": 2.22, + "learning_rate": 2e-05, + "loss": 1.8674, + "step": 1368 + }, + { + "epoch": 2.23, + "learning_rate": 2e-05, + "loss": 2.0153, + "step": 1370 + }, + { + "epoch": 2.23, + "learning_rate": 2e-05, + "loss": 2.1036, + "step": 1372 + }, + { + "epoch": 2.23, + "learning_rate": 2e-05, + "loss": 2.2218, + "step": 1374 + }, + { + "epoch": 2.24, + "learning_rate": 2e-05, + "loss": 2.2705, + "step": 1376 + }, + { + "epoch": 2.24, + "learning_rate": 2e-05, + "loss": 2.1251, + "step": 1378 + }, + { + "epoch": 2.24, + "learning_rate": 2e-05, + "loss": 2.27, + "step": 1380 + }, + { + "epoch": 2.25, + "learning_rate": 2e-05, + "loss": 1.7102, + "step": 1382 + }, + { + "epoch": 2.25, + "learning_rate": 2e-05, + "loss": 1.5466, + "step": 1384 + }, + { + "epoch": 2.25, + "learning_rate": 2e-05, + "loss": 1.7316, + "step": 1386 + }, + { + "epoch": 2.26, + "learning_rate": 2e-05, + "loss": 1.5031, + "step": 1388 + }, + { + "epoch": 2.26, + "learning_rate": 2e-05, + "loss": 1.608, + "step": 1390 + }, + { + "epoch": 2.26, + "learning_rate": 2e-05, + "loss": 1.5033, + "step": 1392 + }, + { + "epoch": 2.27, + "learning_rate": 2e-05, + "loss": 1.733, + "step": 1394 + }, + { + "epoch": 2.27, + "learning_rate": 2e-05, + "loss": 1.7289, + "step": 1396 + }, + { + "epoch": 2.27, + "learning_rate": 2e-05, + "loss": 1.7305, + "step": 1398 + }, + { + "epoch": 2.28, + "learning_rate": 2e-05, + "loss": 1.7743, + "step": 1400 + }, + { + "epoch": 2.28, + "learning_rate": 2e-05, + "loss": 1.8423, + "step": 1402 + }, + { + "epoch": 2.28, + "learning_rate": 2e-05, + "loss": 1.6997, + "step": 1404 + }, + { + "epoch": 2.28, + "learning_rate": 2e-05, + "loss": 1.8342, + "step": 1406 + }, + { + "epoch": 2.29, + "learning_rate": 2e-05, + "loss": 1.6843, + "step": 1408 + }, + { + "epoch": 2.29, + "learning_rate": 2e-05, + "loss": 1.8158, + "step": 1410 + }, + { + "epoch": 2.29, + "learning_rate": 2e-05, + "loss": 1.7417, + "step": 1412 + }, + { + "epoch": 2.3, + "learning_rate": 2e-05, + "loss": 1.8145, + "step": 1414 + }, + { + "epoch": 2.3, + "learning_rate": 2e-05, + "loss": 1.9312, + "step": 1416 + }, + { + "epoch": 2.3, + "learning_rate": 2e-05, + "loss": 1.9542, + "step": 1418 + }, + { + "epoch": 2.31, + "learning_rate": 2e-05, + "loss": 2.0767, + "step": 1420 + }, + { + "epoch": 2.31, + "learning_rate": 2e-05, + "loss": 1.9053, + "step": 1422 + }, + { + "epoch": 2.31, + "learning_rate": 2e-05, + "loss": 2.4192, + "step": 1424 + }, + { + "epoch": 2.32, + "learning_rate": 2e-05, + "loss": 2.1551, + "step": 1426 + }, + { + "epoch": 2.32, + "learning_rate": 2e-05, + "loss": 2.1658, + "step": 1428 + }, + { + "epoch": 2.32, + "learning_rate": 2e-05, + "loss": 2.1308, + "step": 1430 + }, + { + "epoch": 2.33, + "learning_rate": 2e-05, + "loss": 1.6135, + "step": 1432 + }, + { + "epoch": 2.33, + "learning_rate": 2e-05, + "loss": 1.5897, + "step": 1434 + }, + { + "epoch": 2.33, + "learning_rate": 2e-05, + "loss": 1.4971, + "step": 1436 + }, + { + "epoch": 2.34, + "learning_rate": 2e-05, + "loss": 1.5434, + "step": 1438 + }, + { + "epoch": 2.34, + "learning_rate": 2e-05, + "loss": 1.6953, + "step": 1440 + }, + { + "epoch": 2.34, + "learning_rate": 2e-05, + "loss": 1.6321, + "step": 1442 + }, + { + "epoch": 2.35, + "learning_rate": 2e-05, + "loss": 1.572, + "step": 1444 + }, + { + "epoch": 2.35, + "learning_rate": 2e-05, + "loss": 1.5595, + "step": 1446 + }, + { + "epoch": 2.35, + "learning_rate": 2e-05, + "loss": 1.8133, + "step": 1448 + }, + { + "epoch": 2.36, + "learning_rate": 2e-05, + "loss": 1.641, + "step": 1450 + }, + { + "epoch": 2.36, + "learning_rate": 2e-05, + "loss": 1.6171, + "step": 1452 + }, + { + "epoch": 2.36, + "learning_rate": 2e-05, + "loss": 1.6589, + "step": 1454 + }, + { + "epoch": 2.37, + "learning_rate": 2e-05, + "loss": 1.786, + "step": 1456 + }, + { + "epoch": 2.37, + "learning_rate": 2e-05, + "loss": 1.7184, + "step": 1458 + }, + { + "epoch": 2.37, + "learning_rate": 2e-05, + "loss": 1.847, + "step": 1460 + }, + { + "epoch": 2.38, + "learning_rate": 2e-05, + "loss": 1.7423, + "step": 1462 + }, + { + "epoch": 2.38, + "learning_rate": 2e-05, + "loss": 2.0235, + "step": 1464 + }, + { + "epoch": 2.38, + "learning_rate": 2e-05, + "loss": 1.9874, + "step": 1466 + }, + { + "epoch": 2.39, + "learning_rate": 2e-05, + "loss": 1.9473, + "step": 1468 + }, + { + "epoch": 2.39, + "learning_rate": 2e-05, + "loss": 2.0052, + "step": 1470 + }, + { + "epoch": 2.39, + "learning_rate": 2e-05, + "loss": 1.9972, + "step": 1472 + }, + { + "epoch": 2.4, + "learning_rate": 2e-05, + "loss": 2.0354, + "step": 1474 + }, + { + "epoch": 2.4, + "learning_rate": 2e-05, + "loss": 2.1694, + "step": 1476 + }, + { + "epoch": 2.4, + "learning_rate": 2e-05, + "loss": 2.0919, + "step": 1478 + }, + { + "epoch": 2.41, + "learning_rate": 2e-05, + "loss": 2.1576, + "step": 1480 + }, + { + "epoch": 2.41, + "learning_rate": 2e-05, + "loss": 1.6458, + "step": 1482 + }, + { + "epoch": 2.41, + "learning_rate": 2e-05, + "loss": 1.5015, + "step": 1484 + }, + { + "epoch": 2.41, + "learning_rate": 2e-05, + "loss": 1.5359, + "step": 1486 + }, + { + "epoch": 2.42, + "learning_rate": 2e-05, + "loss": 1.7065, + "step": 1488 + }, + { + "epoch": 2.42, + "learning_rate": 2e-05, + "loss": 1.6893, + "step": 1490 + }, + { + "epoch": 2.42, + "learning_rate": 2e-05, + "loss": 1.7842, + "step": 1492 + }, + { + "epoch": 2.43, + "learning_rate": 2e-05, + "loss": 1.7229, + "step": 1494 + }, + { + "epoch": 2.43, + "learning_rate": 2e-05, + "loss": 1.6822, + "step": 1496 + }, + { + "epoch": 2.43, + "learning_rate": 2e-05, + "loss": 1.664, + "step": 1498 + }, + { + "epoch": 2.44, + "learning_rate": 2e-05, + "loss": 1.7587, + "step": 1500 + }, + { + "epoch": 2.44, + "learning_rate": 2e-05, + "loss": 1.8268, + "step": 1502 + }, + { + "epoch": 2.44, + "learning_rate": 2e-05, + "loss": 1.6934, + "step": 1504 + }, + { + "epoch": 2.45, + "learning_rate": 2e-05, + "loss": 1.8765, + "step": 1506 + }, + { + "epoch": 2.45, + "learning_rate": 2e-05, + "loss": 1.845, + "step": 1508 + }, + { + "epoch": 2.45, + "learning_rate": 2e-05, + "loss": 1.8817, + "step": 1510 + }, + { + "epoch": 2.46, + "learning_rate": 2e-05, + "loss": 1.727, + "step": 1512 + }, + { + "epoch": 2.46, + "learning_rate": 2e-05, + "loss": 1.848, + "step": 1514 + }, + { + "epoch": 2.46, + "learning_rate": 2e-05, + "loss": 2.0766, + "step": 1516 + }, + { + "epoch": 2.47, + "learning_rate": 2e-05, + "loss": 2.0203, + "step": 1518 + }, + { + "epoch": 2.47, + "learning_rate": 2e-05, + "loss": 2.0457, + "step": 1520 + }, + { + "epoch": 2.47, + "learning_rate": 2e-05, + "loss": 2.1893, + "step": 1522 + }, + { + "epoch": 2.48, + "learning_rate": 2e-05, + "loss": 1.9883, + "step": 1524 + }, + { + "epoch": 2.48, + "learning_rate": 2e-05, + "loss": 2.0896, + "step": 1526 + }, + { + "epoch": 2.48, + "learning_rate": 2e-05, + "loss": 2.1288, + "step": 1528 + }, + { + "epoch": 2.49, + "learning_rate": 2e-05, + "loss": 2.2434, + "step": 1530 + }, + { + "epoch": 2.49, + "learning_rate": 2e-05, + "loss": 1.8621, + "step": 1532 + }, + { + "epoch": 2.49, + "learning_rate": 2e-05, + "loss": 1.5902, + "step": 1534 + }, + { + "epoch": 2.5, + "learning_rate": 2e-05, + "loss": 1.6382, + "step": 1536 + }, + { + "epoch": 2.5, + "learning_rate": 2e-05, + "loss": 1.5176, + "step": 1538 + }, + { + "epoch": 2.5, + "learning_rate": 2e-05, + "loss": 1.7048, + "step": 1540 + }, + { + "epoch": 2.51, + "learning_rate": 2e-05, + "loss": 1.7238, + "step": 1542 + }, + { + "epoch": 2.51, + "learning_rate": 2e-05, + "loss": 1.7027, + "step": 1544 + }, + { + "epoch": 2.51, + "learning_rate": 2e-05, + "loss": 1.6627, + "step": 1546 + }, + { + "epoch": 2.52, + "learning_rate": 2e-05, + "loss": 1.6683, + "step": 1548 + }, + { + "epoch": 2.52, + "learning_rate": 2e-05, + "loss": 1.732, + "step": 1550 + }, + { + "epoch": 2.52, + "learning_rate": 2e-05, + "loss": 1.7039, + "step": 1552 + }, + { + "epoch": 2.53, + "learning_rate": 2e-05, + "loss": 1.5855, + "step": 1554 + }, + { + "epoch": 2.53, + "learning_rate": 2e-05, + "loss": 1.6969, + "step": 1556 + }, + { + "epoch": 2.53, + "learning_rate": 2e-05, + "loss": 1.7362, + "step": 1558 + }, + { + "epoch": 2.54, + "learning_rate": 2e-05, + "loss": 1.6448, + "step": 1560 + }, + { + "epoch": 2.54, + "learning_rate": 2e-05, + "loss": 1.7654, + "step": 1562 + }, + { + "epoch": 2.54, + "learning_rate": 2e-05, + "loss": 1.7655, + "step": 1564 + }, + { + "epoch": 2.54, + "learning_rate": 2e-05, + "loss": 1.8978, + "step": 1566 + }, + { + "epoch": 2.55, + "learning_rate": 2e-05, + "loss": 1.9586, + "step": 1568 + }, + { + "epoch": 2.55, + "learning_rate": 2e-05, + "loss": 2.0546, + "step": 1570 + }, + { + "epoch": 2.55, + "learning_rate": 2e-05, + "loss": 1.9625, + "step": 1572 + }, + { + "epoch": 2.56, + "learning_rate": 2e-05, + "loss": 2.0044, + "step": 1574 + }, + { + "epoch": 2.56, + "learning_rate": 2e-05, + "loss": 2.0529, + "step": 1576 + }, + { + "epoch": 2.56, + "learning_rate": 2e-05, + "loss": 2.1048, + "step": 1578 + }, + { + "epoch": 2.57, + "learning_rate": 2e-05, + "loss": 2.0548, + "step": 1580 + }, + { + "epoch": 2.57, + "learning_rate": 2e-05, + "loss": 1.7534, + "step": 1582 + }, + { + "epoch": 2.57, + "learning_rate": 2e-05, + "loss": 1.4283, + "step": 1584 + }, + { + "epoch": 2.58, + "learning_rate": 2e-05, + "loss": 1.6125, + "step": 1586 + }, + { + "epoch": 2.58, + "learning_rate": 2e-05, + "loss": 1.5568, + "step": 1588 + }, + { + "epoch": 2.58, + "learning_rate": 2e-05, + "loss": 1.6719, + "step": 1590 + }, + { + "epoch": 2.59, + "learning_rate": 2e-05, + "loss": 1.6142, + "step": 1592 + }, + { + "epoch": 2.59, + "learning_rate": 2e-05, + "loss": 1.6434, + "step": 1594 + }, + { + "epoch": 2.59, + "learning_rate": 2e-05, + "loss": 1.7194, + "step": 1596 + }, + { + "epoch": 2.6, + "learning_rate": 2e-05, + "loss": 1.7541, + "step": 1598 + }, + { + "epoch": 2.6, + "learning_rate": 2e-05, + "loss": 1.6681, + "step": 1600 + }, + { + "epoch": 2.6, + "learning_rate": 2e-05, + "loss": 1.81, + "step": 1602 + }, + { + "epoch": 2.61, + "learning_rate": 2e-05, + "loss": 1.6948, + "step": 1604 + }, + { + "epoch": 2.61, + "learning_rate": 2e-05, + "loss": 1.7533, + "step": 1606 + }, + { + "epoch": 2.61, + "learning_rate": 2e-05, + "loss": 1.8256, + "step": 1608 + }, + { + "epoch": 2.62, + "learning_rate": 2e-05, + "loss": 1.8959, + "step": 1610 + }, + { + "epoch": 2.62, + "learning_rate": 2e-05, + "loss": 1.7526, + "step": 1612 + }, + { + "epoch": 2.62, + "learning_rate": 2e-05, + "loss": 1.8277, + "step": 1614 + }, + { + "epoch": 2.63, + "learning_rate": 2e-05, + "loss": 1.9229, + "step": 1616 + }, + { + "epoch": 2.63, + "learning_rate": 2e-05, + "loss": 1.9832, + "step": 1618 + }, + { + "epoch": 2.63, + "learning_rate": 2e-05, + "loss": 1.903, + "step": 1620 + }, + { + "epoch": 2.64, + "learning_rate": 2e-05, + "loss": 2.1626, + "step": 1622 + }, + { + "epoch": 2.64, + "learning_rate": 2e-05, + "loss": 2.2053, + "step": 1624 + }, + { + "epoch": 2.64, + "learning_rate": 2e-05, + "loss": 2.176, + "step": 1626 + }, + { + "epoch": 2.65, + "learning_rate": 2e-05, + "loss": 2.1653, + "step": 1628 + }, + { + "epoch": 2.65, + "learning_rate": 2e-05, + "loss": 2.179, + "step": 1630 + }, + { + "epoch": 2.65, + "learning_rate": 2e-05, + "loss": 1.7573, + "step": 1632 + }, + { + "epoch": 2.66, + "learning_rate": 2e-05, + "loss": 1.5932, + "step": 1634 + }, + { + "epoch": 2.66, + "learning_rate": 2e-05, + "loss": 1.5588, + "step": 1636 + }, + { + "epoch": 2.66, + "learning_rate": 2e-05, + "loss": 1.6959, + "step": 1638 + }, + { + "epoch": 2.67, + "learning_rate": 2e-05, + "loss": 1.598, + "step": 1640 + }, + { + "epoch": 2.67, + "learning_rate": 2e-05, + "loss": 1.748, + "step": 1642 + }, + { + "epoch": 2.67, + "learning_rate": 2e-05, + "loss": 1.7074, + "step": 1644 + }, + { + "epoch": 2.67, + "learning_rate": 2e-05, + "loss": 1.8215, + "step": 1646 + }, + { + "epoch": 2.68, + "learning_rate": 2e-05, + "loss": 1.6872, + "step": 1648 + }, + { + "epoch": 2.68, + "learning_rate": 2e-05, + "loss": 1.734, + "step": 1650 + }, + { + "epoch": 2.68, + "learning_rate": 2e-05, + "loss": 1.6693, + "step": 1652 + }, + { + "epoch": 2.69, + "learning_rate": 2e-05, + "loss": 1.7714, + "step": 1654 + }, + { + "epoch": 2.69, + "learning_rate": 2e-05, + "loss": 1.7195, + "step": 1656 + }, + { + "epoch": 2.69, + "learning_rate": 2e-05, + "loss": 1.7976, + "step": 1658 + }, + { + "epoch": 2.7, + "learning_rate": 2e-05, + "loss": 1.8392, + "step": 1660 + }, + { + "epoch": 2.7, + "learning_rate": 2e-05, + "loss": 1.7095, + "step": 1662 + }, + { + "epoch": 2.7, + "learning_rate": 2e-05, + "loss": 1.9426, + "step": 1664 + }, + { + "epoch": 2.71, + "learning_rate": 2e-05, + "loss": 1.9583, + "step": 1666 + }, + { + "epoch": 2.71, + "learning_rate": 2e-05, + "loss": 2.028, + "step": 1668 + }, + { + "epoch": 2.71, + "learning_rate": 2e-05, + "loss": 2.1083, + "step": 1670 + }, + { + "epoch": 2.72, + "learning_rate": 2e-05, + "loss": 2.1001, + "step": 1672 + }, + { + "epoch": 2.72, + "learning_rate": 2e-05, + "loss": 2.2984, + "step": 1674 + }, + { + "epoch": 2.72, + "learning_rate": 2e-05, + "loss": 2.0889, + "step": 1676 + }, + { + "epoch": 2.73, + "learning_rate": 2e-05, + "loss": 2.3576, + "step": 1678 + }, + { + "epoch": 2.73, + "learning_rate": 2e-05, + "loss": 2.0517, + "step": 1680 + }, + { + "epoch": 2.73, + "learning_rate": 2e-05, + "loss": 1.6826, + "step": 1682 + }, + { + "epoch": 2.74, + "learning_rate": 2e-05, + "loss": 1.5038, + "step": 1684 + }, + { + "epoch": 2.74, + "learning_rate": 2e-05, + "loss": 1.6221, + "step": 1686 + }, + { + "epoch": 2.74, + "learning_rate": 2e-05, + "loss": 1.5329, + "step": 1688 + }, + { + "epoch": 2.75, + "learning_rate": 2e-05, + "loss": 1.6428, + "step": 1690 + }, + { + "epoch": 2.75, + "learning_rate": 2e-05, + "loss": 1.6739, + "step": 1692 + }, + { + "epoch": 2.75, + "learning_rate": 2e-05, + "loss": 1.6846, + "step": 1694 + }, + { + "epoch": 2.76, + "learning_rate": 2e-05, + "loss": 1.7064, + "step": 1696 + }, + { + "epoch": 2.76, + "learning_rate": 2e-05, + "loss": 1.6818, + "step": 1698 + }, + { + "epoch": 2.76, + "learning_rate": 2e-05, + "loss": 1.7676, + "step": 1700 + }, + { + "epoch": 2.77, + "learning_rate": 2e-05, + "loss": 1.6708, + "step": 1702 + }, + { + "epoch": 2.77, + "learning_rate": 2e-05, + "loss": 1.7152, + "step": 1704 + }, + { + "epoch": 2.77, + "learning_rate": 2e-05, + "loss": 1.4785, + "step": 1706 + }, + { + "epoch": 2.78, + "learning_rate": 2e-05, + "loss": 1.9122, + "step": 1708 + }, + { + "epoch": 2.78, + "learning_rate": 2e-05, + "loss": 1.9274, + "step": 1710 + }, + { + "epoch": 2.78, + "learning_rate": 2e-05, + "loss": 1.982, + "step": 1712 + }, + { + "epoch": 2.79, + "learning_rate": 2e-05, + "loss": 1.9373, + "step": 1714 + }, + { + "epoch": 2.79, + "learning_rate": 2e-05, + "loss": 1.8395, + "step": 1716 + }, + { + "epoch": 2.79, + "learning_rate": 2e-05, + "loss": 1.8692, + "step": 1718 + }, + { + "epoch": 2.8, + "learning_rate": 2e-05, + "loss": 2.1762, + "step": 1720 + }, + { + "epoch": 2.8, + "learning_rate": 2e-05, + "loss": 2.0499, + "step": 1722 + }, + { + "epoch": 2.8, + "learning_rate": 2e-05, + "loss": 2.1598, + "step": 1724 + }, + { + "epoch": 2.8, + "learning_rate": 2e-05, + "loss": 2.2524, + "step": 1726 + }, + { + "epoch": 2.81, + "learning_rate": 2e-05, + "loss": 2.2295, + "step": 1728 + }, + { + "epoch": 2.81, + "learning_rate": 2e-05, + "loss": 2.4519, + "step": 1730 + }, + { + "epoch": 2.81, + "learning_rate": 2e-05, + "loss": 1.7711, + "step": 1732 + }, + { + "epoch": 2.82, + "learning_rate": 2e-05, + "loss": 1.5325, + "step": 1734 + }, + { + "epoch": 2.82, + "learning_rate": 2e-05, + "loss": 1.6951, + "step": 1736 + }, + { + "epoch": 2.82, + "learning_rate": 2e-05, + "loss": 1.5366, + "step": 1738 + }, + { + "epoch": 2.83, + "learning_rate": 2e-05, + "loss": 1.5947, + "step": 1740 + }, + { + "epoch": 2.83, + "learning_rate": 2e-05, + "loss": 1.6359, + "step": 1742 + }, + { + "epoch": 2.83, + "learning_rate": 2e-05, + "loss": 1.6945, + "step": 1744 + }, + { + "epoch": 2.84, + "learning_rate": 2e-05, + "loss": 1.7055, + "step": 1746 + }, + { + "epoch": 2.84, + "learning_rate": 2e-05, + "loss": 1.5715, + "step": 1748 + }, + { + "epoch": 2.84, + "learning_rate": 2e-05, + "loss": 1.6131, + "step": 1750 + }, + { + "epoch": 2.85, + "learning_rate": 2e-05, + "loss": 1.7353, + "step": 1752 + }, + { + "epoch": 2.85, + "learning_rate": 2e-05, + "loss": 1.6901, + "step": 1754 + }, + { + "epoch": 2.85, + "learning_rate": 2e-05, + "loss": 1.6287, + "step": 1756 + }, + { + "epoch": 2.86, + "learning_rate": 2e-05, + "loss": 1.7596, + "step": 1758 + }, + { + "epoch": 2.86, + "learning_rate": 2e-05, + "loss": 1.6615, + "step": 1760 + }, + { + "epoch": 2.86, + "learning_rate": 2e-05, + "loss": 1.7761, + "step": 1762 + }, + { + "epoch": 2.87, + "learning_rate": 2e-05, + "loss": 1.7353, + "step": 1764 + }, + { + "epoch": 2.87, + "learning_rate": 2e-05, + "loss": 2.0409, + "step": 1766 + }, + { + "epoch": 2.87, + "learning_rate": 2e-05, + "loss": 1.8417, + "step": 1768 + }, + { + "epoch": 2.88, + "learning_rate": 2e-05, + "loss": 1.9822, + "step": 1770 + }, + { + "epoch": 2.88, + "learning_rate": 2e-05, + "loss": 1.935, + "step": 1772 + }, + { + "epoch": 2.88, + "learning_rate": 2e-05, + "loss": 2.191, + "step": 1774 + }, + { + "epoch": 2.89, + "learning_rate": 2e-05, + "loss": 2.1619, + "step": 1776 + }, + { + "epoch": 2.89, + "learning_rate": 2e-05, + "loss": 2.179, + "step": 1778 + }, + { + "epoch": 2.89, + "learning_rate": 2e-05, + "loss": 2.4226, + "step": 1780 + }, + { + "epoch": 2.9, + "learning_rate": 2e-05, + "loss": 1.752, + "step": 1782 + }, + { + "epoch": 2.9, + "learning_rate": 2e-05, + "loss": 1.4133, + "step": 1784 + }, + { + "epoch": 2.9, + "learning_rate": 2e-05, + "loss": 1.3559, + "step": 1786 + }, + { + "epoch": 2.91, + "learning_rate": 2e-05, + "loss": 1.4966, + "step": 1788 + }, + { + "epoch": 2.91, + "learning_rate": 2e-05, + "loss": 1.5845, + "step": 1790 + }, + { + "epoch": 2.91, + "learning_rate": 2e-05, + "loss": 1.6532, + "step": 1792 + }, + { + "epoch": 2.92, + "learning_rate": 2e-05, + "loss": 1.6712, + "step": 1794 + }, + { + "epoch": 2.92, + "learning_rate": 2e-05, + "loss": 1.7321, + "step": 1796 + }, + { + "epoch": 2.92, + "learning_rate": 2e-05, + "loss": 1.4819, + "step": 1798 + }, + { + "epoch": 2.93, + "learning_rate": 2e-05, + "loss": 1.7174, + "step": 1800 } ], - "max_steps": 1875, - "num_train_epochs": 4, - "total_flos": 4967090265784320.0, + "max_steps": 1845, + "num_train_epochs": 3, + "total_flos": 3.908152132295885e+16, "trial_name": null, "trial_params": null }