{ "best_metric": null, "best_model_checkpoint": null, <<<<<<< HEAD "epoch": 408.505875769446, "global_step": 730000, ======= "epoch": 111.9194180190263, "global_step": 200000, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { <<<<<<< HEAD "epoch": 5.6, "learning_rate": 2.3437499999999998e-07, "loss": 0.8947, "step": 10000 }, { "epoch": 5.6, "eval_loss": 0.7632947564125061, "eval_runtime": 76.0776, "eval_samples_per_second": 101.633, "eval_steps_per_second": 12.711, "step": 10000 }, { "epoch": 11.19, "learning_rate": 4.6874999999999996e-07, "loss": 0.7738, "step": 20000 }, { "epoch": 11.19, "eval_loss": 0.7603365182876587, "eval_runtime": 76.429, "eval_samples_per_second": 101.166, "eval_steps_per_second": 12.652, "step": 20000 }, { "epoch": 16.79, "learning_rate": 7.031249999999999e-07, "loss": 0.7725, "step": 30000 }, { "epoch": 16.79, "eval_loss": 0.7571617960929871, "eval_runtime": 76.146, "eval_samples_per_second": 101.542, "eval_steps_per_second": 12.699, "step": 30000 }, { "epoch": 22.38, "learning_rate": 9.374999999999999e-07, "loss": 0.7715, "step": 40000 }, { "epoch": 22.38, "eval_loss": 0.7568734884262085, "eval_runtime": 76.4339, "eval_samples_per_second": 101.159, "eval_steps_per_second": 12.651, ======= "epoch": 11.19, "learning_rate": 1.8749999999999998e-06, "loss": 0.8164, "step": 10000 }, { "epoch": 11.19, "eval_loss": 0.7568955421447754, "eval_runtime": 301.8174, "eval_samples_per_second": 25.618, "eval_steps_per_second": 1.604, "step": 10000 }, { "epoch": 22.37, "learning_rate": 3.7499999999999997e-06, "loss": 0.7702, "step": 20000 }, { "epoch": 22.37, "eval_loss": 0.7498099207878113, "eval_runtime": 249.204, "eval_samples_per_second": 31.027, "eval_steps_per_second": 1.942, "step": 20000 }, { "epoch": 33.56, "learning_rate": 5.6249999999999995e-06, "loss": 0.7668, "step": 30000 }, { "epoch": 33.56, "eval_loss": 0.7477062344551086, "eval_runtime": 189.0409, "eval_samples_per_second": 40.901, "eval_steps_per_second": 2.56, "step": 30000 }, { "epoch": 44.74, "learning_rate": 7.499999999999999e-06, "loss": 0.7655, "step": 40000 }, { "epoch": 44.74, "eval_loss": 0.7450574040412903, "eval_runtime": 66.2284, "eval_samples_per_second": 116.748, "eval_steps_per_second": 7.308, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 40000 }, { "epoch": 27.98, <<<<<<< HEAD "learning_rate": 1.171875e-06, "loss": 0.7695, ======= "learning_rate": 9.375e-06, "loss": 0.7653, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 50000 }, { "epoch": 27.98, <<<<<<< HEAD "eval_loss": 0.7500209212303162, "eval_runtime": 76.3619, "eval_samples_per_second": 101.255, "eval_steps_per_second": 12.663, ======= "eval_loss": 0.7478589415550232, "eval_runtime": 76.5001, "eval_samples_per_second": 101.072, "eval_steps_per_second": 12.641, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 50000 }, { "epoch": 33.58, <<<<<<< HEAD "learning_rate": 1.4062499999999999e-06, "loss": 0.7688, ======= "learning_rate": 1.1249999999999999e-05, "loss": 0.7648, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 60000 }, { "epoch": 33.58, <<<<<<< HEAD "eval_loss": 0.7491664886474609, "eval_runtime": 76.419, "eval_samples_per_second": 101.179, "eval_steps_per_second": 12.654, ======= "eval_loss": 0.7447686195373535, "eval_runtime": 76.3539, "eval_samples_per_second": 101.265, "eval_steps_per_second": 12.665, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 60000 }, { "epoch": 39.17, <<<<<<< HEAD "learning_rate": 1.6406249999999999e-06, "loss": 0.768, ======= "learning_rate": 1.3124999999999999e-05, "loss": 0.7645, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 70000 }, { "epoch": 39.17, <<<<<<< HEAD "eval_loss": 0.748078465461731, "eval_runtime": 76.5312, "eval_samples_per_second": 101.031, "eval_steps_per_second": 12.635, ======= "eval_loss": 0.7464274764060974, "eval_runtime": 76.7958, "eval_samples_per_second": 100.683, "eval_steps_per_second": 12.592, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 70000 }, { "epoch": 44.77, <<<<<<< HEAD "learning_rate": 1.8749999999999998e-06, "loss": 0.7667, ======= "learning_rate": 1.4999999999999999e-05, "loss": 0.7642, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 80000 }, { "epoch": 44.77, <<<<<<< HEAD "eval_loss": 0.7448051571846008, "eval_runtime": 76.5688, "eval_samples_per_second": 100.981, "eval_steps_per_second": 12.629, ======= "eval_loss": 0.7449608445167542, "eval_runtime": 122.9116, "eval_samples_per_second": 62.907, "eval_steps_per_second": 7.867, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 80000 }, { "epoch": 50.36, <<<<<<< HEAD "learning_rate": 2.109375e-06, "loss": 0.7663, ======= "learning_rate": 1.6875e-05, "loss": 0.7636, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 90000 }, { "epoch": 50.36, <<<<<<< HEAD "eval_loss": 0.7472007870674133, "eval_runtime": 76.5244, "eval_samples_per_second": 101.04, "eval_steps_per_second": 12.636, ======= "eval_loss": 0.7427342534065247, "eval_runtime": 76.4172, "eval_samples_per_second": 101.181, "eval_steps_per_second": 12.654, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 90000 }, { "epoch": 55.96, <<<<<<< HEAD "learning_rate": 2.34375e-06, "loss": 0.766, ======= "learning_rate": 2e-05, "loss": 0.7602, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 100000 }, { "epoch": 55.96, <<<<<<< HEAD "eval_loss": 0.7444973587989807, "eval_runtime": 76.6067, "eval_samples_per_second": 100.931, "eval_steps_per_second": 12.623, ======= "eval_loss": 0.726163387298584, "eval_runtime": 76.3938, "eval_samples_per_second": 101.212, "eval_steps_per_second": 12.658, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 100000 }, { "epoch": 61.56, <<<<<<< HEAD "learning_rate": 2.578125e-06, "loss": 0.7656, ======= "learning_rate": 2e-05, "loss": 0.7279, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 110000 }, { "epoch": 61.56, <<<<<<< HEAD "eval_loss": 0.7434288263320923, "eval_runtime": 76.5916, "eval_samples_per_second": 100.951, "eval_steps_per_second": 12.625, ======= "eval_loss": 0.6971690654754639, "eval_runtime": 76.7625, "eval_samples_per_second": 100.726, "eval_steps_per_second": 12.597, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 110000 }, { "epoch": 67.15, <<<<<<< HEAD "learning_rate": 2.8124999999999998e-06, "loss": 0.7654, ======= "learning_rate": 2e-05, "loss": 0.6981, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 120000 }, { "epoch": 67.15, <<<<<<< HEAD "eval_loss": 0.7411925196647644, "eval_runtime": 76.517, "eval_samples_per_second": 101.049, "eval_steps_per_second": 12.638, ======= "eval_loss": 0.6809367537498474, "eval_runtime": 76.4831, "eval_samples_per_second": 101.094, "eval_steps_per_second": 12.643, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 120000 }, { "epoch": 72.75, <<<<<<< HEAD "learning_rate": 3.046875e-06, "loss": 0.7652, ======= "learning_rate": 2e-05, "loss": 0.6781, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 130000 }, { "epoch": 72.75, <<<<<<< HEAD "eval_loss": 0.7399063110351562, "eval_runtime": 76.4205, "eval_samples_per_second": 101.177, "eval_steps_per_second": 12.654, ======= "eval_loss": 0.6643149852752686, "eval_runtime": 76.5075, "eval_samples_per_second": 101.062, "eval_steps_per_second": 12.639, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 130000 }, { "epoch": 78.34, <<<<<<< HEAD "learning_rate": 3.2812499999999997e-06, "loss": 0.7649, ======= "learning_rate": 2e-05, "loss": 0.6612, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 140000 }, { "epoch": 78.34, <<<<<<< HEAD "eval_loss": 0.7432417869567871, "eval_runtime": 76.3896, "eval_samples_per_second": 101.218, "eval_steps_per_second": 12.659, ======= "eval_loss": 0.653438150882721, "eval_runtime": 76.7069, "eval_samples_per_second": 100.799, "eval_steps_per_second": 12.606, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 140000 }, { "epoch": 83.94, <<<<<<< HEAD "learning_rate": 3.515625e-06, "loss": 0.7647, ======= "learning_rate": 2e-05, "loss": 0.6483, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 150000 }, { "epoch": 83.94, <<<<<<< HEAD "eval_loss": 0.7411432862281799, "eval_runtime": 76.5523, "eval_samples_per_second": 101.003, "eval_steps_per_second": 12.632, ======= "eval_loss": 0.6426078081130981, "eval_runtime": 76.587, "eval_samples_per_second": 100.957, "eval_steps_per_second": 12.626, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 150000 }, { "epoch": 89.54, <<<<<<< HEAD "learning_rate": 3.7499999999999997e-06, "loss": 0.7645, ======= "learning_rate": 2e-05, "loss": 0.6389, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 160000 }, { "epoch": 89.54, <<<<<<< HEAD "eval_loss": 0.7415673136711121, "eval_runtime": 76.1013, "eval_samples_per_second": 101.601, "eval_steps_per_second": 12.707, ======= "eval_loss": 0.6356751918792725, "eval_runtime": 76.2962, "eval_samples_per_second": 101.342, "eval_steps_per_second": 12.674, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 160000 }, { "epoch": 95.13, <<<<<<< HEAD "learning_rate": 3.9843749999999994e-06, "loss": 0.7642, ======= "learning_rate": 2e-05, "loss": 0.6318, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 170000 }, { "epoch": 95.13, <<<<<<< HEAD "eval_loss": 0.742856502532959, "eval_runtime": 76.276, "eval_samples_per_second": 101.369, "eval_steps_per_second": 12.678, ======= "eval_loss": 0.6319578289985657, "eval_runtime": 134.8378, "eval_samples_per_second": 57.343, "eval_steps_per_second": 7.172, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 170000 }, { "epoch": 100.73, <<<<<<< HEAD "learning_rate": 4.21875e-06, "loss": 0.764, ======= "learning_rate": 2e-05, "loss": 0.6261, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 180000 }, { "epoch": 100.73, <<<<<<< HEAD "eval_loss": 0.7411246299743652, "eval_runtime": 76.2112, "eval_samples_per_second": 101.455, "eval_steps_per_second": 12.688, ======= "eval_loss": 0.6279829740524292, "eval_runtime": 76.2996, "eval_samples_per_second": 101.337, "eval_steps_per_second": 12.674, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 180000 }, { "epoch": 106.32, <<<<<<< HEAD "learning_rate": 4.453125e-06, "loss": 0.764, ======= "learning_rate": 2e-05, "loss": 0.6214, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 190000 }, { "epoch": 106.32, <<<<<<< HEAD "eval_loss": 0.7412048578262329, "eval_runtime": 76.6531, "eval_samples_per_second": 100.87, "eval_steps_per_second": 12.615, ======= "eval_loss": 0.6199918389320374, "eval_runtime": 76.2832, "eval_samples_per_second": 101.359, "eval_steps_per_second": 12.676, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 190000 }, { "epoch": 111.92, <<<<<<< HEAD "learning_rate": 1e-05, "loss": 0.7632, ======= "learning_rate": 2e-05, "loss": 0.6177, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "step": 200000 }, { "epoch": 111.92, <<<<<<< HEAD "eval_loss": 0.7407946586608887, "eval_runtime": 76.5545, "eval_samples_per_second": 101.0, "eval_steps_per_second": 12.632, "step": 200000 }, { "epoch": 117.52, "learning_rate": 1e-05, "loss": 0.7575, "step": 210000 }, { "epoch": 117.52, "eval_loss": 0.7322171330451965, "eval_runtime": 76.7018, "eval_samples_per_second": 100.806, "eval_steps_per_second": 12.607, "step": 210000 }, { "epoch": 123.11, "learning_rate": 1e-05, "loss": 0.7422, "step": 220000 }, { "epoch": 123.11, "eval_loss": 0.7116619944572449, "eval_runtime": 76.7768, "eval_samples_per_second": 100.707, "eval_steps_per_second": 12.595, "step": 220000 }, { "epoch": 128.71, "learning_rate": 1e-05, "loss": 0.7142, "step": 230000 }, { "epoch": 128.71, "eval_loss": 0.6831667423248291, "eval_runtime": 76.4706, "eval_samples_per_second": 101.111, "eval_steps_per_second": 12.645, "step": 230000 }, { "epoch": 134.3, "learning_rate": 1e-05, "loss": 0.6903, "step": 240000 }, { "epoch": 134.3, "eval_loss": 0.6659817695617676, "eval_runtime": 76.4365, "eval_samples_per_second": 101.156, "eval_steps_per_second": 12.651, "step": 240000 }, { "epoch": 139.9, "learning_rate": 1e-05, "loss": 0.6732, "step": 250000 }, { "epoch": 139.9, "eval_loss": 0.6514819860458374, "eval_runtime": 76.6466, "eval_samples_per_second": 100.879, "eval_steps_per_second": 12.616, "step": 250000 }, { "epoch": 145.5, "learning_rate": 1e-05, "loss": 0.661, "step": 260000 }, { "epoch": 145.5, "eval_loss": 0.6453074812889099, "eval_runtime": 76.4842, "eval_samples_per_second": 101.093, "eval_steps_per_second": 12.643, "step": 260000 }, { "epoch": 151.09, "learning_rate": 1e-05, "loss": 0.652, "step": 270000 }, { "epoch": 151.09, "eval_loss": 0.637267529964447, "eval_runtime": 76.4521, "eval_samples_per_second": 101.135, "eval_steps_per_second": 12.648, "step": 270000 }, { "epoch": 156.69, "learning_rate": 1e-05, "loss": 0.6446, "step": 280000 }, { "epoch": 156.69, "eval_loss": 0.6328049302101135, "eval_runtime": 76.4339, "eval_samples_per_second": 101.159, "eval_steps_per_second": 12.651, "step": 280000 }, { "epoch": 162.28, "learning_rate": 1e-05, "loss": 0.6384, "step": 290000 }, { "epoch": 162.28, "eval_loss": 0.6286044120788574, "eval_runtime": 76.4408, "eval_samples_per_second": 101.15, "eval_steps_per_second": 12.65, "step": 290000 }, { "epoch": 167.88, "learning_rate": 1e-05, "loss": 0.6313, "step": 300000 }, { "epoch": 167.88, "eval_loss": 0.627047598361969, "eval_runtime": 76.2029, "eval_samples_per_second": 101.466, "eval_steps_per_second": 12.69, "step": 300000 }, { "epoch": 173.48, "learning_rate": 1e-05, "loss": 0.6267, "step": 310000 }, { "epoch": 173.48, "eval_loss": 0.6226180195808411, "eval_runtime": 76.351, "eval_samples_per_second": 101.269, "eval_steps_per_second": 12.665, "step": 310000 }, { "epoch": 179.07, "learning_rate": 1e-05, "loss": 0.6225, "step": 320000 }, { "epoch": 179.07, "eval_loss": 0.6174684166908264, "eval_runtime": 77.0828, "eval_samples_per_second": 100.308, "eval_steps_per_second": 12.545, "step": 320000 }, { "epoch": 184.67, "learning_rate": 1e-05, "loss": 0.6195, "step": 330000 }, { "epoch": 184.67, "eval_loss": 0.6189109086990356, "eval_runtime": 76.6515, "eval_samples_per_second": 100.872, "eval_steps_per_second": 12.616, "step": 330000 }, { "epoch": 190.26, "learning_rate": 1e-05, "loss": 0.6166, "step": 340000 }, { "epoch": 190.26, "eval_loss": 0.6162586808204651, "eval_runtime": 76.6287, "eval_samples_per_second": 100.902, "eval_steps_per_second": 12.619, "step": 340000 }, { "epoch": 195.86, "learning_rate": 1e-05, "loss": 0.614, "step": 350000 }, { "epoch": 195.86, "eval_loss": 0.6159895658493042, "eval_runtime": 76.4934, "eval_samples_per_second": 101.081, "eval_steps_per_second": 12.642, "step": 350000 }, { "epoch": 201.45, "learning_rate": 1e-05, "loss": 0.6117, "step": 360000 }, { "epoch": 201.45, "eval_loss": 0.6115593910217285, "eval_runtime": 75.8517, "eval_samples_per_second": 101.936, "eval_steps_per_second": 12.749, "step": 360000 }, { "epoch": 207.05, "learning_rate": 1e-05, "loss": 0.6094, "step": 370000 }, { "epoch": 207.05, "eval_loss": 0.6114900708198547, "eval_runtime": 76.348, "eval_samples_per_second": 101.273, "eval_steps_per_second": 12.666, "step": 370000 }, { "epoch": 212.65, "learning_rate": 1e-05, "loss": 0.6071, "step": 380000 }, { "epoch": 212.65, "eval_loss": 0.6110843420028687, "eval_runtime": 76.4292, "eval_samples_per_second": 101.165, "eval_steps_per_second": 12.652, "step": 380000 }, { "epoch": 218.24, "learning_rate": 1e-05, "loss": 0.6048, "step": 390000 }, { "epoch": 218.24, "eval_loss": 0.6108397245407104, "eval_runtime": 76.2883, "eval_samples_per_second": 101.352, "eval_steps_per_second": 12.676, "step": 390000 }, { "epoch": 223.84, "learning_rate": 1e-05, "loss": 0.6025, "step": 400000 }, { "epoch": 223.84, "eval_loss": 0.6071902513504028, "eval_runtime": 76.1442, "eval_samples_per_second": 101.544, "eval_steps_per_second": 12.7, "step": 400000 }, { "epoch": 229.43, "learning_rate": 1e-05, "loss": 0.6006, "step": 410000 }, { "epoch": 229.43, "eval_loss": 0.6058005690574646, "eval_runtime": 76.1177, "eval_samples_per_second": 101.579, "eval_steps_per_second": 12.704, "step": 410000 }, { "epoch": 235.03, "learning_rate": 1e-05, "loss": 0.599, "step": 420000 }, { "epoch": 235.03, "eval_loss": 0.6018807888031006, "eval_runtime": 76.6301, "eval_samples_per_second": 100.9, "eval_steps_per_second": 12.619, "step": 420000 }, { "epoch": 240.63, "learning_rate": 1e-05, "loss": 0.5969, "step": 430000 }, { "epoch": 240.63, "eval_loss": 0.6019513010978699, "eval_runtime": 76.6382, "eval_samples_per_second": 100.89, "eval_steps_per_second": 12.618, "step": 430000 }, { "epoch": 246.22, "learning_rate": 1e-05, "loss": 0.5956, "step": 440000 }, { "epoch": 246.22, "eval_loss": 0.6009297370910645, "eval_runtime": 76.7463, "eval_samples_per_second": 100.747, "eval_steps_per_second": 12.6, "step": 440000 }, { "epoch": 251.82, "learning_rate": 1e-05, "loss": 0.5937, "step": 450000 }, { "epoch": 251.82, "eval_loss": 0.6020432114601135, "eval_runtime": 76.498, "eval_samples_per_second": 101.075, "eval_steps_per_second": 12.641, "step": 450000 }, { "epoch": 257.41, "learning_rate": 1e-05, "loss": 0.5923, "step": 460000 }, { "epoch": 257.41, "eval_loss": 0.5997503399848938, "eval_runtime": 76.4264, "eval_samples_per_second": 101.169, "eval_steps_per_second": 12.653, "step": 460000 }, { "epoch": 263.01, "learning_rate": 1e-05, "loss": 0.5907, "step": 470000 }, { "epoch": 263.01, "eval_loss": 0.6007161140441895, "eval_runtime": 76.517, "eval_samples_per_second": 101.049, "eval_steps_per_second": 12.638, "step": 470000 }, { "epoch": 268.61, "learning_rate": 1e-05, "loss": 0.5894, "step": 480000 }, { "epoch": 268.61, "eval_loss": 0.5984556674957275, "eval_runtime": 76.4528, "eval_samples_per_second": 101.134, "eval_steps_per_second": 12.648, "step": 480000 }, { "epoch": 274.2, "learning_rate": 1e-05, "loss": 0.5876, "step": 490000 }, { "epoch": 274.2, "eval_loss": 0.5970821976661682, "eval_runtime": 76.2171, "eval_samples_per_second": 101.447, "eval_steps_per_second": 12.687, "step": 490000 }, { "epoch": 279.8, "learning_rate": 1e-05, "loss": 0.5863, "step": 500000 }, { "epoch": 279.8, "eval_loss": 0.5982722640037537, "eval_runtime": 76.3256, "eval_samples_per_second": 101.303, "eval_steps_per_second": 12.669, "step": 500000 }, { "epoch": 285.39, "learning_rate": 1e-05, "loss": 0.585, "step": 510000 }, { "epoch": 285.39, "eval_loss": 0.5990148782730103, "eval_runtime": 76.2053, "eval_samples_per_second": 101.463, "eval_steps_per_second": 12.689, "step": 510000 }, { "epoch": 290.99, "learning_rate": 1e-05, "loss": 0.583, "step": 520000 }, { "epoch": 290.99, "eval_loss": 0.5960124135017395, "eval_runtime": 76.218, "eval_samples_per_second": 101.446, "eval_steps_per_second": 12.687, "step": 520000 }, { "epoch": 296.59, "learning_rate": 1e-05, "loss": 0.5822, "step": 530000 }, { "epoch": 296.59, "eval_loss": 0.593532145023346, "eval_runtime": 76.3226, "eval_samples_per_second": 101.307, "eval_steps_per_second": 12.67, "step": 530000 }, { "epoch": 302.18, "learning_rate": 1e-05, "loss": 0.5808, "step": 540000 }, { "epoch": 302.18, "eval_loss": 0.596666693687439, "eval_runtime": 76.1588, "eval_samples_per_second": 101.525, "eval_steps_per_second": 12.697, "step": 540000 }, { "epoch": 307.78, "learning_rate": 1e-05, "loss": 0.5794, "step": 550000 }, { "epoch": 307.78, "eval_loss": 0.5946430563926697, "eval_runtime": 76.3226, "eval_samples_per_second": 101.307, "eval_steps_per_second": 12.67, "step": 550000 }, { "epoch": 313.37, "learning_rate": 1e-05, "loss": 0.578, "step": 560000 }, { "epoch": 313.37, "eval_loss": 0.5950666666030884, "eval_runtime": 76.243, "eval_samples_per_second": 101.413, "eval_steps_per_second": 12.683, "step": 560000 }, { "epoch": 318.97, "learning_rate": 1e-05, "loss": 0.5766, "step": 570000 }, { "epoch": 318.97, "eval_loss": 0.5932120680809021, "eval_runtime": 76.0598, "eval_samples_per_second": 101.657, "eval_steps_per_second": 12.714, "step": 570000 }, { "epoch": 324.57, "learning_rate": 1e-05, "loss": 0.5752, "step": 580000 }, { "epoch": 324.57, "eval_loss": 0.5916844606399536, "eval_runtime": 76.215, "eval_samples_per_second": 101.45, "eval_steps_per_second": 12.688, "step": 580000 }, { "epoch": 330.16, "learning_rate": 1e-05, "loss": 0.5739, "step": 590000 }, { "epoch": 330.16, "eval_loss": 0.592149019241333, "eval_runtime": 76.1575, "eval_samples_per_second": 101.526, "eval_steps_per_second": 12.697, "step": 590000 }, { "epoch": 335.76, "learning_rate": 1e-05, "loss": 0.5726, "step": 600000 }, { "epoch": 335.76, "eval_loss": 0.5907247066497803, "eval_runtime": 76.2114, "eval_samples_per_second": 101.455, "eval_steps_per_second": 12.688, "step": 600000 }, { "epoch": 341.35, "learning_rate": 1e-05, "loss": 0.5714, "step": 610000 }, { "epoch": 341.35, "eval_loss": 0.5907928347587585, "eval_runtime": 76.207, "eval_samples_per_second": 101.461, "eval_steps_per_second": 12.689, "step": 610000 }, { "epoch": 346.95, "learning_rate": 1e-05, "loss": 0.5702, "step": 620000 }, { "epoch": 346.95, "eval_loss": 0.5909689664840698, "eval_runtime": 76.3919, "eval_samples_per_second": 101.215, "eval_steps_per_second": 12.658, "step": 620000 }, { "epoch": 352.55, "learning_rate": 1e-05, "loss": 0.5686, "step": 630000 }, { "epoch": 352.55, "eval_loss": 0.5894390940666199, "eval_runtime": 76.3494, "eval_samples_per_second": 101.271, "eval_steps_per_second": 12.665, "step": 630000 }, { "epoch": 358.14, "learning_rate": 1e-05, "loss": 0.5674, "step": 640000 }, { "epoch": 358.14, "eval_loss": 0.5915200114250183, "eval_runtime": 76.5727, "eval_samples_per_second": 100.976, "eval_steps_per_second": 12.629, "step": 640000 }, { "epoch": 363.74, "learning_rate": 1e-05, "loss": 0.5664, "step": 650000 }, { "epoch": 363.74, "eval_loss": 0.5875544548034668, "eval_runtime": 76.0536, "eval_samples_per_second": 101.665, "eval_steps_per_second": 12.715, "step": 650000 }, { "epoch": 369.33, "learning_rate": 1e-05, "loss": 0.565, "step": 660000 }, { "epoch": 369.33, "eval_loss": 0.5878584980964661, "eval_runtime": 76.1299, "eval_samples_per_second": 101.563, "eval_steps_per_second": 12.702, "step": 660000 }, { "epoch": 374.93, "learning_rate": 1e-05, "loss": 0.5636, "step": 670000 }, { "epoch": 374.93, "eval_loss": 0.5897438526153564, "eval_runtime": 76.3557, "eval_samples_per_second": 101.263, "eval_steps_per_second": 12.664, "step": 670000 }, { "epoch": 380.53, "learning_rate": 1e-05, "loss": 0.5625, "step": 680000 }, { "epoch": 380.53, "eval_loss": 0.5888833999633789, "eval_runtime": 76.7072, "eval_samples_per_second": 100.799, "eval_steps_per_second": 12.606, "step": 680000 }, { "epoch": 386.12, "learning_rate": 1e-05, "loss": 0.5609, "step": 690000 }, { "epoch": 386.12, "eval_loss": 0.5903308987617493, "eval_runtime": 76.5139, "eval_samples_per_second": 101.053, "eval_steps_per_second": 12.638, "step": 690000 }, { "epoch": 391.72, "learning_rate": 1e-05, "loss": 0.5594, "step": 700000 }, { "epoch": 391.72, "eval_loss": 0.5877216458320618, "eval_runtime": 76.4574, "eval_samples_per_second": 101.128, "eval_steps_per_second": 12.648, "step": 700000 }, { "epoch": 397.31, "learning_rate": 1e-05, "loss": 0.5584, "step": 710000 }, { "epoch": 397.31, "eval_loss": 0.5875140428543091, "eval_runtime": 76.1634, "eval_samples_per_second": 101.519, "eval_steps_per_second": 12.696, "step": 710000 }, { "epoch": 402.91, "learning_rate": 1e-05, "loss": 0.5573, "step": 720000 }, { "epoch": 402.91, "eval_loss": 0.5887530446052551, "eval_runtime": 76.2807, "eval_samples_per_second": 101.362, "eval_steps_per_second": 12.677, "step": 720000 }, { "epoch": 408.51, "learning_rate": 1e-05, "loss": 0.5561, "step": 730000 }, { "epoch": 408.51, "eval_loss": 0.5863147974014282, "eval_runtime": 76.2255, "eval_samples_per_second": 101.436, "eval_steps_per_second": 12.686, "step": 730000 } ], "max_steps": 1000000, "num_train_epochs": 560, "total_flos": 3.197989282913906e+21, ======= "eval_loss": 0.6199995875358582, "eval_runtime": 76.4051, "eval_samples_per_second": 101.197, "eval_steps_per_second": 12.656, "step": 200000 }, { "epoch": 111.92, "step": 200000, "total_flos": 1.0512018951481177e+21, "train_loss": 0.5566074145507812, "train_runtime": 84698.4102, "train_samples_per_second": 37.781, "train_steps_per_second": 2.361 } ], "max_steps": 200000, "num_train_epochs": 112, "total_flos": 1.0512018951481177e+21, >>>>>>> f31c512c34f45b1f3e7b799ebda1f4af417ffe1b "trial_name": null, "trial_params": null }