{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.28, "eval_steps": 500, "global_step": 140, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 442.6666666666667, "completions/mean_terminated_length": 482.90909090909093, "completions/min_length": 0.0, "completions/min_terminated_length": 212.0, "epoch": 0.002, "format_failures": 0.0, "grad_norm": 0.3274489641189575, "kl": 0.0, "learning_rate": 0.0, "loss": 0.048, "num_tokens": 21804.0, "reward": 0.26185137033462524, "reward_std": 0.28920137882232666, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 136.5, "completions/mean_terminated_length": 148.9090909090909, "completions/min_length": 0.0, "completions/min_terminated_length": 60.0, "epoch": 0.004, "format_failures": 0.0, "grad_norm": 1.2693145275115967, "kl": 0.0, "learning_rate": 1e-06, "loss": 0.0962, "num_tokens": 42324.0, "reward": 0.38461539149284363, "reward_std": 0.3770364224910736, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 217.83333333333334, "completions/mean_terminated_length": 237.63636363636363, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.006, "format_failures": 0.0, "grad_norm": 0.3044165074825287, "kl": 0.19029825925827026, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 58980.0, "reward": 0.0, "reward_std": 0.0, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 321.0833333333333, "completions/mean_terminated_length": 350.27272727272725, "completions/min_length": 0.0, "completions/min_terminated_length": 103.0, "epoch": 0.008, "format_failures": 1.0, "grad_norm": 0.3372040390968323, "kl": 0.029289670288562775, "learning_rate": 1e-06, "loss": 0.1107, "num_tokens": 81756.0, "reward": 0.23689448833465576, "reward_std": 0.2267814427614212, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 119.08333333333333, "completions/mean_terminated_length": 129.9090909090909, "completions/min_length": 0.0, "completions/min_terminated_length": 66.0, "epoch": 0.01, "format_failures": 0.0, "grad_norm": 10.779764175415039, "kl": 3.1303787231445312, "learning_rate": 1e-06, "loss": 0.0311, "num_tokens": 96360.0, "reward": 0.1666666716337204, "reward_std": 0.30772873759269714, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 420.6666666666667, "completions/mean_terminated_length": 458.90909090909093, "completions/min_length": 0.0, "completions/min_terminated_length": 329.0, "epoch": 0.012, "format_failures": 1.0, "grad_norm": 0.2519327402114868, "kl": 0.016291129169985652, "learning_rate": 1e-06, "loss": 0.0559, "num_tokens": 119712.0, "reward": 0.34878918528556824, "reward_std": 0.2739146649837494, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 67.33333333333333, "completions/mean_terminated_length": 73.45454545454545, "completions/min_length": 0.0, "completions/min_terminated_length": 36.0, "epoch": 0.014, "format_failures": 0.0, "grad_norm": 2531.101806640625, "kl": 562.2636108398438, "learning_rate": 1e-06, "loss": 5.4405, "num_tokens": 128772.0, "reward": 0.0, "reward_std": 0.0, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 186.41666666666666, "completions/mean_terminated_length": 203.36363636363637, "completions/min_length": 0.0, "completions/min_terminated_length": 85.0, "epoch": 0.016, "format_failures": 0.0, "grad_norm": 0.7023671865463257, "kl": 0.0004708967899205163, "learning_rate": 1e-06, "loss": -0.1143, "num_tokens": 164100.0, "reward": 0.06388889253139496, "reward_std": 0.1274919956922531, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 253.41666666666666, "completions/mean_terminated_length": 276.45454545454544, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.018, "format_failures": 0.0, "grad_norm": 1.1911135911941528, "kl": 0.0012580148177221417, "learning_rate": 1e-06, "loss": -0.3277, "num_tokens": 197808.0, "reward": 0.1118159219622612, "reward_std": 0.2614404261112213, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 64.83333333333333, "completions/mean_terminated_length": 70.72727272727273, "completions/min_length": 0.0, "completions/min_terminated_length": 35.0, "epoch": 0.02, "format_failures": 0.0, "grad_norm": 1.324984073638916, "kl": 0.2648707218468189, "learning_rate": 1e-06, "loss": -0.0221, "num_tokens": 207000.0, "reward": 0.01666666753590107, "reward_std": 0.057735029608011246, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 126.33333333333333, "completions/mean_terminated_length": 137.8181818181818, "completions/min_length": 0.0, "completions/min_terminated_length": 62.0, "epoch": 0.022, "format_failures": 0.0, "grad_norm": 0.5873882174491882, "kl": 0.017587594222277403, "learning_rate": 1e-06, "loss": 0.0197, "num_tokens": 221808.0, "reward": 0.1805555671453476, "reward_std": 0.3134874999523163, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 2049.0, "completions/max_terminated_length": 2049.0, "completions/mean_length": 541.25, "completions/mean_terminated_length": 649.5, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.024, "format_failures": 0.0, "grad_norm": 0.48546102643013, "kl": 0.002345994464121759, "learning_rate": 1e-06, "loss": 0.0336, "num_tokens": 255132.0, "reward": 0.4682539701461792, "reward_std": 0.4320843815803528, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 29.666666666666668, "completions/mean_terminated_length": 32.36363636363637, "completions/min_length": 0.0, "completions/min_terminated_length": 22.0, "epoch": 0.026, "format_failures": 0.0, "grad_norm": 0.186175137758255, "kl": 0.041642000898718834, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 265092.0, "reward": 0.0, "reward_std": 0.0, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 381.6666666666667, "completions/mean_terminated_length": 416.3636363636364, "completions/min_length": 0.0, "completions/min_terminated_length": 188.0, "epoch": 0.028, "format_failures": 0.0, "grad_norm": 0.20345070958137512, "kl": 0.009796573780477047, "learning_rate": 1e-06, "loss": 0.0257, "num_tokens": 294096.0, "reward": 0.29761505126953125, "reward_std": 0.16453009843826294, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 1034.0, "completions/max_terminated_length": 1034.0, "completions/mean_length": 332.25, "completions/mean_terminated_length": 362.45454545454544, "completions/min_length": 0.0, "completions/min_terminated_length": 125.0, "epoch": 0.03, "format_failures": 1.0, "grad_norm": 0.5157941579818726, "kl": 0.004433898604474962, "learning_rate": 1e-06, "loss": -0.0103, "num_tokens": 325368.0, "reward": 0.2917824387550354, "reward_std": 0.3325340151786804, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 150.16666666666666, "completions/mean_terminated_length": 163.8181818181818, "completions/min_length": 0.0, "completions/min_terminated_length": 30.0, "epoch": 0.032, "format_failures": 0.0, "grad_norm": 0.05657627806067467, "kl": 0.0326845021918416, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 341196.0, "reward": 0.0, "reward_std": 0.0, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 228.41666666666666, "completions/mean_terminated_length": 249.1818181818182, "completions/min_length": 0.0, "completions/min_terminated_length": 26.0, "epoch": 0.034, "format_failures": 0.0, "grad_norm": 1.8653935194015503, "kl": 0.8598212422803044, "learning_rate": 1e-06, "loss": 0.014, "num_tokens": 354228.0, "reward": 0.01666666753590107, "reward_std": 0.05773502588272095, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 48.333333333333336, "completions/mean_terminated_length": 52.72727272727273, "completions/min_length": 0.0, "completions/min_terminated_length": 25.0, "epoch": 0.036, "format_failures": 1.0, "grad_norm": 0.018069056794047356, "kl": 0.023271435871720314, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 381468.0, "reward": 0.0, "reward_std": 0.0, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 228.91666666666666, "completions/mean_terminated_length": 249.72727272727272, "completions/min_length": 0.0, "completions/min_terminated_length": 16.0, "epoch": 0.038, "format_failures": 0.0, "grad_norm": 1.073132872581482, "kl": 0.003063492476940155, "learning_rate": 1e-06, "loss": 0.0334, "num_tokens": 415356.0, "reward": 0.1666666716337204, "reward_std": 0.38924944400787354, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 84.58333333333333, "completions/mean_terminated_length": 92.27272727272727, "completions/min_length": 0.0, "completions/min_terminated_length": 66.0, "epoch": 0.04, "format_failures": 0.0, "grad_norm": 1.1736811399459839, "kl": 0.018741012550890446, "learning_rate": 1e-06, "loss": 0.0962, "num_tokens": 442596.0, "reward": 0.1041666716337204, "reward_std": 0.22508415579795837, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 89.58333333333333, "completions/mean_terminated_length": 97.72727272727273, "completions/min_length": 0.0, "completions/min_terminated_length": 69.0, "epoch": 0.042, "format_failures": 0.0, "grad_norm": 0.960914671421051, "kl": 0.03209133446216583, "learning_rate": 1e-06, "loss": -0.0169, "num_tokens": 453252.0, "reward": 0.2708333432674408, "reward_std": 0.4454101026058197, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 124.33333333333333, "completions/mean_terminated_length": 135.63636363636363, "completions/min_length": 0.0, "completions/min_terminated_length": 74.0, "epoch": 0.044, "format_failures": 0.0, "grad_norm": 1.0618880987167358, "kl": 0.03219995368272066, "learning_rate": 1e-06, "loss": -0.3593, "num_tokens": 481656.0, "reward": 0.09444444626569748, "reward_std": 0.17164288461208344, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 299.5, "completions/mean_terminated_length": 326.72727272727275, "completions/min_length": 0.0, "completions/min_terminated_length": 148.0, "epoch": 0.046, "format_failures": 0.0, "grad_norm": 0.3598278760910034, "kl": 0.031054741702973843, "learning_rate": 1e-06, "loss": 0.0131, "num_tokens": 505704.0, "reward": 0.4847402572631836, "reward_std": 0.25003767013549805, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 297.5, "completions/mean_terminated_length": 324.54545454545456, "completions/min_length": 0.0, "completions/min_terminated_length": 211.0, "epoch": 0.048, "format_failures": 0.0, "grad_norm": 0.27960336208343506, "kl": 0.04240706283599138, "learning_rate": 1e-06, "loss": -0.0398, "num_tokens": 523500.0, "reward": 0.2615740895271301, "reward_std": 0.219794362783432, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 179.16666666666666, "completions/mean_terminated_length": 195.45454545454547, "completions/min_length": 0.0, "completions/min_terminated_length": 114.0, "epoch": 0.05, "format_failures": 0.0, "grad_norm": 1.2980320453643799, "kl": 0.0048073166981339455, "learning_rate": 1e-06, "loss": -0.3887, "num_tokens": 555300.0, "reward": 0.5003399848937988, "reward_std": 0.39150455594062805, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 315.9166666666667, "completions/mean_terminated_length": 344.6363636363636, "completions/min_length": 0.0, "completions/min_terminated_length": 239.0, "epoch": 0.052, "format_failures": 0.0, "grad_norm": 0.2552706003189087, "kl": 0.027493927627801895, "learning_rate": 1e-06, "loss": 0.0567, "num_tokens": 576000.0, "reward": 0.43729767203330994, "reward_std": 0.18975813686847687, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 72.91666666666667, "completions/mean_terminated_length": 79.54545454545455, "completions/min_length": 0.0, "completions/min_terminated_length": 65.0, "epoch": 0.054, "format_failures": 0.0, "grad_norm": 1.1299240589141846, "kl": 0.0332061443477869, "learning_rate": 1e-06, "loss": -0.057, "num_tokens": 584712.0, "reward": 0.33095240592956543, "reward_std": 0.444376677274704, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 91.16666666666667, "completions/mean_terminated_length": 99.45454545454545, "completions/min_length": 0.0, "completions/min_terminated_length": 56.0, "epoch": 0.056, "format_failures": 0.0, "grad_norm": 0.044371046125888824, "kl": 0.03765446413308382, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 598032.0, "reward": 0.0, "reward_std": 0.0, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 304.5, "completions/mean_terminated_length": 332.1818181818182, "completions/min_length": 0.0, "completions/min_terminated_length": 212.0, "epoch": 0.058, "format_failures": 0.0, "grad_norm": 0.5104940533638, "kl": 0.03451683558523655, "learning_rate": 1e-06, "loss": -0.0274, "num_tokens": 615204.0, "reward": 0.4068452715873718, "reward_std": 0.37161099910736084, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 162.91666666666666, "completions/mean_terminated_length": 177.72727272727272, "completions/min_length": 0.0, "completions/min_terminated_length": 59.0, "epoch": 0.06, "format_failures": 0.0, "grad_norm": 1.2335582971572876, "kl": 0.007039119256660342, "learning_rate": 1e-06, "loss": 0.2673, "num_tokens": 647892.0, "reward": 0.3291666805744171, "reward_std": 0.4266456663608551, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 332.5, "completions/mean_terminated_length": 362.72727272727275, "completions/min_length": 0.0, "completions/min_terminated_length": 222.0, "epoch": 0.062, "format_failures": 2.0, "grad_norm": 0.3000166416168213, "kl": 0.03664882015436888, "learning_rate": 1e-06, "loss": 0.0306, "num_tokens": 670860.0, "reward": 0.6458902955055237, "reward_std": 0.26038500666618347, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 218.66666666666666, "completions/mean_terminated_length": 238.54545454545453, "completions/min_length": 0.0, "completions/min_terminated_length": 180.0, "epoch": 0.064, "format_failures": 0.0, "grad_norm": 0.37272748351097107, "kl": 0.07015270553529263, "learning_rate": 1e-06, "loss": 0.0169, "num_tokens": 682212.0, "reward": 0.43658646941185, "reward_std": 0.24143192172050476, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 58.09090909090909, "completions/min_length": 0.0, "completions/min_terminated_length": 47.0, "epoch": 0.066, "format_failures": 0.0, "grad_norm": 1.1589769124984741, "kl": 0.03555137664079666, "learning_rate": 1e-06, "loss": -0.0651, "num_tokens": 692040.0, "reward": 0.11666666716337204, "reward_std": 0.301008403301239, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 336.0, "completions/mean_terminated_length": 366.54545454545456, "completions/min_length": 0.0, "completions/min_terminated_length": 292.0, "epoch": 0.068, "format_failures": 0.0, "grad_norm": 0.42152470350265503, "kl": 0.19683832861483097, "learning_rate": 1e-06, "loss": -0.0173, "num_tokens": 704484.0, "reward": 0.5136784911155701, "reward_std": 0.38917282223701477, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 93.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 59.166666666666664, "completions/mean_terminated_length": 64.54545454545455, "completions/min_length": 0.0, "completions/min_terminated_length": 40.0, "epoch": 0.07, "format_failures": 0.0, "grad_norm": 1.729435682296753, "kl": 0.055947478860616684, "learning_rate": 1e-06, "loss": 0.0028, "num_tokens": 710520.0, "reward": 0.5611110925674438, "reward_std": 0.45256468653678894, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 91.91666666666667, "completions/mean_terminated_length": 100.27272727272727, "completions/min_length": 0.0, "completions/min_terminated_length": 59.0, "epoch": 0.072, "format_failures": 0.0, "grad_norm": 0.7297618389129639, "kl": 0.28226011246442795, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 720588.0, "reward": 0.0, "reward_std": 0.0, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 184.66666666666666, "completions/mean_terminated_length": 201.45454545454547, "completions/min_length": 0.0, "completions/min_terminated_length": 152.0, "epoch": 0.074, "format_failures": 0.0, "grad_norm": 0.1786535382270813, "kl": 0.05143214017152786, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 731112.0, "reward": 0.5931217074394226, "reward_std": 0.15197694301605225, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 61.416666666666664, "completions/mean_terminated_length": 67.0, "completions/min_length": 0.0, "completions/min_terminated_length": 36.0, "epoch": 0.076, "format_failures": 1.0, "grad_norm": 2.560441732406616, "kl": 0.061069367453455925, "learning_rate": 1e-06, "loss": 0.1107, "num_tokens": 758340.0, "reward": 0.0833333358168602, "reward_std": 0.28867512941360474, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 2050.0, "completions/max_terminated_length": 2050.0, "completions/mean_length": 715.0, "completions/mean_terminated_length": 780.0, "completions/min_length": 0.0, "completions/min_terminated_length": 357.0, "epoch": 0.078, "format_failures": 0.0, "grad_norm": 0.41932860016822815, "kl": 0.01548363408073783, "learning_rate": 1e-06, "loss": 0.0106, "num_tokens": 790968.0, "reward": 0.25740742683410645, "reward_std": 0.32573264837265015, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 1162.0, "completions/max_terminated_length": 1162.0, "completions/mean_length": 471.75, "completions/mean_terminated_length": 514.6363636363636, "completions/min_length": 0.0, "completions/min_terminated_length": 113.0, "epoch": 0.08, "format_failures": 0.0, "grad_norm": 0.8145480155944824, "kl": 0.016389482654631138, "learning_rate": 1e-06, "loss": 0.154, "num_tokens": 829104.0, "reward": 0.43334314227104187, "reward_std": 0.3763042986392975, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 99.91666666666667, "completions/mean_terminated_length": 109.0, "completions/min_length": 0.0, "completions/min_terminated_length": 47.0, "epoch": 0.082, "format_failures": 0.0, "grad_norm": 18.232030868530273, "kl": 1.717683531343937, "learning_rate": 1e-06, "loss": 0.197, "num_tokens": 850716.0, "reward": 0.2430555671453476, "reward_std": 0.4042987823486328, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 77.33333333333333, "completions/mean_terminated_length": 84.36363636363636, "completions/min_length": 0.0, "completions/min_terminated_length": 63.0, "epoch": 0.084, "format_failures": 0.0, "grad_norm": 0.5794758796691895, "kl": 0.21323725581169128, "learning_rate": 1e-06, "loss": -0.0344, "num_tokens": 859644.0, "reward": 0.0476190522313118, "reward_std": 0.1649572253227234, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 136.66666666666666, "completions/mean_terminated_length": 149.0909090909091, "completions/min_length": 0.0, "completions/min_terminated_length": 57.0, "epoch": 0.086, "format_failures": 0.0, "grad_norm": 2.507535934448242, "kl": 0.2139158956706524, "learning_rate": 1e-06, "loss": -0.0282, "num_tokens": 871596.0, "reward": 0.3333333432674408, "reward_std": 0.4923659861087799, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 28.25, "completions/mean_terminated_length": 42.375, "completions/min_length": 0.0, "completions/min_terminated_length": 21.0, "epoch": 0.088, "format_failures": 0.0, "grad_norm": 0.33207282423973083, "kl": 0.035286733880639076, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 879828.0, "reward": 0.0, "reward_std": 0.0, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 117.83333333333333, "completions/mean_terminated_length": 128.54545454545453, "completions/min_length": 0.0, "completions/min_terminated_length": 68.0, "epoch": 0.09, "format_failures": 0.0, "grad_norm": 0.2761678099632263, "kl": 0.15724625438451767, "learning_rate": 1e-06, "loss": 0.0015, "num_tokens": 899448.0, "reward": 0.0, "reward_std": 0.0, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 105.16666666666667, "completions/mean_terminated_length": 114.72727272727273, "completions/min_length": 0.0, "completions/min_terminated_length": 26.0, "epoch": 0.092, "format_failures": 0.0, "grad_norm": 1.1471128463745117, "kl": 0.12899010255932808, "learning_rate": 1e-06, "loss": 0.0117, "num_tokens": 914760.0, "reward": 0.1666666716337204, "reward_std": 0.30151134729385376, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 233.66666666666666, "completions/mean_terminated_length": 254.9090909090909, "completions/min_length": 0.0, "completions/min_terminated_length": 153.0, "epoch": 0.094, "format_failures": 0.0, "grad_norm": 0.5467153191566467, "kl": 0.2796362675726414, "learning_rate": 1e-06, "loss": -0.0318, "num_tokens": 925212.0, "reward": 0.549458920955658, "reward_std": 0.3676450848579407, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 166.25, "completions/mean_terminated_length": 181.36363636363637, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.096, "format_failures": 0.0, "grad_norm": 0.78724205493927, "kl": 0.49516983330249786, "learning_rate": 1e-06, "loss": -0.0104, "num_tokens": 938424.0, "reward": 0.02083333395421505, "reward_std": 0.07216878235340118, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 106.08333333333333, "completions/mean_terminated_length": 115.72727272727273, "completions/min_length": 0.0, "completions/min_terminated_length": 29.0, "epoch": 0.098, "format_failures": 1.0, "grad_norm": 1.7356528043746948, "kl": 0.389555960893631, "learning_rate": 1e-06, "loss": -0.0599, "num_tokens": 950172.0, "reward": 0.1944444626569748, "reward_std": 0.38816672563552856, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 1127.0, "completions/max_terminated_length": 1127.0, "completions/mean_length": 186.58333333333334, "completions/mean_terminated_length": 223.9, "completions/min_length": 0.0, "completions/min_terminated_length": 49.0, "epoch": 0.1, "format_failures": 0.0, "grad_norm": 1.3811311721801758, "kl": 0.0656690001487732, "learning_rate": 1e-06, "loss": 0.949, "num_tokens": 981816.0, "reward": 0.5007641911506653, "reward_std": 0.4272591173648834, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 121.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 74.75, "completions/mean_terminated_length": 81.54545454545455, "completions/min_length": 0.0, "completions/min_terminated_length": 67.0, "epoch": 0.102, "format_failures": 0.0, "grad_norm": 3.630605697631836, "kl": 0.11415744014084339, "learning_rate": 1e-06, "loss": 0.1083, "num_tokens": 994800.0, "reward": 0.4722222685813904, "reward_std": 0.4596514403820038, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 292.9166666666667, "completions/mean_terminated_length": 319.54545454545456, "completions/min_length": 0.0, "completions/min_terminated_length": 230.0, "epoch": 0.104, "format_failures": 0.0, "grad_norm": 0.664616048336029, "kl": 0.024851050227880478, "learning_rate": 1e-06, "loss": -0.0988, "num_tokens": 1028352.0, "reward": 0.5121031999588013, "reward_std": 0.26174625754356384, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 267.4166666666667, "completions/mean_terminated_length": 291.72727272727275, "completions/min_length": 0.0, "completions/min_terminated_length": 158.0, "epoch": 0.106, "format_failures": 0.0, "grad_norm": 0.3362949788570404, "kl": 0.09099859930574894, "learning_rate": 1e-06, "loss": 0.0303, "num_tokens": 1053264.0, "reward": 0.0625, "reward_std": 0.21650634706020355, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 292.0833333333333, "completions/mean_terminated_length": 318.6363636363636, "completions/min_length": 0.0, "completions/min_terminated_length": 190.0, "epoch": 0.108, "format_failures": 0.0, "grad_norm": 0.17621153593063354, "kl": 0.03119577933102846, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 1068108.0, "reward": 0.4200083613395691, "reward_std": 0.194437637925148, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 88.75, "completions/mean_terminated_length": 96.81818181818181, "completions/min_length": 0.0, "completions/min_terminated_length": 69.0, "epoch": 0.11, "format_failures": 0.0, "grad_norm": 0.6367191672325134, "kl": 0.03671593498438597, "learning_rate": 1e-06, "loss": 0.0088, "num_tokens": 1079820.0, "reward": 0.19027778506278992, "reward_std": 0.15930061042308807, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 163.58333333333334, "completions/mean_terminated_length": 178.45454545454547, "completions/min_length": 0.0, "completions/min_terminated_length": 113.0, "epoch": 0.112, "format_failures": 0.0, "grad_norm": 2.1606733798980713, "kl": 0.20935122203081846, "learning_rate": 1e-06, "loss": -0.0277, "num_tokens": 1091832.0, "reward": 0.5777778029441833, "reward_std": 0.4515592157840729, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 288.4166666666667, "completions/mean_terminated_length": 314.6363636363636, "completions/min_length": 0.0, "completions/min_terminated_length": 169.0, "epoch": 0.114, "format_failures": 0.0, "grad_norm": 0.32393601536750793, "kl": 0.031358057633042336, "learning_rate": 1e-06, "loss": -0.044, "num_tokens": 1105608.0, "reward": 0.1666666716337204, "reward_std": 0.24984844028949738, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 96.0, "completions/max_terminated_length": 96.0, "completions/mean_length": 65.5, "completions/mean_terminated_length": 71.45454545454545, "completions/min_length": 0.0, "completions/min_terminated_length": 55.0, "epoch": 0.116, "format_failures": 0.0, "grad_norm": 0.021954922005534172, "kl": 0.018348069861531258, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 1113168.0, "reward": 0.0, "reward_std": 0.0, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 224.41666666666666, "completions/mean_terminated_length": 244.8181818181818, "completions/min_length": 0.0, "completions/min_terminated_length": 92.0, "epoch": 0.118, "format_failures": 0.0, "grad_norm": 1.1990734338760376, "kl": 0.3062889650464058, "learning_rate": 1e-06, "loss": 0.0431, "num_tokens": 1136832.0, "reward": 0.2395833432674408, "reward_std": 0.25259074568748474, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 238.0, "completions/mean_terminated_length": 259.6363636363636, "completions/min_length": 0.0, "completions/min_terminated_length": 80.0, "epoch": 0.12, "format_failures": 0.0, "grad_norm": 0.5170612931251526, "kl": 0.03292474150657654, "learning_rate": 1e-06, "loss": 0.0251, "num_tokens": 1150536.0, "reward": 0.39345240592956543, "reward_std": 0.3553503155708313, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 82.16666666666667, "completions/mean_terminated_length": 89.63636363636364, "completions/min_length": 0.0, "completions/min_terminated_length": 65.0, "epoch": 0.122, "format_failures": 0.0, "grad_norm": 1.1562092304229736, "kl": 0.023061166517436504, "learning_rate": 1e-06, "loss": 0.1452, "num_tokens": 1158984.0, "reward": 0.7333333492279053, "reward_std": 0.3639269173145294, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 296.1666666666667, "completions/mean_terminated_length": 323.09090909090907, "completions/min_length": 0.0, "completions/min_terminated_length": 201.0, "epoch": 0.124, "format_failures": 0.0, "grad_norm": 0.32044336199760437, "kl": 0.06375124305486679, "learning_rate": 1e-06, "loss": 0.0015, "num_tokens": 1173504.0, "reward": 0.43736547231674194, "reward_std": 0.25956276059150696, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 2051.0, "completions/max_terminated_length": 2051.0, "completions/mean_length": 586.4166666666666, "completions/mean_terminated_length": 639.7272727272727, "completions/min_length": 0.0, "completions/min_terminated_length": 38.0, "epoch": 0.126, "format_failures": 0.0, "grad_norm": 0.6462875008583069, "kl": 0.023477558977901936, "learning_rate": 1e-06, "loss": 0.0492, "num_tokens": 1206840.0, "reward": 0.501884937286377, "reward_std": 0.5706992149353027, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 150.66666666666666, "completions/mean_terminated_length": 164.36363636363637, "completions/min_length": 0.0, "completions/min_terminated_length": 97.0, "epoch": 0.128, "format_failures": 0.0, "grad_norm": 0.4827415347099304, "kl": 0.11513948068022728, "learning_rate": 1e-06, "loss": 0.2183, "num_tokens": 1230888.0, "reward": 0.3715476393699646, "reward_std": 0.17215265333652496, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 1340.0, "completions/max_terminated_length": 1340.0, "completions/mean_length": 277.5833333333333, "completions/mean_terminated_length": 302.8181818181818, "completions/min_length": 0.0, "completions/min_terminated_length": 20.0, "epoch": 0.13, "format_failures": 0.0, "grad_norm": 0.46889665722846985, "kl": 0.9275694619864225, "learning_rate": 1e-06, "loss": 0.2754, "num_tokens": 1262100.0, "reward": 0.3917522430419922, "reward_std": 0.2266404628753662, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 366.25, "completions/mean_terminated_length": 399.54545454545456, "completions/min_length": 0.0, "completions/min_terminated_length": 212.0, "epoch": 0.132, "format_failures": 1.0, "grad_norm": 0.30657899379730225, "kl": 0.16883518174290657, "learning_rate": 1e-06, "loss": 0.0155, "num_tokens": 1278012.0, "reward": 0.34761905670166016, "reward_std": 0.2757572531700134, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 300.9166666666667, "completions/mean_terminated_length": 361.1, "completions/min_length": 0.0, "completions/min_terminated_length": 224.0, "epoch": 0.134, "format_failures": 0.0, "grad_norm": 0.6152874231338501, "kl": 0.10999106336385012, "learning_rate": 1e-06, "loss": 0.3303, "num_tokens": 1308996.0, "reward": 0.32609128952026367, "reward_std": 0.23752012848854065, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 137.5, "completions/mean_terminated_length": 150.0, "completions/min_length": 0.0, "completions/min_terminated_length": 59.0, "epoch": 0.136, "format_failures": 0.0, "grad_norm": 1.7395364046096802, "kl": 0.7087040841579437, "learning_rate": 1e-06, "loss": -0.0121, "num_tokens": 1321020.0, "reward": 0.20873016119003296, "reward_std": 0.34043052792549133, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 129.83333333333334, "completions/mean_terminated_length": 141.63636363636363, "completions/min_length": 0.0, "completions/min_terminated_length": 28.0, "epoch": 0.138, "format_failures": 0.0, "grad_norm": 0.902642548084259, "kl": 0.7902000248432159, "learning_rate": 1e-06, "loss": 0.0035, "num_tokens": 1332492.0, "reward": 0.0877976268529892, "reward_std": 0.20928393304347992, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1172.0, "completions/max_terminated_length": 1172.0, "completions/mean_length": 333.1666666666667, "completions/mean_terminated_length": 444.22222222222223, "completions/min_length": 0.0, "completions/min_terminated_length": 133.0, "epoch": 0.14, "format_failures": 0.0, "grad_norm": 0.22367094457149506, "kl": 0.03544241935014725, "learning_rate": 1e-06, "loss": 0.0442, "num_tokens": 1363812.0, "reward": 0.22601282596588135, "reward_std": 0.1535530686378479, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 368.5833333333333, "completions/mean_terminated_length": 402.09090909090907, "completions/min_length": 0.0, "completions/min_terminated_length": 205.0, "epoch": 0.142, "format_failures": 0.0, "grad_norm": 0.25884878635406494, "kl": 0.0446395231410861, "learning_rate": 1e-06, "loss": 0.0091, "num_tokens": 1396788.0, "reward": 0.6545634865760803, "reward_std": 0.2292691022157669, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 127.75, "completions/mean_terminated_length": 139.36363636363637, "completions/min_length": 0.0, "completions/min_terminated_length": 62.0, "epoch": 0.144, "format_failures": 0.0, "grad_norm": 2.139310121536255, "kl": 0.2615228593349457, "learning_rate": 1e-06, "loss": 0.0935, "num_tokens": 1411512.0, "reward": 0.625, "reward_std": 0.4826536476612091, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 321.1666666666667, "completions/mean_terminated_length": 350.3636363636364, "completions/min_length": 0.0, "completions/min_terminated_length": 194.0, "epoch": 0.146, "format_failures": 0.0, "grad_norm": 0.7009347081184387, "kl": 0.13678913563489914, "learning_rate": 1e-06, "loss": 0.0771, "num_tokens": 1436532.0, "reward": 0.3439815044403076, "reward_std": 0.27971503138542175, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 253.08333333333334, "completions/mean_terminated_length": 276.09090909090907, "completions/min_length": 0.0, "completions/min_terminated_length": 271.0, "epoch": 0.148, "format_failures": 0.0, "grad_norm": 1.2899372577667236, "kl": 0.10085960477590561, "learning_rate": 1e-06, "loss": 0.3862, "num_tokens": 1471704.0, "reward": 0.7222222685813904, "reward_std": 0.4457052946090698, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 196.5, "completions/mean_terminated_length": 214.36363636363637, "completions/min_length": 0.0, "completions/min_terminated_length": 54.0, "epoch": 0.15, "format_failures": 0.0, "grad_norm": 0.4177331328392029, "kl": 0.026733385398983955, "learning_rate": 1e-06, "loss": 0.0579, "num_tokens": 1485468.0, "reward": 0.2735119163990021, "reward_std": 0.30911651253700256, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 200.41666666666666, "completions/mean_terminated_length": 218.63636363636363, "completions/min_length": 0.0, "completions/min_terminated_length": 62.0, "epoch": 0.152, "format_failures": 0.0, "grad_norm": 0.8074631094932556, "kl": 0.45791861414909363, "learning_rate": 1e-06, "loss": -0.0476, "num_tokens": 1500636.0, "reward": 0.17129629850387573, "reward_std": 0.19502559304237366, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 144.08333333333334, "completions/mean_terminated_length": 157.1818181818182, "completions/min_length": 0.0, "completions/min_terminated_length": 21.0, "epoch": 0.154, "format_failures": 0.0, "grad_norm": 1.8004605770111084, "kl": 0.32159996032714844, "learning_rate": 1e-06, "loss": -0.0603, "num_tokens": 1512264.0, "reward": 0.5055555701255798, "reward_std": 0.29963788390159607, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 312.1666666666667, "completions/mean_terminated_length": 340.54545454545456, "completions/min_length": 0.0, "completions/min_terminated_length": 170.0, "epoch": 0.156, "format_failures": 0.0, "grad_norm": 0.3055727481842041, "kl": 0.03414521459490061, "learning_rate": 1e-06, "loss": -0.0067, "num_tokens": 1526292.0, "reward": 0.5897321701049805, "reward_std": 0.2986750900745392, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 167.5, "completions/mean_terminated_length": 182.72727272727272, "completions/min_length": 0.0, "completions/min_terminated_length": 165.0, "epoch": 0.158, "format_failures": 0.0, "grad_norm": 2.3401753902435303, "kl": 0.03888106718659401, "learning_rate": 1e-06, "loss": -0.0218, "num_tokens": 1540416.0, "reward": 0.6666666865348816, "reward_std": 0.4923659861087799, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 210.91666666666666, "completions/mean_terminated_length": 253.1, "completions/min_length": 0.0, "completions/min_terminated_length": 137.0, "epoch": 0.16, "format_failures": 0.0, "grad_norm": 28.73111343383789, "kl": 15.663371562957764, "learning_rate": 1e-06, "loss": 0.0445, "num_tokens": 1553580.0, "reward": 0.4305555820465088, "reward_std": 0.4738534092903137, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 43.166666666666664, "completions/mean_terminated_length": 47.09090909090909, "completions/min_length": 0.0, "completions/min_terminated_length": 32.0, "epoch": 0.162, "format_failures": 0.0, "grad_norm": 13.234149932861328, "kl": 2.6492202281951904, "learning_rate": 1e-06, "loss": -0.0385, "num_tokens": 1560816.0, "reward": 0.27916666865348816, "reward_std": 0.42504456639289856, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 189.66666666666666, "completions/mean_terminated_length": 206.9090909090909, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.164, "format_failures": 0.0, "grad_norm": 1.0555896759033203, "kl": 0.060676803812384605, "learning_rate": 1e-06, "loss": -0.0432, "num_tokens": 1573524.0, "reward": 0.39722225069999695, "reward_std": 0.2684729993343353, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 482.1666666666667, "completions/mean_terminated_length": 526.0, "completions/min_length": 0.0, "completions/min_terminated_length": 479.0, "epoch": 0.166, "format_failures": 0.0, "grad_norm": 0.27017322182655334, "kl": 0.013310576789081097, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 1595796.0, "reward": 0.8000000715255737, "reward_std": 0.39080336689949036, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 144.91666666666666, "completions/mean_terminated_length": 158.0909090909091, "completions/min_length": 0.0, "completions/min_terminated_length": 83.0, "epoch": 0.168, "format_failures": 0.0, "grad_norm": 1.0021555423736572, "kl": 0.2212899848818779, "learning_rate": 1e-06, "loss": 0.0304, "num_tokens": 1606284.0, "reward": 0.2957010865211487, "reward_std": 0.2737172842025757, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 2050.0, "completions/max_terminated_length": 2050.0, "completions/mean_length": 510.0833333333333, "completions/mean_terminated_length": 556.4545454545455, "completions/min_length": 0.0, "completions/min_terminated_length": 216.0, "epoch": 0.17, "format_failures": 0.0, "grad_norm": 0.3675689399242401, "kl": 0.2206931747496128, "learning_rate": 1e-06, "loss": 0.1278, "num_tokens": 1639152.0, "reward": 0.43888890743255615, "reward_std": 0.2596941888332367, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 152.25, "completions/mean_terminated_length": 166.0909090909091, "completions/min_length": 0.0, "completions/min_terminated_length": 117.0, "epoch": 0.172, "format_failures": 0.0, "grad_norm": 2.8949317932128906, "kl": 1.413679599761963, "learning_rate": 1e-06, "loss": 0.0356, "num_tokens": 1652364.0, "reward": 0.4761905074119568, "reward_std": 0.5035434365272522, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 152.91666666666666, "completions/mean_terminated_length": 166.8181818181818, "completions/min_length": 0.0, "completions/min_terminated_length": 58.0, "epoch": 0.174, "format_failures": 0.0, "grad_norm": 1.7609695196151733, "kl": 0.07055489160120487, "learning_rate": 1e-06, "loss": 0.3366, "num_tokens": 1685136.0, "reward": 0.33750003576278687, "reward_std": 0.43647608160972595, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 135.25, "completions/mean_terminated_length": 147.54545454545453, "completions/min_length": 0.0, "completions/min_terminated_length": 77.0, "epoch": 0.176, "format_failures": 0.0, "grad_norm": 0.6215497255325317, "kl": 0.08650689758360386, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 1693764.0, "reward": 0.5745911598205566, "reward_std": 0.1768045872449875, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 73.25, "completions/mean_terminated_length": 79.9090909090909, "completions/min_length": 0.0, "completions/min_terminated_length": 61.0, "epoch": 0.178, "format_failures": 1.0, "grad_norm": 0.8421996235847473, "kl": 0.016213122755289078, "learning_rate": 1e-06, "loss": 0.0149, "num_tokens": 1707588.0, "reward": 0.06666667014360428, "reward_std": 0.1775250881910324, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 178.16666666666666, "completions/mean_terminated_length": 194.36363636363637, "completions/min_length": 0.0, "completions/min_terminated_length": 101.0, "epoch": 0.18, "format_failures": 0.0, "grad_norm": 0.4202212691307068, "kl": 0.3119240030646324, "learning_rate": 1e-06, "loss": 0.0093, "num_tokens": 1716792.0, "reward": 0.6381944417953491, "reward_std": 0.22775352001190186, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 165.58333333333334, "completions/mean_terminated_length": 180.63636363636363, "completions/min_length": 0.0, "completions/min_terminated_length": 56.0, "epoch": 0.182, "format_failures": 0.0, "grad_norm": 3.5526509284973145, "kl": 0.04295740742236376, "learning_rate": 1e-06, "loss": -0.007, "num_tokens": 1735188.0, "reward": 0.6666666865348816, "reward_std": 0.4923659861087799, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 296.3333333333333, "completions/mean_terminated_length": 323.27272727272725, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.184, "format_failures": 0.0, "grad_norm": 0.7098760008811951, "kl": 0.14585042744874954, "learning_rate": 1e-06, "loss": -0.052, "num_tokens": 1748808.0, "reward": 0.4570105969905853, "reward_std": 0.29787296056747437, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 325.1666666666667, "completions/mean_terminated_length": 354.72727272727275, "completions/min_length": 0.0, "completions/min_terminated_length": 233.0, "epoch": 0.186, "format_failures": 0.0, "grad_norm": 4.00807523727417, "kl": 2.2327868938446045, "learning_rate": 1e-06, "loss": 0.0328, "num_tokens": 1763196.0, "reward": 0.37762749195098877, "reward_std": 0.2510078251361847, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 78.66666666666667, "completions/mean_terminated_length": 85.81818181818181, "completions/min_length": 0.0, "completions/min_terminated_length": 60.0, "epoch": 0.188, "format_failures": 0.0, "grad_norm": 4.166850566864014, "kl": 0.4828091114759445, "learning_rate": 1e-06, "loss": -0.0043, "num_tokens": 1775700.0, "reward": 0.41428571939468384, "reward_std": 0.20157082378864288, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 163.0, "completions/mean_terminated_length": 177.8181818181818, "completions/min_length": 0.0, "completions/min_terminated_length": 86.0, "epoch": 0.19, "format_failures": 0.0, "grad_norm": 2.0013251304626465, "kl": 0.3356290655210614, "learning_rate": 1e-06, "loss": -0.0532, "num_tokens": 1790064.0, "reward": 0.4275793731212616, "reward_std": 0.3848039209842682, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 128.33333333333334, "completions/mean_terminated_length": 140.0, "completions/min_length": 0.0, "completions/min_terminated_length": 134.0, "epoch": 0.192, "format_failures": 0.0, "grad_norm": 6.922305107116699, "kl": 3.5449295742437243, "learning_rate": 1e-06, "loss": 0.0385, "num_tokens": 1803036.0, "reward": 0.6979166865348816, "reward_std": 0.31738603115081787, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 94.33333333333333, "completions/mean_terminated_length": 102.9090909090909, "completions/min_length": 0.0, "completions/min_terminated_length": 72.0, "epoch": 0.194, "format_failures": 0.0, "grad_norm": 1.4514728784561157, "kl": 0.1412234902381897, "learning_rate": 1e-06, "loss": 0.3157, "num_tokens": 1816092.0, "reward": 0.8380953073501587, "reward_std": 0.30834609270095825, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 43.5, "completions/mean_terminated_length": 47.45454545454545, "completions/min_length": 0.0, "completions/min_terminated_length": 31.0, "epoch": 0.196, "format_failures": 0.0, "grad_norm": 2.004136085510254, "kl": 0.6110408902168274, "learning_rate": 1e-06, "loss": 0.0095, "num_tokens": 1827024.0, "reward": 0.0, "reward_std": 0.0, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 212.08333333333334, "completions/mean_terminated_length": 231.36363636363637, "completions/min_length": 0.0, "completions/min_terminated_length": 102.0, "epoch": 0.198, "format_failures": 0.0, "grad_norm": 0.8370314240455627, "kl": 0.09233395755290985, "learning_rate": 1e-06, "loss": 0.1438, "num_tokens": 1860576.0, "reward": 0.2782828211784363, "reward_std": 0.2644941210746765, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 163.25, "completions/mean_terminated_length": 178.0909090909091, "completions/min_length": 0.0, "completions/min_terminated_length": 101.0, "epoch": 0.2, "format_failures": 0.0, "grad_norm": 1.565374732017517, "kl": 0.391565203666687, "learning_rate": 1e-06, "loss": -0.0497, "num_tokens": 1872996.0, "reward": 0.5944445133209229, "reward_std": 0.47775429487228394, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 150.16666666666666, "completions/mean_terminated_length": 163.8181818181818, "completions/min_length": 0.0, "completions/min_terminated_length": 94.0, "epoch": 0.202, "format_failures": 0.0, "grad_norm": 1.6569881439208984, "kl": 0.24375841114670038, "learning_rate": 1e-06, "loss": 0.0387, "num_tokens": 1892856.0, "reward": 0.3499999940395355, "reward_std": 0.36666667461395264, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 107.66666666666667, "completions/mean_terminated_length": 117.45454545454545, "completions/min_length": 0.0, "completions/min_terminated_length": 93.0, "epoch": 0.204, "format_failures": 0.0, "grad_norm": 0.9490823745727539, "kl": 0.010788497282192111, "learning_rate": 1e-06, "loss": 0.0193, "num_tokens": 1903992.0, "reward": 0.7714947462081909, "reward_std": 0.2890874743461609, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 66.0, "completions/mean_terminated_length": 72.0, "completions/min_length": 0.0, "completions/min_terminated_length": 24.0, "epoch": 0.206, "format_failures": 0.0, "grad_norm": 1.482935905456543, "kl": 0.03114949818700552, "learning_rate": 1e-06, "loss": -0.0754, "num_tokens": 1913640.0, "reward": 0.3333333432674408, "reward_std": 0.32566946744918823, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 260.5833333333333, "completions/mean_terminated_length": 284.27272727272725, "completions/min_length": 0.0, "completions/min_terminated_length": 197.0, "epoch": 0.208, "format_failures": 0.0, "grad_norm": 0.4501963257789612, "kl": 0.011977697955444455, "learning_rate": 1e-06, "loss": -0.0496, "num_tokens": 1932468.0, "reward": 0.37487921118736267, "reward_std": 0.29262858629226685, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 113.91666666666667, "completions/mean_terminated_length": 136.7, "completions/min_length": 0.0, "completions/min_terminated_length": 120.0, "epoch": 0.21, "format_failures": 0.0, "grad_norm": 3.2958946228027344, "kl": 0.024902154691517353, "learning_rate": 1e-06, "loss": 0.0181, "num_tokens": 1942992.0, "reward": 0.5, "reward_std": 0.5222329497337341, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 166.0, "completions/mean_terminated_length": 181.0909090909091, "completions/min_length": 0.0, "completions/min_terminated_length": 57.0, "epoch": 0.212, "format_failures": 0.0, "grad_norm": 1.3716078996658325, "kl": 1.098541870713234, "learning_rate": 1e-06, "loss": 0.0299, "num_tokens": 1964208.0, "reward": 0.07500000298023224, "reward_std": 0.17645499110221863, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 171.58333333333334, "completions/mean_terminated_length": 187.1818181818182, "completions/min_length": 0.0, "completions/min_terminated_length": 100.0, "epoch": 0.214, "format_failures": 2.0, "grad_norm": 0.27850034832954407, "kl": 0.020487794652581215, "learning_rate": 1e-06, "loss": 0.0329, "num_tokens": 1974972.0, "reward": 0.4126984477043152, "reward_std": 0.18834668397903442, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 45.416666666666664, "completions/mean_terminated_length": 49.54545454545455, "completions/min_length": 0.0, "completions/min_terminated_length": 34.0, "epoch": 0.216, "format_failures": 0.0, "grad_norm": 2.118313789367676, "kl": 0.03025034721940756, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 1981716.0, "reward": 0.8333333730697632, "reward_std": 0.38924944400787354, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 117.5, "completions/mean_terminated_length": 128.1818181818182, "completions/min_length": 0.0, "completions/min_terminated_length": 88.0, "epoch": 0.218, "format_failures": 0.0, "grad_norm": 1.9193243980407715, "kl": 0.04295819811522961, "learning_rate": 1e-06, "loss": 0.009, "num_tokens": 1992420.0, "reward": 0.701388955116272, "reward_std": 0.38302528858184814, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 108.66666666666667, "completions/mean_terminated_length": 118.54545454545455, "completions/min_length": 0.0, "completions/min_terminated_length": 92.0, "epoch": 0.22, "format_failures": 0.0, "grad_norm": 4.0581183433532715, "kl": 0.34252697695046663, "learning_rate": 1e-06, "loss": -0.014, "num_tokens": 2004288.0, "reward": 0.479166716337204, "reward_std": 0.30592837929725647, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 129.0, "completions/mean_terminated_length": 140.72727272727272, "completions/min_length": 0.0, "completions/min_terminated_length": 112.0, "epoch": 0.222, "format_failures": 0.0, "grad_norm": 2.901212692260742, "kl": 0.451558455824852, "learning_rate": 1e-06, "loss": 0.0047, "num_tokens": 2021400.0, "reward": 0.0, "reward_std": 0.0, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 147.08333333333334, "completions/mean_terminated_length": 160.45454545454547, "completions/min_length": 0.0, "completions/min_terminated_length": 65.0, "epoch": 0.224, "format_failures": 0.0, "grad_norm": 3.0557456016540527, "kl": 0.1749698342755437, "learning_rate": 1e-06, "loss": 0.0461, "num_tokens": 2033580.0, "reward": 0.7708333730697632, "reward_std": 0.32784304022789, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 81.75, "completions/mean_terminated_length": 89.18181818181819, "completions/min_length": 0.0, "completions/min_terminated_length": 41.0, "epoch": 0.226, "format_failures": 0.0, "grad_norm": 2.929105281829834, "kl": 1.0704956352710724, "learning_rate": 1e-06, "loss": -0.1432, "num_tokens": 2065740.0, "reward": 0.6625000238418579, "reward_std": 0.3711928129196167, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 420.5, "completions/mean_terminated_length": 458.72727272727275, "completions/min_length": 0.0, "completions/min_terminated_length": 171.0, "epoch": 0.228, "format_failures": 0.0, "grad_norm": 0.966941237449646, "kl": 0.012734876945614815, "learning_rate": 1e-06, "loss": -0.0432, "num_tokens": 2101236.0, "reward": 0.6500000357627869, "reward_std": 0.40886637568473816, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 263.75, "completions/mean_terminated_length": 287.72727272727275, "completions/min_length": 0.0, "completions/min_terminated_length": 280.0, "epoch": 0.23, "format_failures": 0.0, "grad_norm": 7.276376247406006, "kl": 2.2721076011657715, "learning_rate": 1e-06, "loss": 0.0151, "num_tokens": 2114484.0, "reward": 0.7777778506278992, "reward_std": 0.3576955795288086, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 167.41666666666666, "completions/mean_terminated_length": 182.63636363636363, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.232, "format_failures": 0.0, "grad_norm": 0.6819717884063721, "kl": 0.020047412253916264, "learning_rate": 1e-06, "loss": 0.0179, "num_tokens": 2125992.0, "reward": 0.8819445371627808, "reward_std": 0.2524084150791168, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 211.33333333333334, "completions/mean_terminated_length": 230.54545454545453, "completions/min_length": 0.0, "completions/min_terminated_length": 147.0, "epoch": 0.234, "format_failures": 0.0, "grad_norm": 0.19310350716114044, "kl": 0.019224281422793865, "learning_rate": 1e-06, "loss": 0.012, "num_tokens": 2137692.0, "reward": 0.585936427116394, "reward_std": 0.09784586727619171, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 142.16666666666666, "completions/mean_terminated_length": 155.0909090909091, "completions/min_length": 0.0, "completions/min_terminated_length": 110.0, "epoch": 0.236, "format_failures": 0.0, "grad_norm": 2.085691213607788, "kl": 0.09273007325828075, "learning_rate": 1e-06, "loss": 0.0139, "num_tokens": 2148816.0, "reward": 0.319444477558136, "reward_std": 0.2289450317621231, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 317.0833333333333, "completions/mean_terminated_length": 345.90909090909093, "completions/min_length": 0.0, "completions/min_terminated_length": 140.0, "epoch": 0.238, "format_failures": 0.0, "grad_norm": 0.37083595991134644, "kl": 0.0630851686000824, "learning_rate": 1e-06, "loss": 0.0918, "num_tokens": 2168256.0, "reward": 0.37870368361473083, "reward_std": 0.2895275950431824, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 126.66666666666667, "completions/mean_terminated_length": 138.1818181818182, "completions/min_length": 0.0, "completions/min_terminated_length": 58.0, "epoch": 0.24, "format_failures": 0.0, "grad_norm": 6.606923580169678, "kl": 3.8295647501945496, "learning_rate": 1e-06, "loss": 0.1365, "num_tokens": 2183124.0, "reward": 0.4027777910232544, "reward_std": 0.3723955750465393, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5833333333333333, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 32.083333333333336, "completions/mean_terminated_length": 77.0, "completions/min_length": 0.0, "completions/min_terminated_length": 77.0, "epoch": 0.242, "format_failures": 0.0, "grad_norm": 0.08047831058502197, "kl": 0.013985397294163704, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 2190396.0, "reward": 1.0, "reward_std": 0.0, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 431.0833333333333, "completions/mean_terminated_length": 470.27272727272725, "completions/min_length": 0.0, "completions/min_terminated_length": 109.0, "epoch": 0.244, "format_failures": 0.0, "grad_norm": 0.019394446164369583, "kl": 0.01961024198681116, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 2218320.0, "reward": 0.0, "reward_std": 0.0, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 284.9166666666667, "completions/mean_terminated_length": 310.8181818181818, "completions/min_length": 0.0, "completions/min_terminated_length": 118.0, "epoch": 0.246, "format_failures": 0.0, "grad_norm": 1.5184653997421265, "kl": 1.0404187738895416, "learning_rate": 1e-06, "loss": -0.0335, "num_tokens": 2231256.0, "reward": 0.4014219641685486, "reward_std": 0.31073111295700073, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 64.75, "completions/mean_terminated_length": 70.63636363636364, "completions/min_length": 0.0, "completions/min_terminated_length": 34.0, "epoch": 0.248, "format_failures": 0.0, "grad_norm": 1.6326740980148315, "kl": 0.3745545968413353, "learning_rate": 1e-06, "loss": 0.0517, "num_tokens": 2240424.0, "reward": 0.8037037253379822, "reward_std": 0.3365945816040039, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.6666666666666667, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 37.75, "completions/mean_terminated_length": 113.25, "completions/min_length": 0.0, "completions/min_terminated_length": 102.0, "epoch": 0.25, "format_failures": 0.0, "grad_norm": 10.052517890930176, "kl": 1.53599963337183, "learning_rate": 1e-06, "loss": -0.0049, "num_tokens": 2249424.0, "reward": 0.9166666865348816, "reward_std": 0.28867512941360474, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 199.5, "completions/mean_terminated_length": 217.63636363636363, "completions/min_length": 0.0, "completions/min_terminated_length": 160.0, "epoch": 0.252, "format_failures": 0.0, "grad_norm": 1.1388990879058838, "kl": 0.24531831266358495, "learning_rate": 1e-06, "loss": 0.0013, "num_tokens": 2263584.0, "reward": 0.0, "reward_std": 0.0, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 125.0, "completions/mean_terminated_length": 136.36363636363637, "completions/min_length": 0.0, "completions/min_terminated_length": 123.0, "epoch": 0.254, "format_failures": 0.0, "grad_norm": 2.392914056777954, "kl": 0.9988721050322056, "learning_rate": 1e-06, "loss": -0.0025, "num_tokens": 2276520.0, "reward": 0.7291666865348816, "reward_std": 0.3608439266681671, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 134.08333333333334, "completions/mean_terminated_length": 146.27272727272728, "completions/min_length": 0.0, "completions/min_terminated_length": 106.0, "epoch": 0.256, "format_failures": 0.0, "grad_norm": 0.5191885828971863, "kl": 0.20999768376350403, "learning_rate": 1e-06, "loss": 0.0146, "num_tokens": 2286408.0, "reward": 0.717815101146698, "reward_std": 0.14373189210891724, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 137.75, "completions/mean_terminated_length": 150.27272727272728, "completions/min_length": 0.0, "completions/min_terminated_length": 98.0, "epoch": 0.258, "format_failures": 0.0, "grad_norm": 1.204528570175171, "kl": 0.08800000417977571, "learning_rate": 1e-06, "loss": 0.0511, "num_tokens": 2296044.0, "reward": 0.5675595998764038, "reward_std": 0.2289842963218689, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 124.58333333333333, "completions/mean_terminated_length": 135.9090909090909, "completions/min_length": 0.0, "completions/min_terminated_length": 54.0, "epoch": 0.26, "format_failures": 0.0, "grad_norm": 0.44312867522239685, "kl": 0.07202759943902493, "learning_rate": 1e-06, "loss": 0.0475, "num_tokens": 2305644.0, "reward": 0.5101972222328186, "reward_std": 0.19489067792892456, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 281.1666666666667, "completions/mean_terminated_length": 306.72727272727275, "completions/min_length": 0.0, "completions/min_terminated_length": 253.0, "epoch": 0.262, "format_failures": 1.0, "grad_norm": 1.5526983737945557, "kl": 0.06795010529458523, "learning_rate": 1e-06, "loss": -0.0019, "num_tokens": 2319192.0, "reward": 0.75, "reward_std": 0.3217690885066986, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 162.83333333333334, "completions/mean_terminated_length": 177.63636363636363, "completions/min_length": 0.0, "completions/min_terminated_length": 175.0, "epoch": 0.264, "format_failures": 0.0, "grad_norm": 2.740288257598877, "kl": 0.7462278339080513, "learning_rate": 1e-06, "loss": 0.0045, "num_tokens": 2329488.0, "reward": 0.9791666865348816, "reward_std": 0.07216878235340118, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 315.5, "completions/mean_terminated_length": 344.1818181818182, "completions/min_length": 0.0, "completions/min_terminated_length": 233.0, "epoch": 0.266, "format_failures": 0.0, "grad_norm": 0.11069951951503754, "kl": 0.01982728624716401, "learning_rate": 1e-06, "loss": -0.034, "num_tokens": 2358276.0, "reward": 0.5852844715118408, "reward_std": 0.12080158293247223, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 161.16666666666666, "completions/mean_terminated_length": 175.8181818181818, "completions/min_length": 0.0, "completions/min_terminated_length": 104.0, "epoch": 0.268, "format_failures": 0.0, "grad_norm": 0.8276861906051636, "kl": 0.09472572058439255, "learning_rate": 1e-06, "loss": 0.0149, "num_tokens": 2368980.0, "reward": 0.6518849730491638, "reward_std": 0.2886110842227936, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 227.08333333333334, "completions/mean_terminated_length": 247.72727272727272, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 0.27, "format_failures": 0.0, "grad_norm": 0.5550012588500977, "kl": 0.02074157353490591, "learning_rate": 1e-06, "loss": -0.0841, "num_tokens": 2379828.0, "reward": 0.6243386268615723, "reward_std": 0.3905191719532013, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 210.0, "completions/mean_terminated_length": 229.0909090909091, "completions/min_length": 0.0, "completions/min_terminated_length": 228.0, "epoch": 0.272, "format_failures": 0.0, "grad_norm": 1.019722580909729, "kl": 0.13905800506472588, "learning_rate": 1e-06, "loss": 0.0123, "num_tokens": 2394360.0, "reward": 0.949999988079071, "reward_std": 0.17320507764816284, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 215.0, "completions/mean_terminated_length": 234.54545454545453, "completions/min_length": 0.0, "completions/min_terminated_length": 145.0, "epoch": 0.274, "format_failures": 0.0, "grad_norm": 0.32402342557907104, "kl": 0.014864406548440456, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 2406096.0, "reward": 0.6149470806121826, "reward_std": 0.19829140603542328, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 136.58333333333334, "completions/mean_terminated_length": 149.0, "completions/min_length": 0.0, "completions/min_terminated_length": 58.0, "epoch": 0.276, "format_failures": 0.0, "grad_norm": 1.005679965019226, "kl": 0.023909798823297024, "learning_rate": 1e-06, "loss": -0.0608, "num_tokens": 2423568.0, "reward": 0.5231481790542603, "reward_std": 0.3425479829311371, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 165.58333333333334, "completions/mean_terminated_length": 180.63636363636363, "completions/min_length": 0.0, "completions/min_terminated_length": 75.0, "epoch": 0.278, "format_failures": 0.0, "grad_norm": 3.9986395835876465, "kl": 2.975656658411026, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 2437320.0, "reward": 0.7277778387069702, "reward_std": 0.4172621965408325, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.33333333333333337, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 36.5, "completions/mean_terminated_length": 54.75, "completions/min_length": 0.0, "completions/min_terminated_length": 53.0, "epoch": 0.28, "format_failures": 0.0, "grad_norm": 0.04945458099246025, "kl": 0.008955058641731739, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 2449116.0, "reward": 1.0, "reward_std": 0.0, "step": 140 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 2449116, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }