| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.28, | |
| "eval_steps": 500, | |
| "global_step": 140, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 1005.0, | |
| "completions/max_terminated_length": 1005.0, | |
| "completions/mean_length": 442.6666666666667, | |
| "completions/mean_terminated_length": 482.90909090909093, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 212.0, | |
| "epoch": 0.002, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.3274489641189575, | |
| "kl": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.048, | |
| "num_tokens": 21804.0, | |
| "reward": 0.26185137033462524, | |
| "reward_std": 0.28920137882232666, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 264.0, | |
| "completions/max_terminated_length": 264.0, | |
| "completions/mean_length": 136.5, | |
| "completions/mean_terminated_length": 148.9090909090909, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 60.0, | |
| "epoch": 0.004, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.2693145275115967, | |
| "kl": 0.0, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0962, | |
| "num_tokens": 42324.0, | |
| "reward": 0.38461539149284363, | |
| "reward_std": 0.3770364224910736, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 362.0, | |
| "completions/max_terminated_length": 362.0, | |
| "completions/mean_length": 217.83333333333334, | |
| "completions/mean_terminated_length": 237.63636363636363, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 124.0, | |
| "epoch": 0.006, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.3044165074825287, | |
| "kl": 0.19029825925827026, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0009, | |
| "num_tokens": 58980.0, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 896.0, | |
| "completions/max_terminated_length": 896.0, | |
| "completions/mean_length": 321.0833333333333, | |
| "completions/mean_terminated_length": 350.27272727272725, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 103.0, | |
| "epoch": 0.008, | |
| "format_failures": 1.0, | |
| "grad_norm": 0.3372040390968323, | |
| "kl": 0.029289670288562775, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1107, | |
| "num_tokens": 81756.0, | |
| "reward": 0.23689448833465576, | |
| "reward_std": 0.2267814427614212, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 193.0, | |
| "completions/max_terminated_length": 193.0, | |
| "completions/mean_length": 119.08333333333333, | |
| "completions/mean_terminated_length": 129.9090909090909, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 66.0, | |
| "epoch": 0.01, | |
| "format_failures": 0.0, | |
| "grad_norm": 10.779764175415039, | |
| "kl": 3.1303787231445312, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0311, | |
| "num_tokens": 96360.0, | |
| "reward": 0.1666666716337204, | |
| "reward_std": 0.30772873759269714, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 745.0, | |
| "completions/max_terminated_length": 745.0, | |
| "completions/mean_length": 420.6666666666667, | |
| "completions/mean_terminated_length": 458.90909090909093, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 329.0, | |
| "epoch": 0.012, | |
| "format_failures": 1.0, | |
| "grad_norm": 0.2519327402114868, | |
| "kl": 0.016291129169985652, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0559, | |
| "num_tokens": 119712.0, | |
| "reward": 0.34878918528556824, | |
| "reward_std": 0.2739146649837494, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 164.0, | |
| "completions/max_terminated_length": 164.0, | |
| "completions/mean_length": 67.33333333333333, | |
| "completions/mean_terminated_length": 73.45454545454545, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 36.0, | |
| "epoch": 0.014, | |
| "format_failures": 0.0, | |
| "grad_norm": 2531.101806640625, | |
| "kl": 562.2636108398438, | |
| "learning_rate": 1e-06, | |
| "loss": 5.4405, | |
| "num_tokens": 128772.0, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 421.0, | |
| "completions/max_terminated_length": 421.0, | |
| "completions/mean_length": 186.41666666666666, | |
| "completions/mean_terminated_length": 203.36363636363637, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 85.0, | |
| "epoch": 0.016, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.7023671865463257, | |
| "kl": 0.0004708967899205163, | |
| "learning_rate": 1e-06, | |
| "loss": -0.1143, | |
| "num_tokens": 164100.0, | |
| "reward": 0.06388889253139496, | |
| "reward_std": 0.1274919956922531, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 484.0, | |
| "completions/max_terminated_length": 484.0, | |
| "completions/mean_length": 253.41666666666666, | |
| "completions/mean_terminated_length": 276.45454545454544, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 165.0, | |
| "epoch": 0.018, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.1911135911941528, | |
| "kl": 0.0012580148177221417, | |
| "learning_rate": 1e-06, | |
| "loss": -0.3277, | |
| "num_tokens": 197808.0, | |
| "reward": 0.1118159219622612, | |
| "reward_std": 0.2614404261112213, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 170.0, | |
| "completions/max_terminated_length": 170.0, | |
| "completions/mean_length": 64.83333333333333, | |
| "completions/mean_terminated_length": 70.72727272727273, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 35.0, | |
| "epoch": 0.02, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.324984073638916, | |
| "kl": 0.2648707218468189, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0221, | |
| "num_tokens": 207000.0, | |
| "reward": 0.01666666753590107, | |
| "reward_std": 0.057735029608011246, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 183.0, | |
| "completions/max_terminated_length": 183.0, | |
| "completions/mean_length": 126.33333333333333, | |
| "completions/mean_terminated_length": 137.8181818181818, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 62.0, | |
| "epoch": 0.022, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.5873882174491882, | |
| "kl": 0.017587594222277403, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0197, | |
| "num_tokens": 221808.0, | |
| "reward": 0.1805555671453476, | |
| "reward_std": 0.3134874999523163, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.16666666666666663, | |
| "completions/max_length": 2049.0, | |
| "completions/max_terminated_length": 2049.0, | |
| "completions/mean_length": 541.25, | |
| "completions/mean_terminated_length": 649.5, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 137.0, | |
| "epoch": 0.024, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.48546102643013, | |
| "kl": 0.002345994464121759, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0336, | |
| "num_tokens": 255132.0, | |
| "reward": 0.4682539701461792, | |
| "reward_std": 0.4320843815803528, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 53.0, | |
| "completions/max_terminated_length": 53.0, | |
| "completions/mean_length": 29.666666666666668, | |
| "completions/mean_terminated_length": 32.36363636363637, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 22.0, | |
| "epoch": 0.026, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.186175137758255, | |
| "kl": 0.041642000898718834, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "num_tokens": 265092.0, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 708.0, | |
| "completions/max_terminated_length": 708.0, | |
| "completions/mean_length": 381.6666666666667, | |
| "completions/mean_terminated_length": 416.3636363636364, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 188.0, | |
| "epoch": 0.028, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.20345070958137512, | |
| "kl": 0.009796573780477047, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0257, | |
| "num_tokens": 294096.0, | |
| "reward": 0.29761505126953125, | |
| "reward_std": 0.16453009843826294, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 1034.0, | |
| "completions/max_terminated_length": 1034.0, | |
| "completions/mean_length": 332.25, | |
| "completions/mean_terminated_length": 362.45454545454544, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 125.0, | |
| "epoch": 0.03, | |
| "format_failures": 1.0, | |
| "grad_norm": 0.5157941579818726, | |
| "kl": 0.004433898604474962, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0103, | |
| "num_tokens": 325368.0, | |
| "reward": 0.2917824387550354, | |
| "reward_std": 0.3325340151786804, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 268.0, | |
| "completions/max_terminated_length": 268.0, | |
| "completions/mean_length": 150.16666666666666, | |
| "completions/mean_terminated_length": 163.8181818181818, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 30.0, | |
| "epoch": 0.032, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.05657627806067467, | |
| "kl": 0.0326845021918416, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "num_tokens": 341196.0, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 296.0, | |
| "completions/max_terminated_length": 296.0, | |
| "completions/mean_length": 228.41666666666666, | |
| "completions/mean_terminated_length": 249.1818181818182, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 26.0, | |
| "epoch": 0.034, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.8653935194015503, | |
| "kl": 0.8598212422803044, | |
| "learning_rate": 1e-06, | |
| "loss": 0.014, | |
| "num_tokens": 354228.0, | |
| "reward": 0.01666666753590107, | |
| "reward_std": 0.05773502588272095, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 71.0, | |
| "completions/max_terminated_length": 71.0, | |
| "completions/mean_length": 48.333333333333336, | |
| "completions/mean_terminated_length": 52.72727272727273, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 25.0, | |
| "epoch": 0.036, | |
| "format_failures": 1.0, | |
| "grad_norm": 0.018069056794047356, | |
| "kl": 0.023271435871720314, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "num_tokens": 381468.0, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 719.0, | |
| "completions/max_terminated_length": 719.0, | |
| "completions/mean_length": 228.91666666666666, | |
| "completions/mean_terminated_length": 249.72727272727272, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 16.0, | |
| "epoch": 0.038, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.073132872581482, | |
| "kl": 0.003063492476940155, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0334, | |
| "num_tokens": 415356.0, | |
| "reward": 0.1666666716337204, | |
| "reward_std": 0.38924944400787354, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 153.0, | |
| "completions/max_terminated_length": 153.0, | |
| "completions/mean_length": 84.58333333333333, | |
| "completions/mean_terminated_length": 92.27272727272727, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 66.0, | |
| "epoch": 0.04, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.1736811399459839, | |
| "kl": 0.018741012550890446, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0962, | |
| "num_tokens": 442596.0, | |
| "reward": 0.1041666716337204, | |
| "reward_std": 0.22508415579795837, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 118.0, | |
| "completions/max_terminated_length": 118.0, | |
| "completions/mean_length": 89.58333333333333, | |
| "completions/mean_terminated_length": 97.72727272727273, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 69.0, | |
| "epoch": 0.042, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.960914671421051, | |
| "kl": 0.03209133446216583, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0169, | |
| "num_tokens": 453252.0, | |
| "reward": 0.2708333432674408, | |
| "reward_std": 0.4454101026058197, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 249.0, | |
| "completions/max_terminated_length": 249.0, | |
| "completions/mean_length": 124.33333333333333, | |
| "completions/mean_terminated_length": 135.63636363636363, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 74.0, | |
| "epoch": 0.044, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.0618880987167358, | |
| "kl": 0.03219995368272066, | |
| "learning_rate": 1e-06, | |
| "loss": -0.3593, | |
| "num_tokens": 481656.0, | |
| "reward": 0.09444444626569748, | |
| "reward_std": 0.17164288461208344, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 673.0, | |
| "completions/max_terminated_length": 673.0, | |
| "completions/mean_length": 299.5, | |
| "completions/mean_terminated_length": 326.72727272727275, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 148.0, | |
| "epoch": 0.046, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.3598278760910034, | |
| "kl": 0.031054741702973843, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0131, | |
| "num_tokens": 505704.0, | |
| "reward": 0.4847402572631836, | |
| "reward_std": 0.25003767013549805, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 497.0, | |
| "completions/max_terminated_length": 497.0, | |
| "completions/mean_length": 297.5, | |
| "completions/mean_terminated_length": 324.54545454545456, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 211.0, | |
| "epoch": 0.048, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.27960336208343506, | |
| "kl": 0.04240706283599138, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0398, | |
| "num_tokens": 523500.0, | |
| "reward": 0.2615740895271301, | |
| "reward_std": 0.219794362783432, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 383.0, | |
| "completions/max_terminated_length": 383.0, | |
| "completions/mean_length": 179.16666666666666, | |
| "completions/mean_terminated_length": 195.45454545454547, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 114.0, | |
| "epoch": 0.05, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.2980320453643799, | |
| "kl": 0.0048073166981339455, | |
| "learning_rate": 1e-06, | |
| "loss": -0.3887, | |
| "num_tokens": 555300.0, | |
| "reward": 0.5003399848937988, | |
| "reward_std": 0.39150455594062805, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 699.0, | |
| "completions/max_terminated_length": 699.0, | |
| "completions/mean_length": 315.9166666666667, | |
| "completions/mean_terminated_length": 344.6363636363636, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 239.0, | |
| "epoch": 0.052, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.2552706003189087, | |
| "kl": 0.027493927627801895, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0567, | |
| "num_tokens": 576000.0, | |
| "reward": 0.43729767203330994, | |
| "reward_std": 0.18975813686847687, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 140.0, | |
| "completions/max_terminated_length": 140.0, | |
| "completions/mean_length": 72.91666666666667, | |
| "completions/mean_terminated_length": 79.54545454545455, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 65.0, | |
| "epoch": 0.054, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.1299240589141846, | |
| "kl": 0.0332061443477869, | |
| "learning_rate": 1e-06, | |
| "loss": -0.057, | |
| "num_tokens": 584712.0, | |
| "reward": 0.33095240592956543, | |
| "reward_std": 0.444376677274704, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 181.0, | |
| "completions/max_terminated_length": 181.0, | |
| "completions/mean_length": 91.16666666666667, | |
| "completions/mean_terminated_length": 99.45454545454545, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 56.0, | |
| "epoch": 0.056, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.044371046125888824, | |
| "kl": 0.03765446413308382, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0004, | |
| "num_tokens": 598032.0, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 522.0, | |
| "completions/max_terminated_length": 522.0, | |
| "completions/mean_length": 304.5, | |
| "completions/mean_terminated_length": 332.1818181818182, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 212.0, | |
| "epoch": 0.058, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.5104940533638, | |
| "kl": 0.03451683558523655, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0274, | |
| "num_tokens": 615204.0, | |
| "reward": 0.4068452715873718, | |
| "reward_std": 0.37161099910736084, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 296.0, | |
| "completions/max_terminated_length": 296.0, | |
| "completions/mean_length": 162.91666666666666, | |
| "completions/mean_terminated_length": 177.72727272727272, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 59.0, | |
| "epoch": 0.06, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.2335582971572876, | |
| "kl": 0.007039119256660342, | |
| "learning_rate": 1e-06, | |
| "loss": 0.2673, | |
| "num_tokens": 647892.0, | |
| "reward": 0.3291666805744171, | |
| "reward_std": 0.4266456663608551, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 812.0, | |
| "completions/max_terminated_length": 812.0, | |
| "completions/mean_length": 332.5, | |
| "completions/mean_terminated_length": 362.72727272727275, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 222.0, | |
| "epoch": 0.062, | |
| "format_failures": 2.0, | |
| "grad_norm": 0.3000166416168213, | |
| "kl": 0.03664882015436888, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0306, | |
| "num_tokens": 670860.0, | |
| "reward": 0.6458902955055237, | |
| "reward_std": 0.26038500666618347, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 335.0, | |
| "completions/max_terminated_length": 335.0, | |
| "completions/mean_length": 218.66666666666666, | |
| "completions/mean_terminated_length": 238.54545454545453, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 180.0, | |
| "epoch": 0.064, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.37272748351097107, | |
| "kl": 0.07015270553529263, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0169, | |
| "num_tokens": 682212.0, | |
| "reward": 0.43658646941185, | |
| "reward_std": 0.24143192172050476, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 73.0, | |
| "completions/max_terminated_length": 73.0, | |
| "completions/mean_length": 53.25, | |
| "completions/mean_terminated_length": 58.09090909090909, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 47.0, | |
| "epoch": 0.066, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.1589769124984741, | |
| "kl": 0.03555137664079666, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0651, | |
| "num_tokens": 692040.0, | |
| "reward": 0.11666666716337204, | |
| "reward_std": 0.301008403301239, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 465.0, | |
| "completions/max_terminated_length": 465.0, | |
| "completions/mean_length": 336.0, | |
| "completions/mean_terminated_length": 366.54545454545456, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 292.0, | |
| "epoch": 0.068, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.42152470350265503, | |
| "kl": 0.19683832861483097, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0173, | |
| "num_tokens": 704484.0, | |
| "reward": 0.5136784911155701, | |
| "reward_std": 0.38917282223701477, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 93.0, | |
| "completions/max_terminated_length": 93.0, | |
| "completions/mean_length": 59.166666666666664, | |
| "completions/mean_terminated_length": 64.54545454545455, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 40.0, | |
| "epoch": 0.07, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.729435682296753, | |
| "kl": 0.055947478860616684, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0028, | |
| "num_tokens": 710520.0, | |
| "reward": 0.5611110925674438, | |
| "reward_std": 0.45256468653678894, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 192.0, | |
| "completions/max_terminated_length": 192.0, | |
| "completions/mean_length": 91.91666666666667, | |
| "completions/mean_terminated_length": 100.27272727272727, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 59.0, | |
| "epoch": 0.072, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.7297618389129639, | |
| "kl": 0.28226011246442795, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "num_tokens": 720588.0, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 255.0, | |
| "completions/max_terminated_length": 255.0, | |
| "completions/mean_length": 184.66666666666666, | |
| "completions/mean_terminated_length": 201.45454545454547, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 152.0, | |
| "epoch": 0.074, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.1786535382270813, | |
| "kl": 0.05143214017152786, | |
| "learning_rate": 1e-06, | |
| "loss": 0.001, | |
| "num_tokens": 731112.0, | |
| "reward": 0.5931217074394226, | |
| "reward_std": 0.15197694301605225, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 127.0, | |
| "completions/max_terminated_length": 127.0, | |
| "completions/mean_length": 61.416666666666664, | |
| "completions/mean_terminated_length": 67.0, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 36.0, | |
| "epoch": 0.076, | |
| "format_failures": 1.0, | |
| "grad_norm": 2.560441732406616, | |
| "kl": 0.061069367453455925, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1107, | |
| "num_tokens": 758340.0, | |
| "reward": 0.0833333358168602, | |
| "reward_std": 0.28867512941360474, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 2050.0, | |
| "completions/max_terminated_length": 2050.0, | |
| "completions/mean_length": 715.0, | |
| "completions/mean_terminated_length": 780.0, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 357.0, | |
| "epoch": 0.078, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.41932860016822815, | |
| "kl": 0.01548363408073783, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0106, | |
| "num_tokens": 790968.0, | |
| "reward": 0.25740742683410645, | |
| "reward_std": 0.32573264837265015, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 1162.0, | |
| "completions/max_terminated_length": 1162.0, | |
| "completions/mean_length": 471.75, | |
| "completions/mean_terminated_length": 514.6363636363636, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 113.0, | |
| "epoch": 0.08, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.8145480155944824, | |
| "kl": 0.016389482654631138, | |
| "learning_rate": 1e-06, | |
| "loss": 0.154, | |
| "num_tokens": 829104.0, | |
| "reward": 0.43334314227104187, | |
| "reward_std": 0.3763042986392975, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 254.0, | |
| "completions/max_terminated_length": 254.0, | |
| "completions/mean_length": 99.91666666666667, | |
| "completions/mean_terminated_length": 109.0, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 47.0, | |
| "epoch": 0.082, | |
| "format_failures": 0.0, | |
| "grad_norm": 18.232030868530273, | |
| "kl": 1.717683531343937, | |
| "learning_rate": 1e-06, | |
| "loss": 0.197, | |
| "num_tokens": 850716.0, | |
| "reward": 0.2430555671453476, | |
| "reward_std": 0.4042987823486328, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 156.0, | |
| "completions/max_terminated_length": 156.0, | |
| "completions/mean_length": 77.33333333333333, | |
| "completions/mean_terminated_length": 84.36363636363636, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 63.0, | |
| "epoch": 0.084, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.5794758796691895, | |
| "kl": 0.21323725581169128, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0344, | |
| "num_tokens": 859644.0, | |
| "reward": 0.0476190522313118, | |
| "reward_std": 0.1649572253227234, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 186.0, | |
| "completions/max_terminated_length": 186.0, | |
| "completions/mean_length": 136.66666666666666, | |
| "completions/mean_terminated_length": 149.0909090909091, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 57.0, | |
| "epoch": 0.086, | |
| "format_failures": 0.0, | |
| "grad_norm": 2.507535934448242, | |
| "kl": 0.2139158956706524, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0282, | |
| "num_tokens": 871596.0, | |
| "reward": 0.3333333432674408, | |
| "reward_std": 0.4923659861087799, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.33333333333333337, | |
| "completions/max_length": 53.0, | |
| "completions/max_terminated_length": 53.0, | |
| "completions/mean_length": 28.25, | |
| "completions/mean_terminated_length": 42.375, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 21.0, | |
| "epoch": 0.088, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.33207282423973083, | |
| "kl": 0.035286733880639076, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0008, | |
| "num_tokens": 879828.0, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 359.0, | |
| "completions/max_terminated_length": 359.0, | |
| "completions/mean_length": 117.83333333333333, | |
| "completions/mean_terminated_length": 128.54545454545453, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 68.0, | |
| "epoch": 0.09, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.2761678099632263, | |
| "kl": 0.15724625438451767, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "num_tokens": 899448.0, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 179.0, | |
| "completions/max_terminated_length": 179.0, | |
| "completions/mean_length": 105.16666666666667, | |
| "completions/mean_terminated_length": 114.72727272727273, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 26.0, | |
| "epoch": 0.092, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.1471128463745117, | |
| "kl": 0.12899010255932808, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0117, | |
| "num_tokens": 914760.0, | |
| "reward": 0.1666666716337204, | |
| "reward_std": 0.30151134729385376, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 345.0, | |
| "completions/max_terminated_length": 345.0, | |
| "completions/mean_length": 233.66666666666666, | |
| "completions/mean_terminated_length": 254.9090909090909, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 153.0, | |
| "epoch": 0.094, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.5467153191566467, | |
| "kl": 0.2796362675726414, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0318, | |
| "num_tokens": 925212.0, | |
| "reward": 0.549458920955658, | |
| "reward_std": 0.3676450848579407, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 264.0, | |
| "completions/max_terminated_length": 264.0, | |
| "completions/mean_length": 166.25, | |
| "completions/mean_terminated_length": 181.36363636363637, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 142.0, | |
| "epoch": 0.096, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.78724205493927, | |
| "kl": 0.49516983330249786, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0104, | |
| "num_tokens": 938424.0, | |
| "reward": 0.02083333395421505, | |
| "reward_std": 0.07216878235340118, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 203.0, | |
| "completions/max_terminated_length": 203.0, | |
| "completions/mean_length": 106.08333333333333, | |
| "completions/mean_terminated_length": 115.72727272727273, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 29.0, | |
| "epoch": 0.098, | |
| "format_failures": 1.0, | |
| "grad_norm": 1.7356528043746948, | |
| "kl": 0.389555960893631, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0599, | |
| "num_tokens": 950172.0, | |
| "reward": 0.1944444626569748, | |
| "reward_std": 0.38816672563552856, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.16666666666666663, | |
| "completions/max_length": 1127.0, | |
| "completions/max_terminated_length": 1127.0, | |
| "completions/mean_length": 186.58333333333334, | |
| "completions/mean_terminated_length": 223.9, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 49.0, | |
| "epoch": 0.1, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.3811311721801758, | |
| "kl": 0.0656690001487732, | |
| "learning_rate": 1e-06, | |
| "loss": 0.949, | |
| "num_tokens": 981816.0, | |
| "reward": 0.5007641911506653, | |
| "reward_std": 0.4272591173648834, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 121.0, | |
| "completions/max_terminated_length": 121.0, | |
| "completions/mean_length": 74.75, | |
| "completions/mean_terminated_length": 81.54545454545455, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 67.0, | |
| "epoch": 0.102, | |
| "format_failures": 0.0, | |
| "grad_norm": 3.630605697631836, | |
| "kl": 0.11415744014084339, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1083, | |
| "num_tokens": 994800.0, | |
| "reward": 0.4722222685813904, | |
| "reward_std": 0.4596514403820038, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 447.0, | |
| "completions/max_terminated_length": 447.0, | |
| "completions/mean_length": 292.9166666666667, | |
| "completions/mean_terminated_length": 319.54545454545456, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 230.0, | |
| "epoch": 0.104, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.664616048336029, | |
| "kl": 0.024851050227880478, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0988, | |
| "num_tokens": 1028352.0, | |
| "reward": 0.5121031999588013, | |
| "reward_std": 0.26174625754356384, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 478.0, | |
| "completions/max_terminated_length": 478.0, | |
| "completions/mean_length": 267.4166666666667, | |
| "completions/mean_terminated_length": 291.72727272727275, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 158.0, | |
| "epoch": 0.106, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.3362949788570404, | |
| "kl": 0.09099859930574894, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0303, | |
| "num_tokens": 1053264.0, | |
| "reward": 0.0625, | |
| "reward_std": 0.21650634706020355, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 571.0, | |
| "completions/max_terminated_length": 571.0, | |
| "completions/mean_length": 292.0833333333333, | |
| "completions/mean_terminated_length": 318.6363636363636, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 190.0, | |
| "epoch": 0.108, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.17621153593063354, | |
| "kl": 0.03119577933102846, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0012, | |
| "num_tokens": 1068108.0, | |
| "reward": 0.4200083613395691, | |
| "reward_std": 0.194437637925148, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 168.0, | |
| "completions/max_terminated_length": 168.0, | |
| "completions/mean_length": 88.75, | |
| "completions/mean_terminated_length": 96.81818181818181, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 69.0, | |
| "epoch": 0.11, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.6367191672325134, | |
| "kl": 0.03671593498438597, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0088, | |
| "num_tokens": 1079820.0, | |
| "reward": 0.19027778506278992, | |
| "reward_std": 0.15930061042308807, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 189.0, | |
| "completions/max_terminated_length": 189.0, | |
| "completions/mean_length": 163.58333333333334, | |
| "completions/mean_terminated_length": 178.45454545454547, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 113.0, | |
| "epoch": 0.112, | |
| "format_failures": 0.0, | |
| "grad_norm": 2.1606733798980713, | |
| "kl": 0.20935122203081846, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0277, | |
| "num_tokens": 1091832.0, | |
| "reward": 0.5777778029441833, | |
| "reward_std": 0.4515592157840729, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 456.0, | |
| "completions/max_terminated_length": 456.0, | |
| "completions/mean_length": 288.4166666666667, | |
| "completions/mean_terminated_length": 314.6363636363636, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 169.0, | |
| "epoch": 0.114, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.32393601536750793, | |
| "kl": 0.031358057633042336, | |
| "learning_rate": 1e-06, | |
| "loss": -0.044, | |
| "num_tokens": 1105608.0, | |
| "reward": 0.1666666716337204, | |
| "reward_std": 0.24984844028949738, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 96.0, | |
| "completions/max_terminated_length": 96.0, | |
| "completions/mean_length": 65.5, | |
| "completions/mean_terminated_length": 71.45454545454545, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 55.0, | |
| "epoch": 0.116, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.021954922005534172, | |
| "kl": 0.018348069861531258, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "num_tokens": 1113168.0, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 533.0, | |
| "completions/max_terminated_length": 533.0, | |
| "completions/mean_length": 224.41666666666666, | |
| "completions/mean_terminated_length": 244.8181818181818, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 92.0, | |
| "epoch": 0.118, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.1990734338760376, | |
| "kl": 0.3062889650464058, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0431, | |
| "num_tokens": 1136832.0, | |
| "reward": 0.2395833432674408, | |
| "reward_std": 0.25259074568748474, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 374.0, | |
| "completions/max_terminated_length": 374.0, | |
| "completions/mean_length": 238.0, | |
| "completions/mean_terminated_length": 259.6363636363636, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 80.0, | |
| "epoch": 0.12, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.5170612931251526, | |
| "kl": 0.03292474150657654, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0251, | |
| "num_tokens": 1150536.0, | |
| "reward": 0.39345240592956543, | |
| "reward_std": 0.3553503155708313, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 158.0, | |
| "completions/max_terminated_length": 158.0, | |
| "completions/mean_length": 82.16666666666667, | |
| "completions/mean_terminated_length": 89.63636363636364, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 65.0, | |
| "epoch": 0.122, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.1562092304229736, | |
| "kl": 0.023061166517436504, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1452, | |
| "num_tokens": 1158984.0, | |
| "reward": 0.7333333492279053, | |
| "reward_std": 0.3639269173145294, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 553.0, | |
| "completions/max_terminated_length": 553.0, | |
| "completions/mean_length": 296.1666666666667, | |
| "completions/mean_terminated_length": 323.09090909090907, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 201.0, | |
| "epoch": 0.124, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.32044336199760437, | |
| "kl": 0.06375124305486679, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0015, | |
| "num_tokens": 1173504.0, | |
| "reward": 0.43736547231674194, | |
| "reward_std": 0.25956276059150696, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 2051.0, | |
| "completions/max_terminated_length": 2051.0, | |
| "completions/mean_length": 586.4166666666666, | |
| "completions/mean_terminated_length": 639.7272727272727, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 38.0, | |
| "epoch": 0.126, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.6462875008583069, | |
| "kl": 0.023477558977901936, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0492, | |
| "num_tokens": 1206840.0, | |
| "reward": 0.501884937286377, | |
| "reward_std": 0.5706992149353027, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 270.0, | |
| "completions/max_terminated_length": 270.0, | |
| "completions/mean_length": 150.66666666666666, | |
| "completions/mean_terminated_length": 164.36363636363637, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 97.0, | |
| "epoch": 0.128, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.4827415347099304, | |
| "kl": 0.11513948068022728, | |
| "learning_rate": 1e-06, | |
| "loss": 0.2183, | |
| "num_tokens": 1230888.0, | |
| "reward": 0.3715476393699646, | |
| "reward_std": 0.17215265333652496, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 1340.0, | |
| "completions/max_terminated_length": 1340.0, | |
| "completions/mean_length": 277.5833333333333, | |
| "completions/mean_terminated_length": 302.8181818181818, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 20.0, | |
| "epoch": 0.13, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.46889665722846985, | |
| "kl": 0.9275694619864225, | |
| "learning_rate": 1e-06, | |
| "loss": 0.2754, | |
| "num_tokens": 1262100.0, | |
| "reward": 0.3917522430419922, | |
| "reward_std": 0.2266404628753662, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 599.0, | |
| "completions/max_terminated_length": 599.0, | |
| "completions/mean_length": 366.25, | |
| "completions/mean_terminated_length": 399.54545454545456, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 212.0, | |
| "epoch": 0.132, | |
| "format_failures": 1.0, | |
| "grad_norm": 0.30657899379730225, | |
| "kl": 0.16883518174290657, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0155, | |
| "num_tokens": 1278012.0, | |
| "reward": 0.34761905670166016, | |
| "reward_std": 0.2757572531700134, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.16666666666666663, | |
| "completions/max_length": 559.0, | |
| "completions/max_terminated_length": 559.0, | |
| "completions/mean_length": 300.9166666666667, | |
| "completions/mean_terminated_length": 361.1, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 224.0, | |
| "epoch": 0.134, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.6152874231338501, | |
| "kl": 0.10999106336385012, | |
| "learning_rate": 1e-06, | |
| "loss": 0.3303, | |
| "num_tokens": 1308996.0, | |
| "reward": 0.32609128952026367, | |
| "reward_std": 0.23752012848854065, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 227.0, | |
| "completions/max_terminated_length": 227.0, | |
| "completions/mean_length": 137.5, | |
| "completions/mean_terminated_length": 150.0, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 59.0, | |
| "epoch": 0.136, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.7395364046096802, | |
| "kl": 0.7087040841579437, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0121, | |
| "num_tokens": 1321020.0, | |
| "reward": 0.20873016119003296, | |
| "reward_std": 0.34043052792549133, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 297.0, | |
| "completions/max_terminated_length": 297.0, | |
| "completions/mean_length": 129.83333333333334, | |
| "completions/mean_terminated_length": 141.63636363636363, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 28.0, | |
| "epoch": 0.138, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.902642548084259, | |
| "kl": 0.7902000248432159, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0035, | |
| "num_tokens": 1332492.0, | |
| "reward": 0.0877976268529892, | |
| "reward_std": 0.20928393304347992, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 1172.0, | |
| "completions/max_terminated_length": 1172.0, | |
| "completions/mean_length": 333.1666666666667, | |
| "completions/mean_terminated_length": 444.22222222222223, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 133.0, | |
| "epoch": 0.14, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.22367094457149506, | |
| "kl": 0.03544241935014725, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0442, | |
| "num_tokens": 1363812.0, | |
| "reward": 0.22601282596588135, | |
| "reward_std": 0.1535530686378479, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 547.0, | |
| "completions/max_terminated_length": 547.0, | |
| "completions/mean_length": 368.5833333333333, | |
| "completions/mean_terminated_length": 402.09090909090907, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 205.0, | |
| "epoch": 0.142, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.25884878635406494, | |
| "kl": 0.0446395231410861, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0091, | |
| "num_tokens": 1396788.0, | |
| "reward": 0.6545634865760803, | |
| "reward_std": 0.2292691022157669, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 228.0, | |
| "completions/max_terminated_length": 228.0, | |
| "completions/mean_length": 127.75, | |
| "completions/mean_terminated_length": 139.36363636363637, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 62.0, | |
| "epoch": 0.144, | |
| "format_failures": 0.0, | |
| "grad_norm": 2.139310121536255, | |
| "kl": 0.2615228593349457, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0935, | |
| "num_tokens": 1411512.0, | |
| "reward": 0.625, | |
| "reward_std": 0.4826536476612091, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 644.0, | |
| "completions/max_terminated_length": 644.0, | |
| "completions/mean_length": 321.1666666666667, | |
| "completions/mean_terminated_length": 350.3636363636364, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 194.0, | |
| "epoch": 0.146, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.7009347081184387, | |
| "kl": 0.13678913563489914, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0771, | |
| "num_tokens": 1436532.0, | |
| "reward": 0.3439815044403076, | |
| "reward_std": 0.27971503138542175, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 280.0, | |
| "completions/max_terminated_length": 280.0, | |
| "completions/mean_length": 253.08333333333334, | |
| "completions/mean_terminated_length": 276.09090909090907, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 271.0, | |
| "epoch": 0.148, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.2899372577667236, | |
| "kl": 0.10085960477590561, | |
| "learning_rate": 1e-06, | |
| "loss": 0.3862, | |
| "num_tokens": 1471704.0, | |
| "reward": 0.7222222685813904, | |
| "reward_std": 0.4457052946090698, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 308.0, | |
| "completions/max_terminated_length": 308.0, | |
| "completions/mean_length": 196.5, | |
| "completions/mean_terminated_length": 214.36363636363637, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 54.0, | |
| "epoch": 0.15, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.4177331328392029, | |
| "kl": 0.026733385398983955, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0579, | |
| "num_tokens": 1485468.0, | |
| "reward": 0.2735119163990021, | |
| "reward_std": 0.30911651253700256, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 368.0, | |
| "completions/max_terminated_length": 368.0, | |
| "completions/mean_length": 200.41666666666666, | |
| "completions/mean_terminated_length": 218.63636363636363, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 62.0, | |
| "epoch": 0.152, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.8074631094932556, | |
| "kl": 0.45791861414909363, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0476, | |
| "num_tokens": 1500636.0, | |
| "reward": 0.17129629850387573, | |
| "reward_std": 0.19502559304237366, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 220.0, | |
| "completions/max_terminated_length": 220.0, | |
| "completions/mean_length": 144.08333333333334, | |
| "completions/mean_terminated_length": 157.1818181818182, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 21.0, | |
| "epoch": 0.154, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.8004605770111084, | |
| "kl": 0.32159996032714844, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0603, | |
| "num_tokens": 1512264.0, | |
| "reward": 0.5055555701255798, | |
| "reward_std": 0.29963788390159607, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 564.0, | |
| "completions/max_terminated_length": 564.0, | |
| "completions/mean_length": 312.1666666666667, | |
| "completions/mean_terminated_length": 340.54545454545456, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 170.0, | |
| "epoch": 0.156, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.3055727481842041, | |
| "kl": 0.03414521459490061, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0067, | |
| "num_tokens": 1526292.0, | |
| "reward": 0.5897321701049805, | |
| "reward_std": 0.2986750900745392, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 192.0, | |
| "completions/max_terminated_length": 192.0, | |
| "completions/mean_length": 167.5, | |
| "completions/mean_terminated_length": 182.72727272727272, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 165.0, | |
| "epoch": 0.158, | |
| "format_failures": 0.0, | |
| "grad_norm": 2.3401753902435303, | |
| "kl": 0.03888106718659401, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0218, | |
| "num_tokens": 1540416.0, | |
| "reward": 0.6666666865348816, | |
| "reward_std": 0.4923659861087799, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.16666666666666663, | |
| "completions/max_length": 291.0, | |
| "completions/max_terminated_length": 291.0, | |
| "completions/mean_length": 210.91666666666666, | |
| "completions/mean_terminated_length": 253.1, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 137.0, | |
| "epoch": 0.16, | |
| "format_failures": 0.0, | |
| "grad_norm": 28.73111343383789, | |
| "kl": 15.663371562957764, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0445, | |
| "num_tokens": 1553580.0, | |
| "reward": 0.4305555820465088, | |
| "reward_std": 0.4738534092903137, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 60.0, | |
| "completions/max_terminated_length": 60.0, | |
| "completions/mean_length": 43.166666666666664, | |
| "completions/mean_terminated_length": 47.09090909090909, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 32.0, | |
| "epoch": 0.162, | |
| "format_failures": 0.0, | |
| "grad_norm": 13.234149932861328, | |
| "kl": 2.6492202281951904, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0385, | |
| "num_tokens": 1560816.0, | |
| "reward": 0.27916666865348816, | |
| "reward_std": 0.42504456639289856, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 331.0, | |
| "completions/max_terminated_length": 331.0, | |
| "completions/mean_length": 189.66666666666666, | |
| "completions/mean_terminated_length": 206.9090909090909, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 142.0, | |
| "epoch": 0.164, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.0555896759033203, | |
| "kl": 0.060676803812384605, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0432, | |
| "num_tokens": 1573524.0, | |
| "reward": 0.39722225069999695, | |
| "reward_std": 0.2684729993343353, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 684.0, | |
| "completions/max_terminated_length": 684.0, | |
| "completions/mean_length": 482.1666666666667, | |
| "completions/mean_terminated_length": 526.0, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 479.0, | |
| "epoch": 0.166, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.27017322182655334, | |
| "kl": 0.013310576789081097, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0023, | |
| "num_tokens": 1595796.0, | |
| "reward": 0.8000000715255737, | |
| "reward_std": 0.39080336689949036, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 257.0, | |
| "completions/max_terminated_length": 257.0, | |
| "completions/mean_length": 144.91666666666666, | |
| "completions/mean_terminated_length": 158.0909090909091, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 83.0, | |
| "epoch": 0.168, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.0021555423736572, | |
| "kl": 0.2212899848818779, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0304, | |
| "num_tokens": 1606284.0, | |
| "reward": 0.2957010865211487, | |
| "reward_std": 0.2737172842025757, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 2050.0, | |
| "completions/max_terminated_length": 2050.0, | |
| "completions/mean_length": 510.0833333333333, | |
| "completions/mean_terminated_length": 556.4545454545455, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 216.0, | |
| "epoch": 0.17, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.3675689399242401, | |
| "kl": 0.2206931747496128, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1278, | |
| "num_tokens": 1639152.0, | |
| "reward": 0.43888890743255615, | |
| "reward_std": 0.2596941888332367, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 280.0, | |
| "completions/max_terminated_length": 280.0, | |
| "completions/mean_length": 152.25, | |
| "completions/mean_terminated_length": 166.0909090909091, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 117.0, | |
| "epoch": 0.172, | |
| "format_failures": 0.0, | |
| "grad_norm": 2.8949317932128906, | |
| "kl": 1.413679599761963, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0356, | |
| "num_tokens": 1652364.0, | |
| "reward": 0.4761905074119568, | |
| "reward_std": 0.5035434365272522, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 245.0, | |
| "completions/max_terminated_length": 245.0, | |
| "completions/mean_length": 152.91666666666666, | |
| "completions/mean_terminated_length": 166.8181818181818, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 58.0, | |
| "epoch": 0.174, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.7609695196151733, | |
| "kl": 0.07055489160120487, | |
| "learning_rate": 1e-06, | |
| "loss": 0.3366, | |
| "num_tokens": 1685136.0, | |
| "reward": 0.33750003576278687, | |
| "reward_std": 0.43647608160972595, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 180.0, | |
| "completions/max_terminated_length": 180.0, | |
| "completions/mean_length": 135.25, | |
| "completions/mean_terminated_length": 147.54545454545453, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 77.0, | |
| "epoch": 0.176, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.6215497255325317, | |
| "kl": 0.08650689758360386, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0112, | |
| "num_tokens": 1693764.0, | |
| "reward": 0.5745911598205566, | |
| "reward_std": 0.1768045872449875, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 140.0, | |
| "completions/max_terminated_length": 140.0, | |
| "completions/mean_length": 73.25, | |
| "completions/mean_terminated_length": 79.9090909090909, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 61.0, | |
| "epoch": 0.178, | |
| "format_failures": 1.0, | |
| "grad_norm": 0.8421996235847473, | |
| "kl": 0.016213122755289078, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0149, | |
| "num_tokens": 1707588.0, | |
| "reward": 0.06666667014360428, | |
| "reward_std": 0.1775250881910324, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 277.0, | |
| "completions/max_terminated_length": 277.0, | |
| "completions/mean_length": 178.16666666666666, | |
| "completions/mean_terminated_length": 194.36363636363637, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 101.0, | |
| "epoch": 0.18, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.4202212691307068, | |
| "kl": 0.3119240030646324, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0093, | |
| "num_tokens": 1716792.0, | |
| "reward": 0.6381944417953491, | |
| "reward_std": 0.22775352001190186, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 220.0, | |
| "completions/max_terminated_length": 220.0, | |
| "completions/mean_length": 165.58333333333334, | |
| "completions/mean_terminated_length": 180.63636363636363, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 56.0, | |
| "epoch": 0.182, | |
| "format_failures": 0.0, | |
| "grad_norm": 3.5526509284973145, | |
| "kl": 0.04295740742236376, | |
| "learning_rate": 1e-06, | |
| "loss": -0.007, | |
| "num_tokens": 1735188.0, | |
| "reward": 0.6666666865348816, | |
| "reward_std": 0.4923659861087799, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 554.0, | |
| "completions/max_terminated_length": 554.0, | |
| "completions/mean_length": 296.3333333333333, | |
| "completions/mean_terminated_length": 323.27272727272725, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 142.0, | |
| "epoch": 0.184, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.7098760008811951, | |
| "kl": 0.14585042744874954, | |
| "learning_rate": 1e-06, | |
| "loss": -0.052, | |
| "num_tokens": 1748808.0, | |
| "reward": 0.4570105969905853, | |
| "reward_std": 0.29787296056747437, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 442.0, | |
| "completions/max_terminated_length": 442.0, | |
| "completions/mean_length": 325.1666666666667, | |
| "completions/mean_terminated_length": 354.72727272727275, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 233.0, | |
| "epoch": 0.186, | |
| "format_failures": 0.0, | |
| "grad_norm": 4.00807523727417, | |
| "kl": 2.2327868938446045, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0328, | |
| "num_tokens": 1763196.0, | |
| "reward": 0.37762749195098877, | |
| "reward_std": 0.2510078251361847, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 146.0, | |
| "completions/max_terminated_length": 146.0, | |
| "completions/mean_length": 78.66666666666667, | |
| "completions/mean_terminated_length": 85.81818181818181, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 60.0, | |
| "epoch": 0.188, | |
| "format_failures": 0.0, | |
| "grad_norm": 4.166850566864014, | |
| "kl": 0.4828091114759445, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0043, | |
| "num_tokens": 1775700.0, | |
| "reward": 0.41428571939468384, | |
| "reward_std": 0.20157082378864288, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 288.0, | |
| "completions/max_terminated_length": 288.0, | |
| "completions/mean_length": 163.0, | |
| "completions/mean_terminated_length": 177.8181818181818, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 86.0, | |
| "epoch": 0.19, | |
| "format_failures": 0.0, | |
| "grad_norm": 2.0013251304626465, | |
| "kl": 0.3356290655210614, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0532, | |
| "num_tokens": 1790064.0, | |
| "reward": 0.4275793731212616, | |
| "reward_std": 0.3848039209842682, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 144.0, | |
| "completions/max_terminated_length": 144.0, | |
| "completions/mean_length": 128.33333333333334, | |
| "completions/mean_terminated_length": 140.0, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 134.0, | |
| "epoch": 0.192, | |
| "format_failures": 0.0, | |
| "grad_norm": 6.922305107116699, | |
| "kl": 3.5449295742437243, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0385, | |
| "num_tokens": 1803036.0, | |
| "reward": 0.6979166865348816, | |
| "reward_std": 0.31738603115081787, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 221.0, | |
| "completions/max_terminated_length": 221.0, | |
| "completions/mean_length": 94.33333333333333, | |
| "completions/mean_terminated_length": 102.9090909090909, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 72.0, | |
| "epoch": 0.194, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.4514728784561157, | |
| "kl": 0.1412234902381897, | |
| "learning_rate": 1e-06, | |
| "loss": 0.3157, | |
| "num_tokens": 1816092.0, | |
| "reward": 0.8380953073501587, | |
| "reward_std": 0.30834609270095825, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 82.0, | |
| "completions/max_terminated_length": 82.0, | |
| "completions/mean_length": 43.5, | |
| "completions/mean_terminated_length": 47.45454545454545, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 31.0, | |
| "epoch": 0.196, | |
| "format_failures": 0.0, | |
| "grad_norm": 2.004136085510254, | |
| "kl": 0.6110408902168274, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0095, | |
| "num_tokens": 1827024.0, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 373.0, | |
| "completions/max_terminated_length": 373.0, | |
| "completions/mean_length": 212.08333333333334, | |
| "completions/mean_terminated_length": 231.36363636363637, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 102.0, | |
| "epoch": 0.198, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.8370314240455627, | |
| "kl": 0.09233395755290985, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1438, | |
| "num_tokens": 1860576.0, | |
| "reward": 0.2782828211784363, | |
| "reward_std": 0.2644941210746765, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 225.0, | |
| "completions/max_terminated_length": 225.0, | |
| "completions/mean_length": 163.25, | |
| "completions/mean_terminated_length": 178.0909090909091, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 101.0, | |
| "epoch": 0.2, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.565374732017517, | |
| "kl": 0.391565203666687, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0497, | |
| "num_tokens": 1872996.0, | |
| "reward": 0.5944445133209229, | |
| "reward_std": 0.47775429487228394, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 411.0, | |
| "completions/max_terminated_length": 411.0, | |
| "completions/mean_length": 150.16666666666666, | |
| "completions/mean_terminated_length": 163.8181818181818, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 94.0, | |
| "epoch": 0.202, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.6569881439208984, | |
| "kl": 0.24375841114670038, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0387, | |
| "num_tokens": 1892856.0, | |
| "reward": 0.3499999940395355, | |
| "reward_std": 0.36666667461395264, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 150.0, | |
| "completions/max_terminated_length": 150.0, | |
| "completions/mean_length": 107.66666666666667, | |
| "completions/mean_terminated_length": 117.45454545454545, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 93.0, | |
| "epoch": 0.204, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.9490823745727539, | |
| "kl": 0.010788497282192111, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0193, | |
| "num_tokens": 1903992.0, | |
| "reward": 0.7714947462081909, | |
| "reward_std": 0.2890874743461609, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 110.0, | |
| "completions/max_terminated_length": 110.0, | |
| "completions/mean_length": 66.0, | |
| "completions/mean_terminated_length": 72.0, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 24.0, | |
| "epoch": 0.206, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.482935905456543, | |
| "kl": 0.03114949818700552, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0754, | |
| "num_tokens": 1913640.0, | |
| "reward": 0.3333333432674408, | |
| "reward_std": 0.32566946744918823, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 379.0, | |
| "completions/max_terminated_length": 379.0, | |
| "completions/mean_length": 260.5833333333333, | |
| "completions/mean_terminated_length": 284.27272727272725, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 197.0, | |
| "epoch": 0.208, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.4501963257789612, | |
| "kl": 0.011977697955444455, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0496, | |
| "num_tokens": 1932468.0, | |
| "reward": 0.37487921118736267, | |
| "reward_std": 0.29262858629226685, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.16666666666666663, | |
| "completions/max_length": 143.0, | |
| "completions/max_terminated_length": 143.0, | |
| "completions/mean_length": 113.91666666666667, | |
| "completions/mean_terminated_length": 136.7, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 120.0, | |
| "epoch": 0.21, | |
| "format_failures": 0.0, | |
| "grad_norm": 3.2958946228027344, | |
| "kl": 0.024902154691517353, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0181, | |
| "num_tokens": 1942992.0, | |
| "reward": 0.5, | |
| "reward_std": 0.5222329497337341, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 714.0, | |
| "completions/max_terminated_length": 714.0, | |
| "completions/mean_length": 166.0, | |
| "completions/mean_terminated_length": 181.0909090909091, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 57.0, | |
| "epoch": 0.212, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.3716078996658325, | |
| "kl": 1.098541870713234, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0299, | |
| "num_tokens": 1964208.0, | |
| "reward": 0.07500000298023224, | |
| "reward_std": 0.17645499110221863, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 321.0, | |
| "completions/max_terminated_length": 321.0, | |
| "completions/mean_length": 171.58333333333334, | |
| "completions/mean_terminated_length": 187.1818181818182, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 100.0, | |
| "epoch": 0.214, | |
| "format_failures": 2.0, | |
| "grad_norm": 0.27850034832954407, | |
| "kl": 0.020487794652581215, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0329, | |
| "num_tokens": 1974972.0, | |
| "reward": 0.4126984477043152, | |
| "reward_std": 0.18834668397903442, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 54.0, | |
| "completions/max_terminated_length": 54.0, | |
| "completions/mean_length": 45.416666666666664, | |
| "completions/mean_terminated_length": 49.54545454545455, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 34.0, | |
| "epoch": 0.216, | |
| "format_failures": 0.0, | |
| "grad_norm": 2.118313789367676, | |
| "kl": 0.03025034721940756, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 1981716.0, | |
| "reward": 0.8333333730697632, | |
| "reward_std": 0.38924944400787354, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 233.0, | |
| "completions/max_terminated_length": 233.0, | |
| "completions/mean_length": 117.5, | |
| "completions/mean_terminated_length": 128.1818181818182, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 88.0, | |
| "epoch": 0.218, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.9193243980407715, | |
| "kl": 0.04295819811522961, | |
| "learning_rate": 1e-06, | |
| "loss": 0.009, | |
| "num_tokens": 1992420.0, | |
| "reward": 0.701388955116272, | |
| "reward_std": 0.38302528858184814, | |
| "step": 109 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 131.0, | |
| "completions/max_terminated_length": 131.0, | |
| "completions/mean_length": 108.66666666666667, | |
| "completions/mean_terminated_length": 118.54545454545455, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 92.0, | |
| "epoch": 0.22, | |
| "format_failures": 0.0, | |
| "grad_norm": 4.0581183433532715, | |
| "kl": 0.34252697695046663, | |
| "learning_rate": 1e-06, | |
| "loss": -0.014, | |
| "num_tokens": 2004288.0, | |
| "reward": 0.479166716337204, | |
| "reward_std": 0.30592837929725647, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 192.0, | |
| "completions/max_terminated_length": 192.0, | |
| "completions/mean_length": 129.0, | |
| "completions/mean_terminated_length": 140.72727272727272, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 112.0, | |
| "epoch": 0.222, | |
| "format_failures": 0.0, | |
| "grad_norm": 2.901212692260742, | |
| "kl": 0.451558455824852, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0047, | |
| "num_tokens": 2021400.0, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 174.0, | |
| "completions/max_terminated_length": 174.0, | |
| "completions/mean_length": 147.08333333333334, | |
| "completions/mean_terminated_length": 160.45454545454547, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 65.0, | |
| "epoch": 0.224, | |
| "format_failures": 0.0, | |
| "grad_norm": 3.0557456016540527, | |
| "kl": 0.1749698342755437, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0461, | |
| "num_tokens": 2033580.0, | |
| "reward": 0.7708333730697632, | |
| "reward_std": 0.32784304022789, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 274.0, | |
| "completions/max_terminated_length": 274.0, | |
| "completions/mean_length": 81.75, | |
| "completions/mean_terminated_length": 89.18181818181819, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 41.0, | |
| "epoch": 0.226, | |
| "format_failures": 0.0, | |
| "grad_norm": 2.929105281829834, | |
| "kl": 1.0704956352710724, | |
| "learning_rate": 1e-06, | |
| "loss": -0.1432, | |
| "num_tokens": 2065740.0, | |
| "reward": 0.6625000238418579, | |
| "reward_std": 0.3711928129196167, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 796.0, | |
| "completions/max_terminated_length": 796.0, | |
| "completions/mean_length": 420.5, | |
| "completions/mean_terminated_length": 458.72727272727275, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 171.0, | |
| "epoch": 0.228, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.966941237449646, | |
| "kl": 0.012734876945614815, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0432, | |
| "num_tokens": 2101236.0, | |
| "reward": 0.6500000357627869, | |
| "reward_std": 0.40886637568473816, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 302.0, | |
| "completions/max_terminated_length": 302.0, | |
| "completions/mean_length": 263.75, | |
| "completions/mean_terminated_length": 287.72727272727275, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 280.0, | |
| "epoch": 0.23, | |
| "format_failures": 0.0, | |
| "grad_norm": 7.276376247406006, | |
| "kl": 2.2721076011657715, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0151, | |
| "num_tokens": 2114484.0, | |
| "reward": 0.7777778506278992, | |
| "reward_std": 0.3576955795288086, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 188.0, | |
| "completions/max_terminated_length": 188.0, | |
| "completions/mean_length": 167.41666666666666, | |
| "completions/mean_terminated_length": 182.63636363636363, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 147.0, | |
| "epoch": 0.232, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.6819717884063721, | |
| "kl": 0.020047412253916264, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0179, | |
| "num_tokens": 2125992.0, | |
| "reward": 0.8819445371627808, | |
| "reward_std": 0.2524084150791168, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 394.0, | |
| "completions/max_terminated_length": 394.0, | |
| "completions/mean_length": 211.33333333333334, | |
| "completions/mean_terminated_length": 230.54545454545453, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 147.0, | |
| "epoch": 0.234, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.19310350716114044, | |
| "kl": 0.019224281422793865, | |
| "learning_rate": 1e-06, | |
| "loss": 0.012, | |
| "num_tokens": 2137692.0, | |
| "reward": 0.585936427116394, | |
| "reward_std": 0.09784586727619171, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 205.0, | |
| "completions/max_terminated_length": 205.0, | |
| "completions/mean_length": 142.16666666666666, | |
| "completions/mean_terminated_length": 155.0909090909091, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 110.0, | |
| "epoch": 0.236, | |
| "format_failures": 0.0, | |
| "grad_norm": 2.085691213607788, | |
| "kl": 0.09273007325828075, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0139, | |
| "num_tokens": 2148816.0, | |
| "reward": 0.319444477558136, | |
| "reward_std": 0.2289450317621231, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 583.0, | |
| "completions/max_terminated_length": 583.0, | |
| "completions/mean_length": 317.0833333333333, | |
| "completions/mean_terminated_length": 345.90909090909093, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 140.0, | |
| "epoch": 0.238, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.37083595991134644, | |
| "kl": 0.0630851686000824, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0918, | |
| "num_tokens": 2168256.0, | |
| "reward": 0.37870368361473083, | |
| "reward_std": 0.2895275950431824, | |
| "step": 119 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 306.0, | |
| "completions/max_terminated_length": 306.0, | |
| "completions/mean_length": 126.66666666666667, | |
| "completions/mean_terminated_length": 138.1818181818182, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 58.0, | |
| "epoch": 0.24, | |
| "format_failures": 0.0, | |
| "grad_norm": 6.606923580169678, | |
| "kl": 3.8295647501945496, | |
| "learning_rate": 1e-06, | |
| "loss": 0.1365, | |
| "num_tokens": 2183124.0, | |
| "reward": 0.4027777910232544, | |
| "reward_std": 0.3723955750465393, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.5833333333333333, | |
| "completions/max_length": 77.0, | |
| "completions/max_terminated_length": 77.0, | |
| "completions/mean_length": 32.083333333333336, | |
| "completions/mean_terminated_length": 77.0, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 77.0, | |
| "epoch": 0.242, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.08047831058502197, | |
| "kl": 0.013985397294163704, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0003, | |
| "num_tokens": 2190396.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 595.0, | |
| "completions/max_terminated_length": 595.0, | |
| "completions/mean_length": 431.0833333333333, | |
| "completions/mean_terminated_length": 470.27272727272725, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 109.0, | |
| "epoch": 0.244, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.019394446164369583, | |
| "kl": 0.01961024198681116, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 2218320.0, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 454.0, | |
| "completions/max_terminated_length": 454.0, | |
| "completions/mean_length": 284.9166666666667, | |
| "completions/mean_terminated_length": 310.8181818181818, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 118.0, | |
| "epoch": 0.246, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.5184653997421265, | |
| "kl": 1.0404187738895416, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0335, | |
| "num_tokens": 2231256.0, | |
| "reward": 0.4014219641685486, | |
| "reward_std": 0.31073111295700073, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 141.0, | |
| "completions/max_terminated_length": 141.0, | |
| "completions/mean_length": 64.75, | |
| "completions/mean_terminated_length": 70.63636363636364, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 34.0, | |
| "epoch": 0.248, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.6326740980148315, | |
| "kl": 0.3745545968413353, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0517, | |
| "num_tokens": 2240424.0, | |
| "reward": 0.8037037253379822, | |
| "reward_std": 0.3365945816040039, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.6666666666666667, | |
| "completions/max_length": 117.0, | |
| "completions/max_terminated_length": 117.0, | |
| "completions/mean_length": 37.75, | |
| "completions/mean_terminated_length": 113.25, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 102.0, | |
| "epoch": 0.25, | |
| "format_failures": 0.0, | |
| "grad_norm": 10.052517890930176, | |
| "kl": 1.53599963337183, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0049, | |
| "num_tokens": 2249424.0, | |
| "reward": 0.9166666865348816, | |
| "reward_std": 0.28867512941360474, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 235.0, | |
| "completions/max_terminated_length": 235.0, | |
| "completions/mean_length": 199.5, | |
| "completions/mean_terminated_length": 217.63636363636363, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 160.0, | |
| "epoch": 0.252, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.1388990879058838, | |
| "kl": 0.24531831266358495, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "num_tokens": 2263584.0, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 142.0, | |
| "completions/max_terminated_length": 142.0, | |
| "completions/mean_length": 125.0, | |
| "completions/mean_terminated_length": 136.36363636363637, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 123.0, | |
| "epoch": 0.254, | |
| "format_failures": 0.0, | |
| "grad_norm": 2.392914056777954, | |
| "kl": 0.9988721050322056, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0025, | |
| "num_tokens": 2276520.0, | |
| "reward": 0.7291666865348816, | |
| "reward_std": 0.3608439266681671, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 199.0, | |
| "completions/max_terminated_length": 199.0, | |
| "completions/mean_length": 134.08333333333334, | |
| "completions/mean_terminated_length": 146.27272727272728, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 106.0, | |
| "epoch": 0.256, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.5191885828971863, | |
| "kl": 0.20999768376350403, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0146, | |
| "num_tokens": 2286408.0, | |
| "reward": 0.717815101146698, | |
| "reward_std": 0.14373189210891724, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 229.0, | |
| "completions/max_terminated_length": 229.0, | |
| "completions/mean_length": 137.75, | |
| "completions/mean_terminated_length": 150.27272727272728, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 98.0, | |
| "epoch": 0.258, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.204528570175171, | |
| "kl": 0.08800000417977571, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0511, | |
| "num_tokens": 2296044.0, | |
| "reward": 0.5675595998764038, | |
| "reward_std": 0.2289842963218689, | |
| "step": 129 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 198.0, | |
| "completions/max_terminated_length": 198.0, | |
| "completions/mean_length": 124.58333333333333, | |
| "completions/mean_terminated_length": 135.9090909090909, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 54.0, | |
| "epoch": 0.26, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.44312867522239685, | |
| "kl": 0.07202759943902493, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0475, | |
| "num_tokens": 2305644.0, | |
| "reward": 0.5101972222328186, | |
| "reward_std": 0.19489067792892456, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 328.0, | |
| "completions/max_terminated_length": 328.0, | |
| "completions/mean_length": 281.1666666666667, | |
| "completions/mean_terminated_length": 306.72727272727275, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 253.0, | |
| "epoch": 0.262, | |
| "format_failures": 1.0, | |
| "grad_norm": 1.5526983737945557, | |
| "kl": 0.06795010529458523, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0019, | |
| "num_tokens": 2319192.0, | |
| "reward": 0.75, | |
| "reward_std": 0.3217690885066986, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 183.0, | |
| "completions/max_terminated_length": 183.0, | |
| "completions/mean_length": 162.83333333333334, | |
| "completions/mean_terminated_length": 177.63636363636363, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 175.0, | |
| "epoch": 0.264, | |
| "format_failures": 0.0, | |
| "grad_norm": 2.740288257598877, | |
| "kl": 0.7462278339080513, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0045, | |
| "num_tokens": 2329488.0, | |
| "reward": 0.9791666865348816, | |
| "reward_std": 0.07216878235340118, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 532.0, | |
| "completions/max_terminated_length": 532.0, | |
| "completions/mean_length": 315.5, | |
| "completions/mean_terminated_length": 344.1818181818182, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 233.0, | |
| "epoch": 0.266, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.11069951951503754, | |
| "kl": 0.01982728624716401, | |
| "learning_rate": 1e-06, | |
| "loss": -0.034, | |
| "num_tokens": 2358276.0, | |
| "reward": 0.5852844715118408, | |
| "reward_std": 0.12080158293247223, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 269.0, | |
| "completions/max_terminated_length": 269.0, | |
| "completions/mean_length": 161.16666666666666, | |
| "completions/mean_terminated_length": 175.8181818181818, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 104.0, | |
| "epoch": 0.268, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.8276861906051636, | |
| "kl": 0.09472572058439255, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0149, | |
| "num_tokens": 2368980.0, | |
| "reward": 0.6518849730491638, | |
| "reward_std": 0.2886110842227936, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 325.0, | |
| "completions/max_terminated_length": 325.0, | |
| "completions/mean_length": 227.08333333333334, | |
| "completions/mean_terminated_length": 247.72727272727272, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 136.0, | |
| "epoch": 0.27, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.5550012588500977, | |
| "kl": 0.02074157353490591, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0841, | |
| "num_tokens": 2379828.0, | |
| "reward": 0.6243386268615723, | |
| "reward_std": 0.3905191719532013, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 232.0, | |
| "completions/max_terminated_length": 232.0, | |
| "completions/mean_length": 210.0, | |
| "completions/mean_terminated_length": 229.0909090909091, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 228.0, | |
| "epoch": 0.272, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.019722580909729, | |
| "kl": 0.13905800506472588, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0123, | |
| "num_tokens": 2394360.0, | |
| "reward": 0.949999988079071, | |
| "reward_std": 0.17320507764816284, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 348.0, | |
| "completions/max_terminated_length": 348.0, | |
| "completions/mean_length": 215.0, | |
| "completions/mean_terminated_length": 234.54545454545453, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 145.0, | |
| "epoch": 0.274, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.32402342557907104, | |
| "kl": 0.014864406548440456, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0012, | |
| "num_tokens": 2406096.0, | |
| "reward": 0.6149470806121826, | |
| "reward_std": 0.19829140603542328, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 323.0, | |
| "completions/max_terminated_length": 323.0, | |
| "completions/mean_length": 136.58333333333334, | |
| "completions/mean_terminated_length": 149.0, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 58.0, | |
| "epoch": 0.276, | |
| "format_failures": 0.0, | |
| "grad_norm": 1.005679965019226, | |
| "kl": 0.023909798823297024, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0608, | |
| "num_tokens": 2423568.0, | |
| "reward": 0.5231481790542603, | |
| "reward_std": 0.3425479829311371, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.08333333333333337, | |
| "completions/max_length": 241.0, | |
| "completions/max_terminated_length": 241.0, | |
| "completions/mean_length": 165.58333333333334, | |
| "completions/mean_terminated_length": 180.63636363636363, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 75.0, | |
| "epoch": 0.278, | |
| "format_failures": 0.0, | |
| "grad_norm": 3.9986395835876465, | |
| "kl": 2.975656658411026, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0003, | |
| "num_tokens": 2437320.0, | |
| "reward": 0.7277778387069702, | |
| "reward_std": 0.4172621965408325, | |
| "step": 139 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.33333333333333337, | |
| "completions/max_length": 55.0, | |
| "completions/max_terminated_length": 55.0, | |
| "completions/mean_length": 36.5, | |
| "completions/mean_terminated_length": 54.75, | |
| "completions/min_length": 0.0, | |
| "completions/min_terminated_length": 53.0, | |
| "epoch": 0.28, | |
| "format_failures": 0.0, | |
| "grad_norm": 0.04945458099246025, | |
| "kl": 0.008955058641731739, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0002, | |
| "num_tokens": 2449116.0, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "step": 140 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 1000, | |
| "num_input_tokens_seen": 2449116, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": false, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |