| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.2, |
| "eval_steps": 500, |
| "global_step": 50, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 237.0, |
| "completions/max_terminated_length": 237.0, |
| "completions/mean_length": 114.4375, |
| "completions/mean_terminated_length": 114.4375, |
| "completions/min_length": 68.0, |
| "completions/min_terminated_length": 68.0, |
| "entropy": 0.23012111708521843, |
| "epoch": 0.004, |
| "frac_reward_zero_std": 0.75, |
| "grad_norm": 0.32706609795270336, |
| "learning_rate": 1e-06, |
| "loss": 0.0, |
| "num_tokens": 21278.0, |
| "reward": 1.8497917652130127, |
| "reward_std": 0.09231126308441162, |
| "rewards/accuracy_reward_func/mean": 0.871666669845581, |
| "rewards/accuracy_reward_func/std": 0.21418261528015137, |
| "rewards/format_reward_func/mean": 0.9781249761581421, |
| "rewards/format_reward_func/std": 0.1237436830997467, |
| "step": 1, |
| "step_time": 23.142175153829157 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 175.0, |
| "completions/max_terminated_length": 175.0, |
| "completions/mean_length": 95.9375, |
| "completions/mean_terminated_length": 95.9375, |
| "completions/min_length": 68.0, |
| "completions/min_terminated_length": 68.0, |
| "entropy": 0.2443241998553276, |
| "epoch": 0.008, |
| "frac_reward_zero_std": 0.625, |
| "grad_norm": 0.33467634062811474, |
| "learning_rate": 9.8e-07, |
| "loss": 0.0, |
| "num_tokens": 43776.0, |
| "reward": 1.8129092454910278, |
| "reward_std": 0.02420501410961151, |
| "rewards/accuracy_reward_func/mean": 0.8129092454910278, |
| "rewards/accuracy_reward_func/std": 0.21289166808128357, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 2, |
| "step_time": 9.088656539097428 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 145.0, |
| "completions/max_terminated_length": 145.0, |
| "completions/mean_length": 97.03125, |
| "completions/mean_terminated_length": 97.03125, |
| "completions/min_length": 67.0, |
| "completions/min_terminated_length": 67.0, |
| "entropy": 0.19135062769055367, |
| "epoch": 0.012, |
| "frac_reward_zero_std": 0.625, |
| "grad_norm": 0.4170485616641671, |
| "learning_rate": 9.6e-07, |
| "loss": -0.0, |
| "num_tokens": 70409.0, |
| "reward": 1.906822919845581, |
| "reward_std": 0.0494791716337204, |
| "rewards/accuracy_reward_func/mean": 0.906822919845581, |
| "rewards/accuracy_reward_func/std": 0.13648581504821777, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 3, |
| "step_time": 8.136402582749724 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 137.0, |
| "completions/max_terminated_length": 137.0, |
| "completions/mean_length": 99.125, |
| "completions/mean_terminated_length": 99.125, |
| "completions/min_length": 76.0, |
| "completions/min_terminated_length": 76.0, |
| "entropy": 0.25429725274443626, |
| "epoch": 0.016, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 0.7524404289855315, |
| "learning_rate": 9.399999999999999e-07, |
| "loss": 0.0, |
| "num_tokens": 94161.0, |
| "reward": 1.7960565090179443, |
| "reward_std": 0.13303174078464508, |
| "rewards/accuracy_reward_func/mean": 0.8085565567016602, |
| "rewards/accuracy_reward_func/std": 0.24971628189086914, |
| "rewards/format_reward_func/mean": 0.987500011920929, |
| "rewards/format_reward_func/std": 0.0707106739282608, |
| "step": 4, |
| "step_time": 7.919396638870239 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 258.0, |
| "completions/max_terminated_length": 258.0, |
| "completions/mean_length": 127.75, |
| "completions/mean_terminated_length": 127.75, |
| "completions/min_length": 78.0, |
| "completions/min_terminated_length": 78.0, |
| "entropy": 0.24686714261770248, |
| "epoch": 0.02, |
| "frac_reward_zero_std": 0.375, |
| "grad_norm": 0.5703983537961373, |
| "learning_rate": 9.2e-07, |
| "loss": -0.0, |
| "num_tokens": 115581.0, |
| "reward": 1.9024033546447754, |
| "reward_std": 0.06382934749126434, |
| "rewards/accuracy_reward_func/mean": 0.9024032950401306, |
| "rewards/accuracy_reward_func/std": 0.14118432998657227, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 5, |
| "step_time": 11.992262025363743 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 145.0, |
| "completions/max_terminated_length": 145.0, |
| "completions/mean_length": 92.34375, |
| "completions/mean_terminated_length": 92.34375, |
| "completions/min_length": 63.0, |
| "completions/min_terminated_length": 63.0, |
| "entropy": 0.22791285440325737, |
| "epoch": 0.024, |
| "frac_reward_zero_std": 0.75, |
| "grad_norm": 0.2647928147370579, |
| "learning_rate": 9e-07, |
| "loss": 0.0, |
| "num_tokens": 140948.0, |
| "reward": 1.7881250381469727, |
| "reward_std": 0.016249999403953552, |
| "rewards/accuracy_reward_func/mean": 0.7881250381469727, |
| "rewards/accuracy_reward_func/std": 0.23106878995895386, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 6, |
| "step_time": 8.167283555492759 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 158.0, |
| "completions/max_terminated_length": 158.0, |
| "completions/mean_length": 101.59375, |
| "completions/mean_terminated_length": 101.59375, |
| "completions/min_length": 61.0, |
| "completions/min_terminated_length": 61.0, |
| "entropy": 0.2525057829916477, |
| "epoch": 0.028, |
| "frac_reward_zero_std": 0.625, |
| "grad_norm": 0.28702743314019674, |
| "learning_rate": 8.799999999999999e-07, |
| "loss": 0.0, |
| "num_tokens": 167411.0, |
| "reward": 1.9117188453674316, |
| "reward_std": 0.04828793182969093, |
| "rewards/accuracy_reward_func/mean": 0.9117187261581421, |
| "rewards/accuracy_reward_func/std": 0.12177487462759018, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 7, |
| "step_time": 8.586366776376963 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 198.0, |
| "completions/max_terminated_length": 198.0, |
| "completions/mean_length": 101.90625, |
| "completions/mean_terminated_length": 101.90625, |
| "completions/min_length": 72.0, |
| "completions/min_terminated_length": 72.0, |
| "entropy": 0.251990407705307, |
| "epoch": 0.032, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 0.6162980049635086, |
| "learning_rate": 8.599999999999999e-07, |
| "loss": -0.0, |
| "num_tokens": 192304.0, |
| "reward": 1.9317708015441895, |
| "reward_std": 0.0726683959364891, |
| "rewards/accuracy_reward_func/mean": 0.9317708611488342, |
| "rewards/accuracy_reward_func/std": 0.12373802810907364, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 8, |
| "step_time": 10.18786786403507 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 171.0, |
| "completions/max_terminated_length": 171.0, |
| "completions/mean_length": 89.53125, |
| "completions/mean_terminated_length": 89.53125, |
| "completions/min_length": 53.0, |
| "completions/min_terminated_length": 53.0, |
| "entropy": 0.2736722156405449, |
| "epoch": 0.036, |
| "frac_reward_zero_std": 0.625, |
| "grad_norm": 0.7002448094598603, |
| "learning_rate": 8.399999999999999e-07, |
| "loss": -0.0, |
| "num_tokens": 217177.0, |
| "reward": 1.8406250476837158, |
| "reward_std": 0.09107423573732376, |
| "rewards/accuracy_reward_func/mean": 0.840624988079071, |
| "rewards/accuracy_reward_func/std": 0.3231492042541504, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 9, |
| "step_time": 9.146976439282298 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 134.0, |
| "completions/max_terminated_length": 134.0, |
| "completions/mean_length": 88.40625, |
| "completions/mean_terminated_length": 88.40625, |
| "completions/min_length": 59.0, |
| "completions/min_terminated_length": 59.0, |
| "entropy": 0.2195826843380928, |
| "epoch": 0.04, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.4544179080346874, |
| "learning_rate": 8.199999999999999e-07, |
| "loss": -0.0, |
| "num_tokens": 241142.0, |
| "reward": 1.9390909671783447, |
| "reward_std": 0.014433760195970535, |
| "rewards/accuracy_reward_func/mean": 0.9390908479690552, |
| "rewards/accuracy_reward_func/std": 0.0804174616932869, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 10, |
| "step_time": 7.882300075143576 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 111.0, |
| "completions/max_terminated_length": 111.0, |
| "completions/mean_length": 87.5625, |
| "completions/mean_terminated_length": 87.5625, |
| "completions/min_length": 59.0, |
| "completions/min_terminated_length": 59.0, |
| "entropy": 0.19999410584568977, |
| "epoch": 0.044, |
| "frac_reward_zero_std": 0.75, |
| "grad_norm": 0.31705242310143567, |
| "learning_rate": 8e-07, |
| "loss": 0.0, |
| "num_tokens": 268372.0, |
| "reward": 1.909999966621399, |
| "reward_std": 0.0329379141330719, |
| "rewards/accuracy_reward_func/mean": 0.9099999666213989, |
| "rewards/accuracy_reward_func/std": 0.1774914711713791, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 11, |
| "step_time": 7.217764110304415 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 192.0, |
| "completions/max_terminated_length": 192.0, |
| "completions/mean_length": 103.25, |
| "completions/mean_terminated_length": 103.25, |
| "completions/min_length": 60.0, |
| "completions/min_terminated_length": 60.0, |
| "entropy": 0.2381710633635521, |
| "epoch": 0.048, |
| "frac_reward_zero_std": 0.625, |
| "grad_norm": 0.29876243636744126, |
| "learning_rate": 7.799999999999999e-07, |
| "loss": -0.0, |
| "num_tokens": 286592.0, |
| "reward": 1.8604166507720947, |
| "reward_std": 0.11249998956918716, |
| "rewards/accuracy_reward_func/mean": 0.8604166507720947, |
| "rewards/accuracy_reward_func/std": 0.31079012155532837, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 12, |
| "step_time": 9.55866174865514 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 168.0, |
| "completions/max_terminated_length": 168.0, |
| "completions/mean_length": 87.6875, |
| "completions/mean_terminated_length": 87.6875, |
| "completions/min_length": 53.0, |
| "completions/min_terminated_length": 53.0, |
| "entropy": 0.21462798118591309, |
| "epoch": 0.052, |
| "frac_reward_zero_std": 0.75, |
| "grad_norm": 0.7167465527886859, |
| "learning_rate": 7.599999999999999e-07, |
| "loss": -0.0, |
| "num_tokens": 306702.0, |
| "reward": 1.9366666078567505, |
| "reward_std": 0.02041665092110634, |
| "rewards/accuracy_reward_func/mean": 0.9366666674613953, |
| "rewards/accuracy_reward_func/std": 0.11999402940273285, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 13, |
| "step_time": 8.825894831679761 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 168.0, |
| "completions/max_terminated_length": 168.0, |
| "completions/mean_length": 119.375, |
| "completions/mean_terminated_length": 119.375, |
| "completions/min_length": 73.0, |
| "completions/min_terminated_length": 73.0, |
| "entropy": 0.2256241999566555, |
| "epoch": 0.056, |
| "frac_reward_zero_std": 0.375, |
| "grad_norm": 0.5531863595246166, |
| "learning_rate": 7.4e-07, |
| "loss": 0.0, |
| "num_tokens": 333758.0, |
| "reward": 1.8471875190734863, |
| "reward_std": 0.14163094758987427, |
| "rewards/accuracy_reward_func/mean": 0.8471875190734863, |
| "rewards/accuracy_reward_func/std": 0.26768702268600464, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 14, |
| "step_time": 8.871076120994985 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 151.0, |
| "completions/max_terminated_length": 151.0, |
| "completions/mean_length": 99.125, |
| "completions/mean_terminated_length": 99.125, |
| "completions/min_length": 64.0, |
| "completions/min_terminated_length": 64.0, |
| "entropy": 0.21050135791301727, |
| "epoch": 0.06, |
| "frac_reward_zero_std": 0.375, |
| "grad_norm": 0.5955394753926603, |
| "learning_rate": 7.2e-07, |
| "loss": -0.0, |
| "num_tokens": 363766.0, |
| "reward": 1.8220758438110352, |
| "reward_std": 0.18934205174446106, |
| "rewards/accuracy_reward_func/mean": 0.8533259034156799, |
| "rewards/accuracy_reward_func/std": 0.2272060364484787, |
| "rewards/format_reward_func/mean": 0.96875, |
| "rewards/format_reward_func/std": 0.1767766922712326, |
| "step": 15, |
| "step_time": 8.423650750890374 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 158.0, |
| "completions/max_terminated_length": 158.0, |
| "completions/mean_length": 94.875, |
| "completions/mean_terminated_length": 94.875, |
| "completions/min_length": 75.0, |
| "completions/min_terminated_length": 75.0, |
| "entropy": 0.23235702514648438, |
| "epoch": 0.064, |
| "frac_reward_zero_std": 0.75, |
| "grad_norm": 0.31149648184979223, |
| "learning_rate": 7e-07, |
| "loss": -0.0, |
| "num_tokens": 383022.0, |
| "reward": 1.8937499523162842, |
| "reward_std": 0.11737333238124847, |
| "rewards/accuracy_reward_func/mean": 0.9156249761581421, |
| "rewards/accuracy_reward_func/std": 0.19610625505447388, |
| "rewards/format_reward_func/mean": 0.9781249761581421, |
| "rewards/format_reward_func/std": 0.1237436830997467, |
| "step": 16, |
| "step_time": 8.814181880094111 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 216.0, |
| "completions/max_terminated_length": 216.0, |
| "completions/mean_length": 116.03125, |
| "completions/mean_terminated_length": 116.03125, |
| "completions/min_length": 77.0, |
| "completions/min_terminated_length": 77.0, |
| "entropy": 0.21332164481282234, |
| "epoch": 0.068, |
| "frac_reward_zero_std": 0.375, |
| "grad_norm": 0.6314309586182766, |
| "learning_rate": 6.800000000000001e-07, |
| "loss": -0.0, |
| "num_tokens": 409507.0, |
| "reward": 1.7729910612106323, |
| "reward_std": 0.08083245158195496, |
| "rewards/accuracy_reward_func/mean": 0.7729910612106323, |
| "rewards/accuracy_reward_func/std": 0.2013828307390213, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 17, |
| "step_time": 10.377578075043857 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 194.0, |
| "completions/max_terminated_length": 194.0, |
| "completions/mean_length": 124.0, |
| "completions/mean_terminated_length": 124.0, |
| "completions/min_length": 70.0, |
| "completions/min_terminated_length": 70.0, |
| "entropy": 0.3000790849328041, |
| "epoch": 0.072, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 0.43916067550283777, |
| "learning_rate": 6.6e-07, |
| "loss": -0.0, |
| "num_tokens": 429839.0, |
| "reward": 1.7817708253860474, |
| "reward_std": 0.0970831960439682, |
| "rewards/accuracy_reward_func/mean": 0.7817708253860474, |
| "rewards/accuracy_reward_func/std": 0.20934097468852997, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 18, |
| "step_time": 9.632790027186275 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 180.0, |
| "completions/max_terminated_length": 180.0, |
| "completions/mean_length": 113.8125, |
| "completions/mean_terminated_length": 113.8125, |
| "completions/min_length": 69.0, |
| "completions/min_terminated_length": 69.0, |
| "entropy": 0.24520108476281166, |
| "epoch": 0.076, |
| "frac_reward_zero_std": 0.625, |
| "grad_norm": 0.8224032747868548, |
| "learning_rate": 6.4e-07, |
| "loss": -0.0, |
| "num_tokens": 455401.0, |
| "reward": 1.8831250667572021, |
| "reward_std": 0.0729166716337204, |
| "rewards/accuracy_reward_func/mean": 0.8831250071525574, |
| "rewards/accuracy_reward_func/std": 0.1986341029405594, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 19, |
| "step_time": 9.248267728835344 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 161.0, |
| "completions/max_terminated_length": 161.0, |
| "completions/mean_length": 94.28125, |
| "completions/mean_terminated_length": 94.28125, |
| "completions/min_length": 68.0, |
| "completions/min_terminated_length": 68.0, |
| "entropy": 0.2287127859890461, |
| "epoch": 0.08, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 0.7435614457550869, |
| "learning_rate": 6.2e-07, |
| "loss": 0.0, |
| "num_tokens": 482190.0, |
| "reward": 1.8910417556762695, |
| "reward_std": 0.08611349761486053, |
| "rewards/accuracy_reward_func/mean": 0.8910416960716248, |
| "rewards/accuracy_reward_func/std": 0.18090461194515228, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 20, |
| "step_time": 8.669513036496937 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 137.0, |
| "completions/max_terminated_length": 137.0, |
| "completions/mean_length": 98.0, |
| "completions/mean_terminated_length": 98.0, |
| "completions/min_length": 71.0, |
| "completions/min_terminated_length": 71.0, |
| "entropy": 0.23622526600956917, |
| "epoch": 0.084, |
| "frac_reward_zero_std": 0.75, |
| "grad_norm": 0.3037737034258383, |
| "learning_rate": 6e-07, |
| "loss": 0.0, |
| "num_tokens": 504026.0, |
| "reward": 1.7918750047683716, |
| "reward_std": 0.10315428674221039, |
| "rewards/accuracy_reward_func/mean": 0.7918750047683716, |
| "rewards/accuracy_reward_func/std": 0.3606663942337036, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 21, |
| "step_time": 8.023445818573236 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 133.0, |
| "completions/max_terminated_length": 133.0, |
| "completions/mean_length": 94.4375, |
| "completions/mean_terminated_length": 94.4375, |
| "completions/min_length": 67.0, |
| "completions/min_terminated_length": 67.0, |
| "entropy": 0.21428008005023003, |
| "epoch": 0.088, |
| "frac_reward_zero_std": 0.625, |
| "grad_norm": 0.29203554195255926, |
| "learning_rate": 5.8e-07, |
| "loss": 0.0, |
| "num_tokens": 530080.0, |
| "reward": 1.9406249523162842, |
| "reward_std": 0.045683760195970535, |
| "rewards/accuracy_reward_func/mean": 0.940625011920929, |
| "rewards/accuracy_reward_func/std": 0.10506334155797958, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 22, |
| "step_time": 7.778694893233478 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 232.0, |
| "completions/max_terminated_length": 232.0, |
| "completions/mean_length": 110.40625, |
| "completions/mean_terminated_length": 110.40625, |
| "completions/min_length": 68.0, |
| "completions/min_terminated_length": 68.0, |
| "entropy": 0.23138786852359772, |
| "epoch": 0.092, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.19686742995266426, |
| "learning_rate": 5.6e-07, |
| "loss": -0.0, |
| "num_tokens": 558297.0, |
| "reward": 1.8413751125335693, |
| "reward_std": 0.0037499964237213135, |
| "rewards/accuracy_reward_func/mean": 0.8413749933242798, |
| "rewards/accuracy_reward_func/std": 0.21956580877304077, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 23, |
| "step_time": 10.919259454123676 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 148.0, |
| "completions/max_terminated_length": 148.0, |
| "completions/mean_length": 102.59375, |
| "completions/mean_terminated_length": 102.59375, |
| "completions/min_length": 66.0, |
| "completions/min_terminated_length": 66.0, |
| "entropy": 0.18613235652446747, |
| "epoch": 0.096, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.20640152621893892, |
| "learning_rate": 5.4e-07, |
| "loss": -0.0, |
| "num_tokens": 584700.0, |
| "reward": 1.8937499523162842, |
| "reward_std": 0.020833328366279602, |
| "rewards/accuracy_reward_func/mean": 0.893750011920929, |
| "rewards/accuracy_reward_func/std": 0.1515599936246872, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 24, |
| "step_time": 8.58028247859329 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 125.0, |
| "completions/max_terminated_length": 125.0, |
| "completions/mean_length": 97.625, |
| "completions/mean_terminated_length": 97.625, |
| "completions/min_length": 65.0, |
| "completions/min_terminated_length": 65.0, |
| "entropy": 0.1680564060807228, |
| "epoch": 0.1, |
| "frac_reward_zero_std": 0.75, |
| "grad_norm": 0.22607092328648917, |
| "learning_rate": 5.2e-07, |
| "loss": -0.0, |
| "num_tokens": 605736.0, |
| "reward": 1.9614583253860474, |
| "reward_std": 0.04327813535928726, |
| "rewards/accuracy_reward_func/mean": 0.9614583253860474, |
| "rewards/accuracy_reward_func/std": 0.09804884344339371, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 25, |
| "step_time": 7.494132779538631 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 218.0, |
| "completions/max_terminated_length": 218.0, |
| "completions/mean_length": 110.84375, |
| "completions/mean_terminated_length": 110.84375, |
| "completions/min_length": 75.0, |
| "completions/min_terminated_length": 75.0, |
| "entropy": 0.2523631304502487, |
| "epoch": 0.104, |
| "frac_reward_zero_std": 0.75, |
| "grad_norm": 0.6963240544036335, |
| "learning_rate": 5e-07, |
| "loss": -0.0, |
| "num_tokens": 629803.0, |
| "reward": 1.722395896911621, |
| "reward_std": 0.03437499701976776, |
| "rewards/accuracy_reward_func/mean": 0.7223958373069763, |
| "rewards/accuracy_reward_func/std": 0.2913203537464142, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 26, |
| "step_time": 10.387551098130643 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 206.0, |
| "completions/max_terminated_length": 206.0, |
| "completions/mean_length": 101.65625, |
| "completions/mean_terminated_length": 101.65625, |
| "completions/min_length": 66.0, |
| "completions/min_terminated_length": 66.0, |
| "entropy": 0.28546470403671265, |
| "epoch": 0.108, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 0.8320895685645606, |
| "learning_rate": 4.8e-07, |
| "loss": -0.0, |
| "num_tokens": 655600.0, |
| "reward": 1.6748958826065063, |
| "reward_std": 0.21159148216247559, |
| "rewards/accuracy_reward_func/mean": 0.7405208349227905, |
| "rewards/accuracy_reward_func/std": 0.3287121653556824, |
| "rewards/format_reward_func/mean": 0.934374988079071, |
| "rewards/format_reward_func/std": 0.20730119943618774, |
| "step": 27, |
| "step_time": 9.990737781859934 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 321.0, |
| "completions/max_terminated_length": 321.0, |
| "completions/mean_length": 121.78125, |
| "completions/mean_terminated_length": 121.78125, |
| "completions/min_length": 70.0, |
| "completions/min_terminated_length": 70.0, |
| "entropy": 0.20365699753165245, |
| "epoch": 0.112, |
| "frac_reward_zero_std": 0.625, |
| "grad_norm": 0.3040693070538053, |
| "learning_rate": 4.6e-07, |
| "loss": -0.0, |
| "num_tokens": 681285.0, |
| "reward": 1.8820312023162842, |
| "reward_std": 0.04736516997218132, |
| "rewards/accuracy_reward_func/mean": 0.882031261920929, |
| "rewards/accuracy_reward_func/std": 0.16079869866371155, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 28, |
| "step_time": 13.582923103123903 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 175.0, |
| "completions/max_terminated_length": 175.0, |
| "completions/mean_length": 114.3125, |
| "completions/mean_terminated_length": 114.3125, |
| "completions/min_length": 65.0, |
| "completions/min_terminated_length": 65.0, |
| "entropy": 0.22691119089722633, |
| "epoch": 0.116, |
| "frac_reward_zero_std": 0.625, |
| "grad_norm": 0.29754584523046024, |
| "learning_rate": 4.3999999999999997e-07, |
| "loss": 0.0, |
| "num_tokens": 700215.0, |
| "reward": 1.941562533378601, |
| "reward_std": 0.03998880088329315, |
| "rewards/accuracy_reward_func/mean": 0.9415625333786011, |
| "rewards/accuracy_reward_func/std": 0.11359171569347382, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 29, |
| "step_time": 9.000959642231464 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 144.0, |
| "completions/max_terminated_length": 144.0, |
| "completions/mean_length": 98.8125, |
| "completions/mean_terminated_length": 98.8125, |
| "completions/min_length": 76.0, |
| "completions/min_terminated_length": 76.0, |
| "entropy": 0.19718682020902634, |
| "epoch": 0.12, |
| "frac_reward_zero_std": 0.75, |
| "grad_norm": 0.28356832567704315, |
| "learning_rate": 4.1999999999999995e-07, |
| "loss": 0.0, |
| "num_tokens": 719033.0, |
| "reward": 1.8567261695861816, |
| "reward_std": 0.03630475699901581, |
| "rewards/accuracy_reward_func/mean": 0.8567261695861816, |
| "rewards/accuracy_reward_func/std": 0.244332417845726, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 30, |
| "step_time": 8.117130983620882 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 300.0, |
| "completions/max_terminated_length": 300.0, |
| "completions/mean_length": 107.0625, |
| "completions/mean_terminated_length": 107.0625, |
| "completions/min_length": 67.0, |
| "completions/min_terminated_length": 67.0, |
| "entropy": 0.23164699599146843, |
| "epoch": 0.124, |
| "frac_reward_zero_std": 0.625, |
| "grad_norm": 0.3147460209715464, |
| "learning_rate": 4e-07, |
| "loss": 0.0, |
| "num_tokens": 742811.0, |
| "reward": 1.8263542652130127, |
| "reward_std": 0.07890324294567108, |
| "rewards/accuracy_reward_func/mean": 0.8263541460037231, |
| "rewards/accuracy_reward_func/std": 0.2404511272907257, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 31, |
| "step_time": 13.022676510736346 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 145.0, |
| "completions/max_terminated_length": 145.0, |
| "completions/mean_length": 104.28125, |
| "completions/mean_terminated_length": 104.28125, |
| "completions/min_length": 70.0, |
| "completions/min_terminated_length": 70.0, |
| "entropy": 0.2124277576804161, |
| "epoch": 0.128, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.14656459241245287, |
| "learning_rate": 3.7999999999999996e-07, |
| "loss": 0.0, |
| "num_tokens": 767276.0, |
| "reward": 1.933750033378601, |
| "reward_std": 0.004330122843384743, |
| "rewards/accuracy_reward_func/mean": 0.9337500333786011, |
| "rewards/accuracy_reward_func/std": 0.16721147298812866, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 32, |
| "step_time": 8.787895078770816 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 163.0, |
| "completions/max_terminated_length": 163.0, |
| "completions/mean_length": 99.6875, |
| "completions/mean_terminated_length": 99.6875, |
| "completions/min_length": 66.0, |
| "completions/min_terminated_length": 66.0, |
| "entropy": 0.18238281086087227, |
| "epoch": 0.132, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.17864514961842184, |
| "learning_rate": 3.6e-07, |
| "loss": 0.0, |
| "num_tokens": 795462.0, |
| "reward": 1.957291603088379, |
| "reward_std": 0.0024056239053606987, |
| "rewards/accuracy_reward_func/mean": 0.9572916626930237, |
| "rewards/accuracy_reward_func/std": 0.0789080411195755, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 33, |
| "step_time": 8.68917733244598 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 162.0, |
| "completions/max_terminated_length": 162.0, |
| "completions/mean_length": 105.21875, |
| "completions/mean_terminated_length": 105.21875, |
| "completions/min_length": 75.0, |
| "completions/min_terminated_length": 75.0, |
| "entropy": 0.20627178624272346, |
| "epoch": 0.136, |
| "frac_reward_zero_std": 0.625, |
| "grad_norm": 0.3530815395243793, |
| "learning_rate": 3.4000000000000003e-07, |
| "loss": 0.0, |
| "num_tokens": 820193.0, |
| "reward": 1.7807291746139526, |
| "reward_std": 0.06852563470602036, |
| "rewards/accuracy_reward_func/mean": 0.7807291746139526, |
| "rewards/accuracy_reward_func/std": 0.34384265542030334, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 34, |
| "step_time": 8.745650510303676 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 165.0, |
| "completions/max_terminated_length": 165.0, |
| "completions/mean_length": 112.15625, |
| "completions/mean_terminated_length": 112.15625, |
| "completions/min_length": 81.0, |
| "completions/min_terminated_length": 81.0, |
| "entropy": 0.260306891053915, |
| "epoch": 0.14, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 0.5856316529561636, |
| "learning_rate": 3.2e-07, |
| "loss": -0.0, |
| "num_tokens": 844158.0, |
| "reward": 1.7372127771377563, |
| "reward_std": 0.0817936509847641, |
| "rewards/accuracy_reward_func/mean": 0.7372127771377563, |
| "rewards/accuracy_reward_func/std": 0.2763761281967163, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 35, |
| "step_time": 8.764568363316357 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 246.0, |
| "completions/max_terminated_length": 246.0, |
| "completions/mean_length": 120.65625, |
| "completions/mean_terminated_length": 120.65625, |
| "completions/min_length": 73.0, |
| "completions/min_terminated_length": 73.0, |
| "entropy": 0.2644369825720787, |
| "epoch": 0.144, |
| "frac_reward_zero_std": 0.375, |
| "grad_norm": 0.3867881480590169, |
| "learning_rate": 3e-07, |
| "loss": -0.0, |
| "num_tokens": 870035.0, |
| "reward": 1.723668098449707, |
| "reward_std": 0.1543687880039215, |
| "rewards/accuracy_reward_func/mean": 0.7455431222915649, |
| "rewards/accuracy_reward_func/std": 0.3101375699043274, |
| "rewards/format_reward_func/mean": 0.9781249761581421, |
| "rewards/format_reward_func/std": 0.1237436830997467, |
| "step": 36, |
| "step_time": 11.264538847841322 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 162.0, |
| "completions/max_terminated_length": 162.0, |
| "completions/mean_length": 100.28125, |
| "completions/mean_terminated_length": 100.28125, |
| "completions/min_length": 72.0, |
| "completions/min_terminated_length": 72.0, |
| "entropy": 0.23801737278699875, |
| "epoch": 0.148, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 0.4666251511091792, |
| "learning_rate": 2.8e-07, |
| "loss": -0.0, |
| "num_tokens": 893872.0, |
| "reward": 1.8420684337615967, |
| "reward_std": 0.1202840656042099, |
| "rewards/accuracy_reward_func/mean": 0.8639434576034546, |
| "rewards/accuracy_reward_func/std": 0.228593647480011, |
| "rewards/format_reward_func/mean": 0.9781249761581421, |
| "rewards/format_reward_func/std": 0.1237436830997467, |
| "step": 37, |
| "step_time": 8.666291879490018 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 198.0, |
| "completions/max_terminated_length": 198.0, |
| "completions/mean_length": 110.9375, |
| "completions/mean_terminated_length": 110.9375, |
| "completions/min_length": 73.0, |
| "completions/min_terminated_length": 73.0, |
| "entropy": 0.23708894476294518, |
| "epoch": 0.152, |
| "frac_reward_zero_std": 0.625, |
| "grad_norm": 0.3003587942152678, |
| "learning_rate": 2.6e-07, |
| "loss": 0.0, |
| "num_tokens": 919606.0, |
| "reward": 1.9385044574737549, |
| "reward_std": 0.02471514418721199, |
| "rewards/accuracy_reward_func/mean": 0.9385044574737549, |
| "rewards/accuracy_reward_func/std": 0.0629124566912651, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 38, |
| "step_time": 9.764362094923854 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 146.0, |
| "completions/max_terminated_length": 146.0, |
| "completions/mean_length": 96.75, |
| "completions/mean_terminated_length": 96.75, |
| "completions/min_length": 68.0, |
| "completions/min_terminated_length": 68.0, |
| "entropy": 0.23468047007918358, |
| "epoch": 0.156, |
| "frac_reward_zero_std": 0.625, |
| "grad_norm": 0.37048827303687887, |
| "learning_rate": 2.4e-07, |
| "loss": -0.0, |
| "num_tokens": 945630.0, |
| "reward": 1.731874942779541, |
| "reward_std": 0.1671428233385086, |
| "rewards/accuracy_reward_func/mean": 0.7756249904632568, |
| "rewards/accuracy_reward_func/std": 0.3532584309577942, |
| "rewards/format_reward_func/mean": 0.956250011920929, |
| "rewards/format_reward_func/std": 0.1721542775630951, |
| "step": 39, |
| "step_time": 9.183467078953981 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 173.0, |
| "completions/max_terminated_length": 173.0, |
| "completions/mean_length": 109.0, |
| "completions/mean_terminated_length": 109.0, |
| "completions/min_length": 75.0, |
| "completions/min_terminated_length": 75.0, |
| "entropy": 0.23336144164204597, |
| "epoch": 0.16, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.287196945879942, |
| "learning_rate": 2.1999999999999998e-07, |
| "loss": 0.0, |
| "num_tokens": 967418.0, |
| "reward": 1.9270832538604736, |
| "reward_std": 0.012028136290609837, |
| "rewards/accuracy_reward_func/mean": 0.9270833134651184, |
| "rewards/accuracy_reward_func/std": 0.1690024584531784, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 40, |
| "step_time": 9.006164254620671 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 143.0, |
| "completions/max_terminated_length": 143.0, |
| "completions/mean_length": 99.28125, |
| "completions/mean_terminated_length": 99.28125, |
| "completions/min_length": 70.0, |
| "completions/min_terminated_length": 70.0, |
| "entropy": 0.22068660333752632, |
| "epoch": 0.164, |
| "frac_reward_zero_std": 0.75, |
| "grad_norm": 0.26699909188358745, |
| "learning_rate": 2e-07, |
| "loss": 0.0, |
| "num_tokens": 994215.0, |
| "reward": 1.9018750190734863, |
| "reward_std": 0.06372595578432083, |
| "rewards/accuracy_reward_func/mean": 0.9018750190734863, |
| "rewards/accuracy_reward_func/std": 0.15860149264335632, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 41, |
| "step_time": 8.13273252826184 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 149.0, |
| "completions/max_terminated_length": 149.0, |
| "completions/mean_length": 99.625, |
| "completions/mean_terminated_length": 99.625, |
| "completions/min_length": 70.0, |
| "completions/min_terminated_length": 70.0, |
| "entropy": 0.23406217247247696, |
| "epoch": 0.168, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 0.3533172869156224, |
| "learning_rate": 1.8e-07, |
| "loss": -0.0, |
| "num_tokens": 1019951.0, |
| "reward": 1.9318749904632568, |
| "reward_std": 0.04643829166889191, |
| "rewards/accuracy_reward_func/mean": 0.9318749904632568, |
| "rewards/accuracy_reward_func/std": 0.10333744436502457, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 42, |
| "step_time": 8.258449734188616 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 118.0, |
| "completions/max_terminated_length": 118.0, |
| "completions/mean_length": 92.75, |
| "completions/mean_terminated_length": 92.75, |
| "completions/min_length": 75.0, |
| "completions/min_terminated_length": 75.0, |
| "entropy": 0.22229528427124023, |
| "epoch": 0.172, |
| "frac_reward_zero_std": 0.75, |
| "grad_norm": 0.25935443501058275, |
| "learning_rate": 1.6e-07, |
| "loss": 0.0, |
| "num_tokens": 1047835.0, |
| "reward": 1.9731919765472412, |
| "reward_std": 0.022991076111793518, |
| "rewards/accuracy_reward_func/mean": 0.9731919765472412, |
| "rewards/accuracy_reward_func/std": 0.05601184815168381, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 43, |
| "step_time": 7.3312763730064034 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 132.0, |
| "completions/max_terminated_length": 132.0, |
| "completions/mean_length": 98.15625, |
| "completions/mean_terminated_length": 98.15625, |
| "completions/min_length": 70.0, |
| "completions/min_terminated_length": 70.0, |
| "entropy": 0.2684129625558853, |
| "epoch": 0.176, |
| "frac_reward_zero_std": 0.75, |
| "grad_norm": 0.5054404086636916, |
| "learning_rate": 1.4e-07, |
| "loss": -0.0, |
| "num_tokens": 1070652.0, |
| "reward": 1.816145896911621, |
| "reward_std": 0.046046242117881775, |
| "rewards/accuracy_reward_func/mean": 0.8161457777023315, |
| "rewards/accuracy_reward_func/std": 0.21198345720767975, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 44, |
| "step_time": 8.090661917813122 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 189.0, |
| "completions/max_terminated_length": 189.0, |
| "completions/mean_length": 119.96875, |
| "completions/mean_terminated_length": 119.96875, |
| "completions/min_length": 63.0, |
| "completions/min_terminated_length": 63.0, |
| "entropy": 0.20931534841656685, |
| "epoch": 0.18, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 0.4438102774930686, |
| "learning_rate": 1.2e-07, |
| "loss": 0.0, |
| "num_tokens": 1094159.0, |
| "reward": 1.8079866170883179, |
| "reward_std": 0.07453451305627823, |
| "rewards/accuracy_reward_func/mean": 0.8079866170883179, |
| "rewards/accuracy_reward_func/std": 0.1761476993560791, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 45, |
| "step_time": 9.527460671961308 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 215.0, |
| "completions/max_terminated_length": 215.0, |
| "completions/mean_length": 107.15625, |
| "completions/mean_terminated_length": 107.15625, |
| "completions/min_length": 75.0, |
| "completions/min_terminated_length": 75.0, |
| "entropy": 0.25717881694436073, |
| "epoch": 0.184, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.21670389917367197, |
| "learning_rate": 1e-07, |
| "loss": 0.0, |
| "num_tokens": 1122608.0, |
| "reward": 1.808333396911621, |
| "reward_std": 0.03125, |
| "rewards/accuracy_reward_func/mean": 0.8083333373069763, |
| "rewards/accuracy_reward_func/std": 0.21151866018772125, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 46, |
| "step_time": 10.36410805862397 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 126.0, |
| "completions/max_terminated_length": 126.0, |
| "completions/mean_length": 90.375, |
| "completions/mean_terminated_length": 90.375, |
| "completions/min_length": 62.0, |
| "completions/min_terminated_length": 62.0, |
| "entropy": 0.21086286380887032, |
| "epoch": 0.188, |
| "frac_reward_zero_std": 0.625, |
| "grad_norm": 0.4243741948522665, |
| "learning_rate": 8e-08, |
| "loss": 0.0, |
| "num_tokens": 1142524.0, |
| "reward": 1.8813542127609253, |
| "reward_std": 0.03895833343267441, |
| "rewards/accuracy_reward_func/mean": 0.8813541531562805, |
| "rewards/accuracy_reward_func/std": 0.13226144015789032, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 47, |
| "step_time": 8.480774418450892 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 151.0, |
| "completions/max_terminated_length": 151.0, |
| "completions/mean_length": 98.96875, |
| "completions/mean_terminated_length": 98.96875, |
| "completions/min_length": 64.0, |
| "completions/min_terminated_length": 64.0, |
| "entropy": 0.25533241406083107, |
| "epoch": 0.192, |
| "frac_reward_zero_std": 0.625, |
| "grad_norm": 0.363734391468509, |
| "learning_rate": 6e-08, |
| "loss": 0.0, |
| "num_tokens": 1167279.0, |
| "reward": 1.8920758962631226, |
| "reward_std": 0.043149448931217194, |
| "rewards/accuracy_reward_func/mean": 0.8920758962631226, |
| "rewards/accuracy_reward_func/std": 0.18521229922771454, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 48, |
| "step_time": 8.397036101669073 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 158.0, |
| "completions/max_terminated_length": 158.0, |
| "completions/mean_length": 93.71875, |
| "completions/mean_terminated_length": 93.71875, |
| "completions/min_length": 55.0, |
| "completions/min_terminated_length": 55.0, |
| "entropy": 0.2753983736038208, |
| "epoch": 0.196, |
| "frac_reward_zero_std": 0.625, |
| "grad_norm": 0.37001348543936974, |
| "learning_rate": 4e-08, |
| "loss": 0.0, |
| "num_tokens": 1189970.0, |
| "reward": 1.7348958253860474, |
| "reward_std": 0.0466608926653862, |
| "rewards/accuracy_reward_func/mean": 0.7348958253860474, |
| "rewards/accuracy_reward_func/std": 0.23915956914424896, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 49, |
| "step_time": 9.01807147078216 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 685.0, |
| "completions/max_terminated_length": 685.0, |
| "completions/mean_length": 130.3125, |
| "completions/mean_terminated_length": 130.3125, |
| "completions/min_length": 65.0, |
| "completions/min_terminated_length": 65.0, |
| "entropy": 0.271317683160305, |
| "epoch": 0.2, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 0.3729527358522691, |
| "learning_rate": 2e-08, |
| "loss": 0.0, |
| "num_tokens": 1214600.0, |
| "reward": 1.7553727626800537, |
| "reward_std": 0.13293951749801636, |
| "rewards/accuracy_reward_func/mean": 0.7553727626800537, |
| "rewards/accuracy_reward_func/std": 0.3456610441207886, |
| "rewards/format_reward_func/mean": 1.0, |
| "rewards/format_reward_func/std": 0.0, |
| "step": 50, |
| "step_time": 25.12439160142094 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 50, |
| "num_input_tokens_seen": 1214600, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|