| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.01, |
| "eval_steps": 500, |
| "global_step": 150, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 313.75, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 346.0, |
| "completions/max_terminated_length": 346.0, |
| "completions/mean_length": 313.75, |
| "completions/mean_terminated_length": 313.75, |
| "completions/min_length": 302.0, |
| "completions/min_terminated_length": 302.0, |
| "epoch": 6.666666666666667e-05, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6239553689956665, |
| "kl": 0.28753748536109924, |
| "learning_rate": 0.0, |
| "loss": 0.0003, |
| "num_tokens": 4438.0, |
| "reward": 1.6418367624282837, |
| "reward_std": 0.02908739261329174, |
| "rewards/correctness_reward_func/mean": 1.1418367624282837, |
| "rewards/correctness_reward_func/std": 0.029087385162711143, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 1 |
| }, |
| { |
| "completion_length": 125.875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 199.0, |
| "completions/max_terminated_length": 199.0, |
| "completions/mean_length": 125.875, |
| "completions/mean_terminated_length": 125.875, |
| "completions/min_length": 107.0, |
| "completions/min_terminated_length": 107.0, |
| "epoch": 0.00013333333333333334, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0035445222165435553, |
| "kl": 0.6253855228424072, |
| "learning_rate": 3.3333333333333335e-07, |
| "loss": 0.0006, |
| "num_tokens": 6677.0, |
| "reward": 1.774999976158142, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func/mean": 1.274999976158142, |
| "rewards/correctness_reward_func/std": 0.0, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 2 |
| }, |
| { |
| "completion_length": 240.375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 257.0, |
| "completions/max_terminated_length": 257.0, |
| "completions/mean_length": 240.375, |
| "completions/mean_terminated_length": 240.375, |
| "completions/min_length": 232.0, |
| "completions/min_terminated_length": 232.0, |
| "epoch": 0.0002, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8441446423530579, |
| "kl": 0.42504680156707764, |
| "learning_rate": 6.666666666666667e-07, |
| "loss": 0.0004, |
| "num_tokens": 10184.0, |
| "reward": 1.5458333492279053, |
| "reward_std": 0.06408699601888657, |
| "rewards/correctness_reward_func/mean": 1.0458333492279053, |
| "rewards/correctness_reward_func/std": 0.06408701092004776, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 3 |
| }, |
| { |
| "completion_length": 111.875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 132.0, |
| "completions/max_terminated_length": 132.0, |
| "completions/mean_length": 111.875, |
| "completions/mean_terminated_length": 111.875, |
| "completions/min_length": 104.0, |
| "completions/min_terminated_length": 104.0, |
| "epoch": 0.0002666666666666667, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.4565684795379639, |
| "kl": 0.5555254817008972, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 0.0006, |
| "num_tokens": 12303.0, |
| "reward": 1.6624999046325684, |
| "reward_std": 0.0981980711221695, |
| "rewards/correctness_reward_func/mean": 1.1624999046325684, |
| "rewards/correctness_reward_func/std": 0.0981980711221695, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 4 |
| }, |
| { |
| "completion_length": 315.5, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 347.0, |
| "completions/max_terminated_length": 347.0, |
| "completions/mean_length": 315.5, |
| "completions/mean_terminated_length": 315.5, |
| "completions/min_length": 302.0, |
| "completions/min_terminated_length": 302.0, |
| "epoch": 0.0003333333333333333, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6569257974624634, |
| "kl": 0.38966104388237, |
| "learning_rate": 1.3333333333333334e-06, |
| "loss": 0.0004, |
| "num_tokens": 16731.0, |
| "reward": 1.586734652519226, |
| "reward_std": 0.027575301006436348, |
| "rewards/correctness_reward_func/mean": 1.086734652519226, |
| "rewards/correctness_reward_func/std": 0.02757529728114605, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 5 |
| }, |
| { |
| "completion_length": 387.5, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 396.0, |
| "completions/max_terminated_length": 396.0, |
| "completions/mean_length": 387.5, |
| "completions/mean_terminated_length": 387.5, |
| "completions/min_length": 379.0, |
| "completions/min_terminated_length": 379.0, |
| "epoch": 0.0004, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7828848958015442, |
| "kl": 0.33463814854621887, |
| "learning_rate": 1.6666666666666667e-06, |
| "loss": 0.0003, |
| "num_tokens": 21935.0, |
| "reward": 1.357812523841858, |
| "reward_std": 0.057355206459760666, |
| "rewards/correctness_reward_func/mean": 0.8578125238418579, |
| "rewards/correctness_reward_func/std": 0.05735520273447037, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 6 |
| }, |
| { |
| "completion_length": 332.75, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 377.0, |
| "completions/max_terminated_length": 377.0, |
| "completions/mean_length": 332.75, |
| "completions/mean_terminated_length": 332.75, |
| "completions/min_length": 300.0, |
| "completions/min_terminated_length": 300.0, |
| "epoch": 0.00046666666666666666, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7264121174812317, |
| "kl": 0.368626207113266, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 0.0004, |
| "num_tokens": 26381.0, |
| "reward": 1.3227040767669678, |
| "reward_std": 0.09919165819883347, |
| "rewards/correctness_reward_func/mean": 0.8227040767669678, |
| "rewards/correctness_reward_func/std": 0.09919163584709167, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 7 |
| }, |
| { |
| "completion_length": 506.125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 537.0, |
| "completions/max_terminated_length": 537.0, |
| "completions/mean_length": 506.125, |
| "completions/mean_terminated_length": 506.125, |
| "completions/min_length": 496.0, |
| "completions/min_terminated_length": 496.0, |
| "epoch": 0.0005333333333333334, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5257914662361145, |
| "kl": 0.2594059407711029, |
| "learning_rate": 2.3333333333333336e-06, |
| "loss": 0.0003, |
| "num_tokens": 32886.0, |
| "reward": 1.4370369911193848, |
| "reward_std": 0.02962968312203884, |
| "rewards/correctness_reward_func/mean": 0.9370369911193848, |
| "rewards/correctness_reward_func/std": 0.029629632830619812, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 8 |
| }, |
| { |
| "completion_length": 515.25, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 523.0, |
| "completions/max_terminated_length": 523.0, |
| "completions/mean_length": 515.25, |
| "completions/mean_terminated_length": 515.25, |
| "completions/min_length": 507.0, |
| "completions/min_terminated_length": 507.0, |
| "epoch": 0.0006, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.42451590299606323, |
| "kl": 0.31523531675338745, |
| "learning_rate": 2.666666666666667e-06, |
| "loss": 0.0003, |
| "num_tokens": 39448.0, |
| "reward": 1.3203704357147217, |
| "reward_std": 0.04222872108221054, |
| "rewards/correctness_reward_func/mean": 0.8203703761100769, |
| "rewards/correctness_reward_func/std": 0.04222871735692024, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 9 |
| }, |
| { |
| "completion_length": 518.75, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 562.0, |
| "completions/max_terminated_length": 562.0, |
| "completions/mean_length": 518.75, |
| "completions/mean_terminated_length": 518.75, |
| "completions/min_length": 503.0, |
| "completions/min_terminated_length": 503.0, |
| "epoch": 0.0006666666666666666, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4859318137168884, |
| "kl": 0.3406108617782593, |
| "learning_rate": 3e-06, |
| "loss": 0.0003, |
| "num_tokens": 45990.0, |
| "reward": 1.3275463581085205, |
| "reward_std": 0.08047376573085785, |
| "rewards/correctness_reward_func/mean": 0.8275462985038757, |
| "rewards/correctness_reward_func/std": 0.08047378063201904, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 10 |
| }, |
| { |
| "completion_length": 115.25, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 121.0, |
| "completions/max_terminated_length": 121.0, |
| "completions/mean_length": 115.25, |
| "completions/mean_terminated_length": 115.25, |
| "completions/min_length": 109.0, |
| "completions/min_terminated_length": 109.0, |
| "epoch": 0.0007333333333333333, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.4112221002578735, |
| "kl": 0.6370461583137512, |
| "learning_rate": 3.3333333333333333e-06, |
| "loss": 0.0006, |
| "num_tokens": 48208.0, |
| "reward": 1.7843749523162842, |
| "reward_std": 0.10933034867048264, |
| "rewards/correctness_reward_func/mean": 1.2843749523162842, |
| "rewards/correctness_reward_func/std": 0.10933034867048264, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 11 |
| }, |
| { |
| "completion_length": 229.375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 241.0, |
| "completions/max_terminated_length": 241.0, |
| "completions/mean_length": 229.375, |
| "completions/mean_terminated_length": 229.375, |
| "completions/min_length": 220.0, |
| "completions/min_terminated_length": 220.0, |
| "epoch": 0.0008, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.333493947982788, |
| "kl": 0.4309392273426056, |
| "learning_rate": 3.6666666666666666e-06, |
| "loss": 0.0004, |
| "num_tokens": 51659.0, |
| "reward": 1.5499999523162842, |
| "reward_std": 0.03563487157225609, |
| "rewards/correctness_reward_func/mean": 1.0499999523162842, |
| "rewards/correctness_reward_func/std": 0.035634856671094894, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 12 |
| }, |
| { |
| "completion_length": 113.5, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 120.0, |
| "completions/max_terminated_length": 120.0, |
| "completions/mean_length": 113.5, |
| "completions/mean_terminated_length": 113.5, |
| "completions/min_length": 106.0, |
| "completions/min_terminated_length": 106.0, |
| "epoch": 0.0008666666666666666, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.413602352142334, |
| "kl": 0.5921498537063599, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 0.0006, |
| "num_tokens": 53775.0, |
| "reward": 1.868749976158142, |
| "reward_std": 0.10415471345186234, |
| "rewards/correctness_reward_func/mean": 1.368749976158142, |
| "rewards/correctness_reward_func/std": 0.10415472090244293, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 13 |
| }, |
| { |
| "completion_length": 402.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 419.0, |
| "completions/max_terminated_length": 419.0, |
| "completions/mean_length": 402.0, |
| "completions/mean_terminated_length": 402.0, |
| "completions/min_length": 392.0, |
| "completions/min_terminated_length": 392.0, |
| "epoch": 0.0009333333333333333, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.926260232925415, |
| "kl": 4.320853233337402, |
| "learning_rate": 4.333333333333334e-06, |
| "loss": 0.0043, |
| "num_tokens": 59111.0, |
| "reward": 1.37890625, |
| "reward_std": 0.02542884461581707, |
| "rewards/correctness_reward_func/mean": 0.87890625, |
| "rewards/correctness_reward_func/std": 0.02542879618704319, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 14 |
| }, |
| { |
| "completion_length": 312.5, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 335.0, |
| "completions/max_terminated_length": 335.0, |
| "completions/mean_length": 312.5, |
| "completions/mean_terminated_length": 312.5, |
| "completions/min_length": 299.0, |
| "completions/min_terminated_length": 299.0, |
| "epoch": 0.001, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7148852944374084, |
| "kl": 0.2926153540611267, |
| "learning_rate": 4.666666666666667e-06, |
| "loss": 0.0003, |
| "num_tokens": 63443.0, |
| "reward": 1.436734676361084, |
| "reward_std": 0.058541782200336456, |
| "rewards/correctness_reward_func/mean": 0.936734676361084, |
| "rewards/correctness_reward_func/std": 0.05854179337620735, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 15 |
| }, |
| { |
| "completion_length": 111.25, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 118.0, |
| "completions/max_terminated_length": 118.0, |
| "completions/mean_length": 111.25, |
| "completions/mean_terminated_length": 111.25, |
| "completions/min_length": 106.0, |
| "completions/min_terminated_length": 106.0, |
| "epoch": 0.0010666666666666667, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1900497674942017, |
| "kl": 0.6309098601341248, |
| "learning_rate": 5e-06, |
| "loss": 0.0006, |
| "num_tokens": 65517.0, |
| "reward": 3.331249952316284, |
| "reward_std": 0.4772970676422119, |
| "rewards/correctness_reward_func/mean": 2.831249952316284, |
| "rewards/correctness_reward_func/std": 0.4772970974445343, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 16 |
| }, |
| { |
| "completion_length": 517.875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 584.0, |
| "completions/max_terminated_length": 584.0, |
| "completions/mean_length": 517.875, |
| "completions/mean_terminated_length": 517.875, |
| "completions/min_length": 493.0, |
| "completions/min_terminated_length": 493.0, |
| "epoch": 0.0011333333333333334, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4412200450897217, |
| "kl": 0.35693415999412537, |
| "learning_rate": 4.999323102948655e-06, |
| "loss": 0.0004, |
| "num_tokens": 72052.0, |
| "reward": 1.3960647583007812, |
| "reward_std": 0.06674329191446304, |
| "rewards/correctness_reward_func/mean": 0.896064817905426, |
| "rewards/correctness_reward_func/std": 0.06674332916736603, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 17 |
| }, |
| { |
| "completion_length": 314.625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 325.0, |
| "completions/max_terminated_length": 325.0, |
| "completions/mean_length": 314.625, |
| "completions/mean_terminated_length": 314.625, |
| "completions/min_length": 306.0, |
| "completions/min_terminated_length": 306.0, |
| "epoch": 0.0012, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7217915654182434, |
| "kl": 0.3315311670303345, |
| "learning_rate": 4.997292778346312e-06, |
| "loss": 0.0003, |
| "num_tokens": 76481.0, |
| "reward": 1.540816307067871, |
| "reward_std": 0.062436942011117935, |
| "rewards/correctness_reward_func/mean": 1.040816307067871, |
| "rewards/correctness_reward_func/std": 0.062436968088150024, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 18 |
| }, |
| { |
| "completion_length": 77.875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 116.0, |
| "completions/max_terminated_length": 116.0, |
| "completions/mean_length": 77.875, |
| "completions/mean_terminated_length": 77.875, |
| "completions/min_length": 63.0, |
| "completions/min_terminated_length": 63.0, |
| "epoch": 0.0012666666666666666, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.021702434867620468, |
| "kl": 1.1031895875930786, |
| "learning_rate": 4.993910125649561e-06, |
| "loss": 0.0011, |
| "num_tokens": 78136.0, |
| "reward": 1.8666666746139526, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func/mean": 1.3666666746139526, |
| "rewards/correctness_reward_func/std": 0.0, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 19 |
| }, |
| { |
| "completion_length": 235.5, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 247.0, |
| "completions/max_terminated_length": 247.0, |
| "completions/mean_length": 235.5, |
| "completions/mean_terminated_length": 235.5, |
| "completions/min_length": 227.0, |
| "completions/min_terminated_length": 227.0, |
| "epoch": 0.0013333333333333333, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0724592208862305, |
| "kl": 0.3968948423862457, |
| "learning_rate": 4.989176976624511e-06, |
| "loss": 0.0004, |
| "num_tokens": 81660.0, |
| "reward": 1.5416666269302368, |
| "reward_std": 0.02357025258243084, |
| "rewards/correctness_reward_func/mean": 1.0416666269302368, |
| "rewards/correctness_reward_func/std": 0.023570258170366287, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 20 |
| }, |
| { |
| "completion_length": 327.375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 361.0, |
| "completions/max_terminated_length": 361.0, |
| "completions/mean_length": 327.375, |
| "completions/mean_terminated_length": 327.375, |
| "completions/min_length": 303.0, |
| "completions/min_terminated_length": 303.0, |
| "epoch": 0.0014, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.620561420917511, |
| "kl": 0.3201756179332733, |
| "learning_rate": 4.983095894354858e-06, |
| "loss": 0.0003, |
| "num_tokens": 86247.0, |
| "reward": 1.4596939086914062, |
| "reward_std": 0.07869534194469452, |
| "rewards/correctness_reward_func/mean": 0.9596939086914062, |
| "rewards/correctness_reward_func/std": 0.07869534194469452, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 21 |
| }, |
| { |
| "completion_length": 113.5, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 117.0, |
| "completions/max_terminated_length": 117.0, |
| "completions/mean_length": 113.5, |
| "completions/mean_terminated_length": 113.5, |
| "completions/min_length": 101.0, |
| "completions/min_terminated_length": 101.0, |
| "epoch": 0.0014666666666666667, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0007677710382267833, |
| "kl": 0.603554368019104, |
| "learning_rate": 4.975670171853926e-06, |
| "loss": 0.0006, |
| "num_tokens": 88531.0, |
| "reward": 3.5, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func/mean": 3.0, |
| "rewards/correctness_reward_func/std": 0.0, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 22 |
| }, |
| { |
| "completion_length": 515.375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 571.0, |
| "completions/max_terminated_length": 571.0, |
| "completions/mean_length": 515.375, |
| "completions/mean_terminated_length": 515.375, |
| "completions/min_length": 493.0, |
| "completions/min_terminated_length": 493.0, |
| "epoch": 0.0015333333333333334, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5324096083641052, |
| "kl": 0.2494540959596634, |
| "learning_rate": 4.966903830281449e-06, |
| "loss": 0.0002, |
| "num_tokens": 95110.0, |
| "reward": 1.3530092239379883, |
| "reward_std": 0.1169159933924675, |
| "rewards/correctness_reward_func/mean": 0.8530092239379883, |
| "rewards/correctness_reward_func/std": 0.1169159933924675, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 23 |
| }, |
| { |
| "completion_length": 71.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 88.0, |
| "completions/max_terminated_length": 88.0, |
| "completions/mean_length": 71.0, |
| "completions/mean_terminated_length": 71.0, |
| "completions/min_length": 58.0, |
| "completions/min_terminated_length": 58.0, |
| "epoch": 0.0016, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.7659872770309448, |
| "kl": 1.0184522867202759, |
| "learning_rate": 4.956801616766033e-06, |
| "loss": 0.001, |
| "num_tokens": 96894.0, |
| "reward": 3.484375, |
| "reward_std": 0.04419417306780815, |
| "rewards/correctness_reward_func/mean": 3.0, |
| "rewards/correctness_reward_func/std": 0.0, |
| "rewards/xmlcount_reward_func/mean": 0.484375, |
| "rewards/xmlcount_reward_func/std": 0.04419417306780815, |
| "step": 24 |
| }, |
| { |
| "completion_length": 516.375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 581.0, |
| "completions/max_terminated_length": 581.0, |
| "completions/mean_length": 516.375, |
| "completions/mean_terminated_length": 516.375, |
| "completions/min_length": 492.0, |
| "completions/min_terminated_length": 492.0, |
| "epoch": 0.0016666666666666668, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3833364248275757, |
| "kl": 0.24871811270713806, |
| "learning_rate": 4.9453690018345144e-06, |
| "loss": 0.0002, |
| "num_tokens": 103473.0, |
| "reward": 1.260879635810852, |
| "reward_std": 0.07416288554668427, |
| "rewards/correctness_reward_func/mean": 0.760879635810852, |
| "rewards/correctness_reward_func/std": 0.07416289299726486, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 25 |
| }, |
| { |
| "completion_length": 66.625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 73.0, |
| "completions/max_terminated_length": 73.0, |
| "completions/mean_length": 66.625, |
| "completions/mean_terminated_length": 66.625, |
| "completions/min_length": 60.0, |
| "completions/min_terminated_length": 60.0, |
| "epoch": 0.0017333333333333333, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.006470646243542433, |
| "kl": 1.0391145944595337, |
| "learning_rate": 4.93261217644956e-06, |
| "loss": 0.001, |
| "num_tokens": 105222.0, |
| "reward": 3.5, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func/mean": 3.0, |
| "rewards/correctness_reward_func/std": 0.0, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 26 |
| }, |
| { |
| "completion_length": 332.75, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 407.0, |
| "completions/max_terminated_length": 407.0, |
| "completions/mean_length": 332.75, |
| "completions/mean_terminated_length": 332.75, |
| "completions/min_length": 314.0, |
| "completions/min_terminated_length": 314.0, |
| "epoch": 0.0018, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9204102754592896, |
| "kl": 0.35308611392974854, |
| "learning_rate": 4.91853804865716e-06, |
| "loss": 0.0004, |
| "num_tokens": 109892.0, |
| "reward": 1.4581632614135742, |
| "reward_std": 0.051432717591524124, |
| "rewards/correctness_reward_func/mean": 0.9581632614135742, |
| "rewards/correctness_reward_func/std": 0.051432736217975616, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 27 |
| }, |
| { |
| "completion_length": 80.875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 112.0, |
| "completions/max_terminated_length": 112.0, |
| "completions/mean_length": 80.875, |
| "completions/mean_terminated_length": 80.875, |
| "completions/min_length": 61.0, |
| "completions/min_terminated_length": 61.0, |
| "epoch": 0.0018666666666666666, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.02564767189323902, |
| "kl": 0.903938889503479, |
| "learning_rate": 4.903154239845798e-06, |
| "loss": 0.0009, |
| "num_tokens": 111579.0, |
| "reward": 2.0333333015441895, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func/mean": 1.5333333015441895, |
| "rewards/correctness_reward_func/std": 0.0, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 28 |
| }, |
| { |
| "completion_length": 505.625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 553.0, |
| "completions/max_terminated_length": 553.0, |
| "completions/mean_length": 505.625, |
| "completions/mean_terminated_length": 505.625, |
| "completions/min_length": 488.0, |
| "completions/min_terminated_length": 488.0, |
| "epoch": 0.0019333333333333333, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5756263732910156, |
| "kl": 0.296729177236557, |
| "learning_rate": 4.88646908061933e-06, |
| "loss": 0.0003, |
| "num_tokens": 118104.0, |
| "reward": 1.3613426685333252, |
| "reward_std": 0.06403681635856628, |
| "rewards/correctness_reward_func/mean": 0.8925926089286804, |
| "rewards/correctness_reward_func/std": 0.027431709691882133, |
| "rewards/xmlcount_reward_func/mean": 0.46875, |
| "rewards/xmlcount_reward_func/std": 0.0578637570142746, |
| "step": 29 |
| }, |
| { |
| "completion_length": 118.875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 143.0, |
| "completions/max_terminated_length": 143.0, |
| "completions/mean_length": 118.875, |
| "completions/mean_terminated_length": 118.875, |
| "completions/min_length": 103.0, |
| "completions/min_terminated_length": 103.0, |
| "epoch": 0.002, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0035723582841455936, |
| "kl": 0.6597684025764465, |
| "learning_rate": 4.868491606285823e-06, |
| "loss": 0.0007, |
| "num_tokens": 120335.0, |
| "reward": 1.850000023841858, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func/mean": 1.350000023841858, |
| "rewards/correctness_reward_func/std": 0.0, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 30 |
| }, |
| { |
| "completion_length": 505.875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 528.0, |
| "completions/max_terminated_length": 528.0, |
| "completions/mean_length": 505.875, |
| "completions/mean_terminated_length": 505.875, |
| "completions/min_length": 487.0, |
| "completions/min_terminated_length": 487.0, |
| "epoch": 0.0020666666666666667, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.46144646406173706, |
| "kl": 0.20666523277759552, |
| "learning_rate": 4.849231551964771e-06, |
| "loss": 0.0002, |
| "num_tokens": 126766.0, |
| "reward": 1.342592477798462, |
| "reward_std": 0.02956344559788704, |
| "rewards/correctness_reward_func/mean": 0.8425925970077515, |
| "rewards/correctness_reward_func/std": 0.0295634176582098, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 31 |
| }, |
| { |
| "completion_length": 504.875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 559.0, |
| "completions/max_terminated_length": 559.0, |
| "completions/mean_length": 504.875, |
| "completions/mean_terminated_length": 504.875, |
| "completions/min_length": 455.0, |
| "completions/min_terminated_length": 455.0, |
| "epoch": 0.0021333333333333334, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5260511040687561, |
| "kl": 0.22362877428531647, |
| "learning_rate": 4.828699347315357e-06, |
| "loss": 0.0002, |
| "num_tokens": 133381.0, |
| "reward": 1.3238425254821777, |
| "reward_std": 0.1052764430642128, |
| "rewards/correctness_reward_func/mean": 0.8238425850868225, |
| "rewards/correctness_reward_func/std": 0.10527642071247101, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 32 |
| }, |
| { |
| "completion_length": 232.875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 239.0, |
| "completions/max_terminated_length": 239.0, |
| "completions/mean_length": 232.875, |
| "completions/mean_terminated_length": 232.875, |
| "completions/min_length": 220.0, |
| "completions/min_terminated_length": 220.0, |
| "epoch": 0.0022, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2515056133270264, |
| "kl": 0.3806723952293396, |
| "learning_rate": 4.806906110888606e-06, |
| "loss": 0.0004, |
| "num_tokens": 136836.0, |
| "reward": 1.5499999523162842, |
| "reward_std": 0.03563486412167549, |
| "rewards/correctness_reward_func/mean": 1.0499999523162842, |
| "rewards/correctness_reward_func/std": 0.035634856671094894, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 33 |
| }, |
| { |
| "completion_length": 231.125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 238.0, |
| "completions/max_terminated_length": 238.0, |
| "completions/mean_length": 231.125, |
| "completions/mean_terminated_length": 231.125, |
| "completions/min_length": 225.0, |
| "completions/min_terminated_length": 225.0, |
| "epoch": 0.002266666666666667, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8692136406898499, |
| "kl": 0.3986778259277344, |
| "learning_rate": 4.783863644106502e-06, |
| "loss": 0.0004, |
| "num_tokens": 140421.0, |
| "reward": 1.5833333730697632, |
| "reward_std": 0.05909371376037598, |
| "rewards/correctness_reward_func/mean": 1.0833333730697632, |
| "rewards/correctness_reward_func/std": 0.05909368395805359, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 34 |
| }, |
| { |
| "completion_length": 112.625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 126.0, |
| "completions/max_terminated_length": 126.0, |
| "completions/mean_length": 112.625, |
| "completions/mean_terminated_length": 112.625, |
| "completions/min_length": 106.0, |
| "completions/min_terminated_length": 106.0, |
| "epoch": 0.0023333333333333335, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.6767317056655884, |
| "kl": 0.6677060723304749, |
| "learning_rate": 4.759584424871302e-06, |
| "loss": 0.0007, |
| "num_tokens": 142490.0, |
| "reward": 1.7562499046325684, |
| "reward_std": 0.034718237817287445, |
| "rewards/correctness_reward_func/mean": 1.2562499046325684, |
| "rewards/correctness_reward_func/std": 0.03471822291612625, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 35 |
| }, |
| { |
| "completion_length": 111.5, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 124.0, |
| "completions/max_terminated_length": 124.0, |
| "completions/mean_length": 111.5, |
| "completions/mean_terminated_length": 111.5, |
| "completions/min_length": 106.0, |
| "completions/min_terminated_length": 106.0, |
| "epoch": 0.0024, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.146012544631958, |
| "kl": 0.5600647926330566, |
| "learning_rate": 4.734081600808531e-06, |
| "loss": 0.0006, |
| "num_tokens": 144622.0, |
| "reward": 2.4781250953674316, |
| "reward_std": 0.8475213050842285, |
| "rewards/correctness_reward_func/mean": 1.978124976158142, |
| "rewards/correctness_reward_func/std": 0.8475213646888733, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 36 |
| }, |
| { |
| "completion_length": 402.25, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 413.0, |
| "completions/max_terminated_length": 413.0, |
| "completions/mean_length": 402.25, |
| "completions/mean_terminated_length": 402.25, |
| "completions/min_length": 391.0, |
| "completions/min_terminated_length": 391.0, |
| "epoch": 0.0024666666666666665, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6264684796333313, |
| "kl": 0.30946826934814453, |
| "learning_rate": 4.707368982147318e-06, |
| "loss": 0.0003, |
| "num_tokens": 150032.0, |
| "reward": 1.4140625, |
| "reward_std": 0.06318817287683487, |
| "rewards/correctness_reward_func/mean": 0.9140625, |
| "rewards/correctness_reward_func/std": 0.06318815797567368, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 37 |
| }, |
| { |
| "completion_length": 243.25, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 249.0, |
| "completions/max_terminated_length": 249.0, |
| "completions/mean_length": 243.25, |
| "completions/mean_terminated_length": 243.25, |
| "completions/min_length": 234.0, |
| "completions/min_terminated_length": 234.0, |
| "epoch": 0.002533333333333333, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.272317886352539, |
| "kl": 0.37085089087486267, |
| "learning_rate": 4.679461034241906e-06, |
| "loss": 0.0004, |
| "num_tokens": 153546.0, |
| "reward": 1.433333396911621, |
| "reward_std": 0.0471404492855072, |
| "rewards/correctness_reward_func/mean": 0.9333332777023315, |
| "rewards/correctness_reward_func/std": 0.0471404492855072, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 38 |
| }, |
| { |
| "completion_length": 524.125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 565.0, |
| "completions/max_terminated_length": 565.0, |
| "completions/mean_length": 524.125, |
| "completions/mean_terminated_length": 524.125, |
| "completions/min_length": 496.0, |
| "completions/min_terminated_length": 496.0, |
| "epoch": 0.0026, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5807402729988098, |
| "kl": 0.2668019235134125, |
| "learning_rate": 4.650372869738415e-06, |
| "loss": 0.0003, |
| "num_tokens": 160147.0, |
| "reward": 1.3439815044403076, |
| "reward_std": 0.07557178288698196, |
| "rewards/correctness_reward_func/mean": 0.8439815044403076, |
| "rewards/correctness_reward_func/std": 0.07557182013988495, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 39 |
| }, |
| { |
| "completion_length": 76.875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 118.0, |
| "completions/max_terminated_length": 118.0, |
| "completions/mean_length": 76.875, |
| "completions/mean_terminated_length": 76.875, |
| "completions/min_length": 60.0, |
| "completions/min_terminated_length": 60.0, |
| "epoch": 0.0026666666666666666, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3997167348861694, |
| "kl": 0.9243869185447693, |
| "learning_rate": 4.620120240391065e-06, |
| "loss": 0.0009, |
| "num_tokens": 161978.0, |
| "reward": 3.299999952316284, |
| "reward_std": 0.5656854510307312, |
| "rewards/correctness_reward_func/mean": 2.799999952316284, |
| "rewards/correctness_reward_func/std": 0.5656854510307312, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 40 |
| }, |
| { |
| "completion_length": 521.375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 626.0, |
| "completions/max_terminated_length": 626.0, |
| "completions/mean_length": 521.375, |
| "completions/mean_terminated_length": 521.375, |
| "completions/min_length": 472.0, |
| "completions/min_terminated_length": 472.0, |
| "epoch": 0.0027333333333333333, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6085057258605957, |
| "kl": 0.2621612548828125, |
| "learning_rate": 4.588719528532342e-06, |
| "loss": 0.0003, |
| "num_tokens": 168597.0, |
| "reward": 1.2180554866790771, |
| "reward_std": 0.09554418921470642, |
| "rewards/correctness_reward_func/mean": 0.7180555462837219, |
| "rewards/correctness_reward_func/std": 0.09554421156644821, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 41 |
| }, |
| { |
| "completion_length": 409.75, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 472.0, |
| "completions/max_terminated_length": 472.0, |
| "completions/mean_length": 409.75, |
| "completions/mean_terminated_length": 409.75, |
| "completions/min_length": 388.0, |
| "completions/min_terminated_length": 388.0, |
| "epoch": 0.0028, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6022652983665466, |
| "kl": 0.27105629444122314, |
| "learning_rate": 4.556187738201656e-06, |
| "loss": 0.0003, |
| "num_tokens": 174019.0, |
| "reward": 1.3343749046325684, |
| "reward_std": 0.026516515761613846, |
| "rewards/correctness_reward_func/mean": 0.8343750238418579, |
| "rewards/correctness_reward_func/std": 0.026516523212194443, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 42 |
| }, |
| { |
| "completion_length": 252.25, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 356.0, |
| "completions/max_terminated_length": 356.0, |
| "completions/mean_length": 267.125, |
| "completions/mean_terminated_length": 254.4285888671875, |
| "completions/min_length": 232.0, |
| "completions/min_terminated_length": 232.0, |
| "epoch": 0.0028666666666666667, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8693382740020752, |
| "kl": 0.3942442536354065, |
| "learning_rate": 4.522542485937369e-06, |
| "loss": 0.0004, |
| "num_tokens": 177812.0, |
| "reward": 1.4541666507720947, |
| "reward_std": 0.024800769984722137, |
| "rewards/correctness_reward_func/mean": 0.9541666507720947, |
| "rewards/correctness_reward_func/std": 0.02480078488588333, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 43 |
| }, |
| { |
| "completion_length": 231.25, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 242.0, |
| "completions/max_terminated_length": 242.0, |
| "completions/mean_length": 231.25, |
| "completions/mean_terminated_length": 231.25, |
| "completions/min_length": 223.0, |
| "completions/min_terminated_length": 223.0, |
| "epoch": 0.0029333333333333334, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0458123683929443, |
| "kl": 0.39546874165534973, |
| "learning_rate": 4.48780199123712e-06, |
| "loss": 0.0004, |
| "num_tokens": 181310.0, |
| "reward": 1.616666555404663, |
| "reward_std": 0.050395265221595764, |
| "rewards/correctness_reward_func/mean": 1.116666555404663, |
| "rewards/correctness_reward_func/std": 0.05039524286985397, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 44 |
| }, |
| { |
| "completion_length": 69.625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 89.0, |
| "completions/max_terminated_length": 89.0, |
| "completions/mean_length": 69.625, |
| "completions/mean_terminated_length": 69.625, |
| "completions/min_length": 62.0, |
| "completions/min_terminated_length": 62.0, |
| "epoch": 0.003, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.004571537487208843, |
| "kl": 0.9226992130279541, |
| "learning_rate": 4.451985066691649e-06, |
| "loss": 0.0009, |
| "num_tokens": 183003.0, |
| "reward": 2.0333333015441895, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func/mean": 1.5333333015441895, |
| "rewards/correctness_reward_func/std": 0.0, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 45 |
| }, |
| { |
| "completion_length": 238.125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 269.0, |
| "completions/max_terminated_length": 269.0, |
| "completions/mean_length": 238.125, |
| "completions/mean_terminated_length": 238.125, |
| "completions/min_length": 226.0, |
| "completions/min_terminated_length": 226.0, |
| "epoch": 0.0030666666666666668, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8987017273902893, |
| "kl": 0.4024360477924347, |
| "learning_rate": 4.415111107797445e-06, |
| "loss": 0.0004, |
| "num_tokens": 186500.0, |
| "reward": 1.5750000476837158, |
| "reward_std": 0.0388321727514267, |
| "rewards/correctness_reward_func/mean": 1.0750000476837158, |
| "rewards/correctness_reward_func/std": 0.038832176476716995, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 46 |
| }, |
| { |
| "completion_length": 242.375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 321.0, |
| "completions/max_terminated_length": 321.0, |
| "completions/mean_length": 242.375, |
| "completions/mean_terminated_length": 242.375, |
| "completions/min_length": 224.0, |
| "completions/min_terminated_length": 224.0, |
| "epoch": 0.0031333333333333335, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7395060658454895, |
| "kl": 0.3209053874015808, |
| "learning_rate": 4.377200082453748e-06, |
| "loss": 0.0003, |
| "num_tokens": 190023.0, |
| "reward": 1.4749999046325684, |
| "reward_std": 0.07506612688302994, |
| "rewards/correctness_reward_func/mean": 0.9750000238418579, |
| "rewards/correctness_reward_func/std": 0.07506611198186874, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 47 |
| }, |
| { |
| "completion_length": 510.25, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 557.0, |
| "completions/max_terminated_length": 557.0, |
| "completions/mean_length": 510.25, |
| "completions/mean_terminated_length": 510.25, |
| "completions/min_length": 494.0, |
| "completions/min_terminated_length": 494.0, |
| "epoch": 0.0032, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.46141135692596436, |
| "kl": 0.288358598947525, |
| "learning_rate": 4.338272520149572e-06, |
| "loss": 0.0003, |
| "num_tokens": 196513.0, |
| "reward": 1.4010416269302368, |
| "reward_std": 0.053073856979608536, |
| "rewards/correctness_reward_func/mean": 0.9166666269302368, |
| "rewards/correctness_reward_func/std": 0.03259018436074257, |
| "rewards/xmlcount_reward_func/mean": 0.484375, |
| "rewards/xmlcount_reward_func/std": 0.04419417306780815, |
| "step": 48 |
| }, |
| { |
| "completion_length": 126.625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 200.0, |
| "completions/max_terminated_length": 200.0, |
| "completions/mean_length": 126.625, |
| "completions/mean_terminated_length": 126.625, |
| "completions/min_length": 104.0, |
| "completions/min_terminated_length": 104.0, |
| "epoch": 0.003266666666666667, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.2877793312072754, |
| "kl": 0.701366126537323, |
| "learning_rate": 4.2983495008466285e-06, |
| "loss": 0.0007, |
| "num_tokens": 198702.0, |
| "reward": 2.2718749046325684, |
| "reward_std": 0.7668858170509338, |
| "rewards/correctness_reward_func/mean": 1.771875023841858, |
| "rewards/correctness_reward_func/std": 0.7668858766555786, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 49 |
| }, |
| { |
| "completion_length": 77.125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 139.0, |
| "completions/max_terminated_length": 139.0, |
| "completions/mean_length": 77.125, |
| "completions/mean_terminated_length": 77.125, |
| "completions/min_length": 63.0, |
| "completions/min_terminated_length": 63.0, |
| "epoch": 0.0033333333333333335, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0013717930996790528, |
| "kl": 0.7978397011756897, |
| "learning_rate": 4.257452643564155e-06, |
| "loss": 0.0008, |
| "num_tokens": 200543.0, |
| "reward": 3.5, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func/mean": 3.0, |
| "rewards/correctness_reward_func/std": 0.0, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 50 |
| }, |
| { |
| "completion_length": 300.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 310.0, |
| "completions/max_terminated_length": 310.0, |
| "completions/mean_length": 300.0, |
| "completions/mean_terminated_length": 300.0, |
| "completions/min_length": 292.0, |
| "completions/min_terminated_length": 292.0, |
| "epoch": 0.0034, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9057684540748596, |
| "kl": 0.36814409494400024, |
| "learning_rate": 4.215604094671835e-06, |
| "loss": 0.0004, |
| "num_tokens": 204775.0, |
| "reward": 1.4673469066619873, |
| "reward_std": 0.03400970622897148, |
| "rewards/correctness_reward_func/mean": 0.9673469066619873, |
| "rewards/correctness_reward_func/std": 0.03400970995426178, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 51 |
| }, |
| { |
| "completion_length": 264.25, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 278.0, |
| "completions/max_terminated_length": 278.0, |
| "completions/mean_length": 264.25, |
| "completions/mean_terminated_length": 264.25, |
| "completions/min_length": 257.0, |
| "completions/min_terminated_length": 257.0, |
| "epoch": 0.0034666666666666665, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1917438507080078, |
| "kl": 0.4059743583202362, |
| "learning_rate": 4.172826515897146e-06, |
| "loss": 0.0004, |
| "num_tokens": 208865.0, |
| "reward": 1.1244897842407227, |
| "reward_std": 0.05225903540849686, |
| "rewards/correctness_reward_func/mean": 0.6244897842407227, |
| "rewards/correctness_reward_func/std": 0.05225902795791626, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 52 |
| }, |
| { |
| "completion_length": 82.25, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 128.0, |
| "completions/max_terminated_length": 128.0, |
| "completions/mean_length": 82.25, |
| "completions/mean_terminated_length": 82.25, |
| "completions/min_length": 61.0, |
| "completions/min_terminated_length": 61.0, |
| "epoch": 0.003533333333333333, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.004285034723579884, |
| "kl": 0.8721116781234741, |
| "learning_rate": 4.129143072053639e-06, |
| "loss": 0.0009, |
| "num_tokens": 210611.0, |
| "reward": 3.5, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func/mean": 3.0, |
| "rewards/correctness_reward_func/std": 0.0, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 53 |
| }, |
| { |
| "completion_length": 394.5, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 401.0, |
| "completions/max_terminated_length": 401.0, |
| "completions/mean_length": 394.5, |
| "completions/mean_terminated_length": 394.5, |
| "completions/min_length": 389.0, |
| "completions/min_terminated_length": 389.0, |
| "epoch": 0.0036, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7819033861160278, |
| "kl": 0.23851144313812256, |
| "learning_rate": 4.084577418496775e-06, |
| "loss": 0.0002, |
| "num_tokens": 215887.0, |
| "reward": 1.44921875, |
| "reward_std": 0.04475096985697746, |
| "rewards/correctness_reward_func/mean": 0.94921875, |
| "rewards/correctness_reward_func/std": 0.044750988483428955, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 54 |
| }, |
| { |
| "completion_length": 309.375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 321.0, |
| "completions/max_terminated_length": 321.0, |
| "completions/mean_length": 309.375, |
| "completions/mean_terminated_length": 309.375, |
| "completions/min_length": 302.0, |
| "completions/min_terminated_length": 302.0, |
| "epoch": 0.0036666666666666666, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6779981255531311, |
| "kl": 0.37480688095092773, |
| "learning_rate": 4.039153688314146e-06, |
| "loss": 0.0004, |
| "num_tokens": 220202.0, |
| "reward": 1.5561224222183228, |
| "reward_std": 0.05771271511912346, |
| "rewards/correctness_reward_func/mean": 1.0561224222183228, |
| "rewards/correctness_reward_func/std": 0.05771271139383316, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 55 |
| }, |
| { |
| "completion_length": 124.625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 162.0, |
| "completions/max_terminated_length": 162.0, |
| "completions/mean_length": 124.625, |
| "completions/mean_terminated_length": 124.625, |
| "completions/min_length": 110.0, |
| "completions/min_terminated_length": 110.0, |
| "epoch": 0.0037333333333333333, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.004615799989551306, |
| "kl": 0.7520985007286072, |
| "learning_rate": 3.992896479256966e-06, |
| "loss": 0.0008, |
| "num_tokens": 222471.0, |
| "reward": 1.774999976158142, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func/mean": 1.274999976158142, |
| "rewards/correctness_reward_func/std": 0.0, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 56 |
| }, |
| { |
| "completion_length": 392.5, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 400.0, |
| "completions/max_terminated_length": 400.0, |
| "completions/mean_length": 392.5, |
| "completions/mean_terminated_length": 392.5, |
| "completions/min_length": 385.0, |
| "completions/min_terminated_length": 385.0, |
| "epoch": 0.0038, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.64808189868927, |
| "kl": 0.4047853946685791, |
| "learning_rate": 3.945830840419966e-06, |
| "loss": 0.0004, |
| "num_tokens": 227763.0, |
| "reward": 1.271093726158142, |
| "reward_std": 0.09154777228832245, |
| "rewards/correctness_reward_func/mean": 0.7710937261581421, |
| "rewards/correctness_reward_func/std": 0.09154780954122543, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 57 |
| }, |
| { |
| "completion_length": 236.375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 248.0, |
| "completions/max_terminated_length": 248.0, |
| "completions/mean_length": 236.375, |
| "completions/mean_terminated_length": 236.375, |
| "completions/min_length": 227.0, |
| "completions/min_terminated_length": 227.0, |
| "epoch": 0.0038666666666666667, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7382640242576599, |
| "kl": 0.40867918729782104, |
| "learning_rate": 3.897982258676867e-06, |
| "loss": 0.0004, |
| "num_tokens": 231302.0, |
| "reward": 1.649999976158142, |
| "reward_std": 0.05039524659514427, |
| "rewards/correctness_reward_func/mean": 1.149999976158142, |
| "rewards/correctness_reward_func/std": 0.05039524286985397, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 58 |
| }, |
| { |
| "completion_length": 490.75, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 495.0, |
| "completions/max_terminated_length": 495.0, |
| "completions/mean_length": 490.75, |
| "completions/mean_terminated_length": 490.75, |
| "completions/min_length": 486.0, |
| "completions/min_terminated_length": 486.0, |
| "epoch": 0.003933333333333333, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.47874385118484497, |
| "kl": 0.26013773679733276, |
| "learning_rate": 3.849376644878783e-06, |
| "loss": 0.0003, |
| "num_tokens": 237692.0, |
| "reward": 1.3481481075286865, |
| "reward_std": 0.036288779228925705, |
| "rewards/correctness_reward_func/mean": 0.8481481075286865, |
| "rewards/correctness_reward_func/std": 0.03628873825073242, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 59 |
| }, |
| { |
| "completion_length": 220.25, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 235.0, |
| "completions/max_terminated_length": 235.0, |
| "completions/mean_length": 220.25, |
| "completions/mean_terminated_length": 220.25, |
| "completions/min_length": 211.0, |
| "completions/min_terminated_length": 211.0, |
| "epoch": 0.004, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9544376730918884, |
| "kl": 0.49662235379219055, |
| "learning_rate": 3.8000403198230385e-06, |
| "loss": 0.0005, |
| "num_tokens": 241198.0, |
| "reward": 1.5499999523162842, |
| "reward_std": 0.039840973913669586, |
| "rewards/correctness_reward_func/mean": 1.0499999523162842, |
| "rewards/correctness_reward_func/std": 0.039840973913669586, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 60 |
| }, |
| { |
| "completion_length": 393.875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 399.0, |
| "completions/max_terminated_length": 399.0, |
| "completions/mean_length": 393.875, |
| "completions/mean_terminated_length": 393.875, |
| "completions/min_length": 388.0, |
| "completions/min_terminated_length": 388.0, |
| "epoch": 0.004066666666666666, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5445326566696167, |
| "kl": 0.28289133310317993, |
| "learning_rate": 3.7500000000000005e-06, |
| "loss": 0.0003, |
| "num_tokens": 246621.0, |
| "reward": 1.498437523841858, |
| "reward_std": 0.024032628163695335, |
| "rewards/correctness_reward_func/mean": 0.9984374642372131, |
| "rewards/correctness_reward_func/std": 0.024032630026340485, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 61 |
| }, |
| { |
| "completion_length": 525.75, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 931.0, |
| "completions/max_terminated_length": 931.0, |
| "completions/mean_length": 525.75, |
| "completions/mean_terminated_length": 525.75, |
| "completions/min_length": 449.0, |
| "completions/min_terminated_length": 449.0, |
| "epoch": 0.0041333333333333335, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5638792514801025, |
| "kl": 0.28673431277275085, |
| "learning_rate": 3.699282783125616e-06, |
| "loss": 0.0003, |
| "num_tokens": 253339.0, |
| "reward": 1.1914352178573608, |
| "reward_std": 0.10863616317510605, |
| "rewards/correctness_reward_func/mean": 0.6914352178573608, |
| "rewards/correctness_reward_func/std": 0.10863618552684784, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 62 |
| }, |
| { |
| "completion_length": 114.5, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 157.0, |
| "completions/max_terminated_length": 157.0, |
| "completions/mean_length": 114.5, |
| "completions/mean_terminated_length": 114.5, |
| "completions/min_length": 102.0, |
| "completions/min_terminated_length": 102.0, |
| "epoch": 0.0042, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2769147157669067, |
| "kl": 0.7663796544075012, |
| "learning_rate": 3.6479161334675294e-06, |
| "loss": 0.0008, |
| "num_tokens": 255431.0, |
| "reward": 1.7281250953674316, |
| "reward_std": 0.05580177903175354, |
| "rewards/correctness_reward_func/mean": 1.2281250953674316, |
| "rewards/correctness_reward_func/std": 0.05580177158117294, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 63 |
| }, |
| { |
| "completion_length": 232.375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 246.0, |
| "completions/max_terminated_length": 246.0, |
| "completions/mean_length": 232.375, |
| "completions/mean_terminated_length": 232.375, |
| "completions/min_length": 218.0, |
| "completions/min_terminated_length": 218.0, |
| "epoch": 0.004266666666666667, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7423648834228516, |
| "kl": 0.4253622889518738, |
| "learning_rate": 3.595927866972694e-06, |
| "loss": 0.0004, |
| "num_tokens": 259026.0, |
| "reward": 1.5833332538604736, |
| "reward_std": 0.03984096646308899, |
| "rewards/correctness_reward_func/mean": 1.0833332538604736, |
| "rewards/correctness_reward_func/std": 0.039840951561927795, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 64 |
| }, |
| { |
| "completion_length": 390.875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 396.0, |
| "completions/max_terminated_length": 396.0, |
| "completions/mean_length": 390.875, |
| "completions/mean_terminated_length": 390.875, |
| "completions/min_length": 380.0, |
| "completions/min_terminated_length": 380.0, |
| "epoch": 0.004333333333333333, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6674738526344299, |
| "kl": 0.24484872817993164, |
| "learning_rate": 3.543346136204545e-06, |
| "loss": 0.0002, |
| "num_tokens": 264257.0, |
| "reward": 1.482031226158142, |
| "reward_std": 0.03741618990898132, |
| "rewards/correctness_reward_func/mean": 0.9820312261581421, |
| "rewards/correctness_reward_func/std": 0.037416212260723114, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 65 |
| }, |
| { |
| "completion_length": 457.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 502.0, |
| "completions/max_terminated_length": 502.0, |
| "completions/mean_length": 457.0, |
| "completions/mean_terminated_length": 457.0, |
| "completions/min_length": 434.0, |
| "completions/min_terminated_length": 434.0, |
| "epoch": 0.0044, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6465625166893005, |
| "kl": 0.7725451588630676, |
| "learning_rate": 3.4901994150978926e-06, |
| "loss": 0.0008, |
| "num_tokens": 270425.0, |
| "reward": 1.0481481552124023, |
| "reward_std": 0.20149599015712738, |
| "rewards/correctness_reward_func/mean": 0.5481481552124023, |
| "rewards/correctness_reward_func/std": 0.20149599015712738, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 66 |
| }, |
| { |
| "completion_length": 130.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 254.0, |
| "completions/max_terminated_length": 254.0, |
| "completions/mean_length": 130.0, |
| "completions/mean_terminated_length": 130.0, |
| "completions/min_length": 102.0, |
| "completions/min_terminated_length": 102.0, |
| "epoch": 0.0044666666666666665, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.4314974546432495, |
| "kl": 0.6890926361083984, |
| "learning_rate": 3.436516483539781e-06, |
| "loss": 0.0007, |
| "num_tokens": 272641.0, |
| "reward": 2.3375000953674316, |
| "reward_std": 0.7304597496986389, |
| "rewards/correctness_reward_func/mean": 1.837499976158142, |
| "rewards/correctness_reward_func/std": 0.7304597496986389, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 67 |
| }, |
| { |
| "completion_length": 98.375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 163.0, |
| "completions/max_terminated_length": 163.0, |
| "completions/mean_length": 98.375, |
| "completions/mean_terminated_length": 98.375, |
| "completions/min_length": 63.0, |
| "completions/min_terminated_length": 63.0, |
| "epoch": 0.004533333333333334, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.40536630153656, |
| "kl": 0.7224768400192261, |
| "learning_rate": 3.3823264117846722e-06, |
| "loss": 0.0007, |
| "num_tokens": 274532.0, |
| "reward": 3.299999952316284, |
| "reward_std": 0.5656854510307312, |
| "rewards/correctness_reward_func/mean": 2.799999952316284, |
| "rewards/correctness_reward_func/std": 0.5656854510307312, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 68 |
| }, |
| { |
| "completion_length": 109.875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 121.0, |
| "completions/max_terminated_length": 121.0, |
| "completions/mean_length": 109.875, |
| "completions/mean_terminated_length": 109.875, |
| "completions/min_length": 103.0, |
| "completions/min_terminated_length": 103.0, |
| "epoch": 0.0046, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5540027618408203, |
| "kl": 0.5734180808067322, |
| "learning_rate": 3.3276585447123957e-06, |
| "loss": 0.0006, |
| "num_tokens": 276595.0, |
| "reward": 1.859375, |
| "reward_std": 0.02651650831103325, |
| "rewards/correctness_reward_func/mean": 1.359375, |
| "rewards/correctness_reward_func/std": 0.02651648037135601, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 69 |
| }, |
| { |
| "completion_length": 303.625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 312.0, |
| "completions/max_terminated_length": 312.0, |
| "completions/mean_length": 303.625, |
| "completions/mean_terminated_length": 303.625, |
| "completions/min_length": 299.0, |
| "completions/min_terminated_length": 299.0, |
| "epoch": 0.004666666666666667, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7526683211326599, |
| "kl": 0.30397114157676697, |
| "learning_rate": 3.272542485937369e-06, |
| "loss": 0.0003, |
| "num_tokens": 280896.0, |
| "reward": 1.4918367862701416, |
| "reward_std": 0.06512364000082016, |
| "rewards/correctness_reward_func/mean": 0.9918367266654968, |
| "rewards/correctness_reward_func/std": 0.06512364000082016, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 70 |
| }, |
| { |
| "completion_length": 443.375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 761.0, |
| "completions/max_terminated_length": 761.0, |
| "completions/mean_length": 443.375, |
| "completions/mean_terminated_length": 443.375, |
| "completions/min_length": 391.0, |
| "completions/min_terminated_length": 391.0, |
| "epoch": 0.004733333333333333, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3900875747203827, |
| "kl": 0.45952847599983215, |
| "learning_rate": 3.217008081777726e-06, |
| "loss": 0.0005, |
| "num_tokens": 286619.0, |
| "reward": 1.3507812023162842, |
| "reward_std": 0.13555601239204407, |
| "rewards/correctness_reward_func/mean": 0.897656261920929, |
| "rewards/correctness_reward_func/std": 0.0323791466653347, |
| "rewards/xmlcount_reward_func/mean": 0.453125, |
| "rewards/xmlcount_reward_func/std": 0.13258251547813416, |
| "step": 71 |
| }, |
| { |
| "completion_length": 493.75, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 512.0, |
| "completions/mean_length": 493.75, |
| "completions/mean_terminated_length": 493.75, |
| "completions/min_length": 444.0, |
| "completions/min_terminated_length": 444.0, |
| "epoch": 0.0048, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4038362205028534, |
| "kl": 0.24884635210037231, |
| "learning_rate": 3.1610854050930063e-06, |
| "loss": 0.0002, |
| "num_tokens": 293065.0, |
| "reward": 1.3905092477798462, |
| "reward_std": 0.09639514237642288, |
| "rewards/correctness_reward_func/mean": 0.8905092477798462, |
| "rewards/correctness_reward_func/std": 0.09639513492584229, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 72 |
| }, |
| { |
| "completion_length": 329.875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 468.0, |
| "completions/max_terminated_length": 468.0, |
| "completions/mean_length": 329.875, |
| "completions/mean_terminated_length": 329.875, |
| "completions/min_length": 299.0, |
| "completions/min_terminated_length": 299.0, |
| "epoch": 0.004866666666666667, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7928107976913452, |
| "kl": 0.33076590299606323, |
| "learning_rate": 3.1048047389991693e-06, |
| "loss": 0.0003, |
| "num_tokens": 297712.0, |
| "reward": 1.445918321609497, |
| "reward_std": 0.04708431288599968, |
| "rewards/correctness_reward_func/mean": 0.9459183216094971, |
| "rewards/correctness_reward_func/std": 0.04708430916070938, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 73 |
| }, |
| { |
| "completion_length": 497.375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 533.0, |
| "completions/max_terminated_length": 533.0, |
| "completions/mean_length": 497.375, |
| "completions/mean_terminated_length": 497.375, |
| "completions/min_length": 447.0, |
| "completions/min_terminated_length": 447.0, |
| "epoch": 0.004933333333333333, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5035240650177002, |
| "kl": 0.27565228939056396, |
| "learning_rate": 3.0481965604697582e-06, |
| "loss": 0.0003, |
| "num_tokens": 304155.0, |
| "reward": 1.2884259223937988, |
| "reward_std": 0.14154860377311707, |
| "rewards/correctness_reward_func/mean": 0.7884259223937988, |
| "rewards/correctness_reward_func/std": 0.14154860377311707, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 74 |
| }, |
| { |
| "completion_length": 309.5, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 325.0, |
| "completions/max_terminated_length": 325.0, |
| "completions/mean_length": 309.5, |
| "completions/mean_terminated_length": 309.5, |
| "completions/min_length": 301.0, |
| "completions/min_terminated_length": 301.0, |
| "epoch": 0.005, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7498885989189148, |
| "kl": 0.40710383653640747, |
| "learning_rate": 2.9912915238320755e-06, |
| "loss": 0.0004, |
| "num_tokens": 308503.0, |
| "reward": 1.4122449159622192, |
| "reward_std": 0.029270855709910393, |
| "rewards/correctness_reward_func/mean": 0.9122449159622192, |
| "rewards/correctness_reward_func/std": 0.029270906001329422, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 75 |
| }, |
| { |
| "completion_length": 106.125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 113.0, |
| "completions/max_terminated_length": 113.0, |
| "completions/mean_length": 106.125, |
| "completions/mean_terminated_length": 106.125, |
| "completions/min_length": 101.0, |
| "completions/min_terminated_length": 101.0, |
| "epoch": 0.005066666666666666, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.3539204597473145, |
| "kl": 0.5555666089057922, |
| "learning_rate": 2.9341204441673267e-06, |
| "loss": 0.0006, |
| "num_tokens": 310560.0, |
| "reward": 1.803125023841858, |
| "reward_std": 0.03881620615720749, |
| "rewards/correctness_reward_func/mean": 1.303125023841858, |
| "rewards/correctness_reward_func/std": 0.038816213607788086, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 76 |
| }, |
| { |
| "completion_length": 62.625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 70.0, |
| "completions/max_terminated_length": 70.0, |
| "completions/mean_length": 62.625, |
| "completions/mean_terminated_length": 62.625, |
| "completions/min_length": 59.0, |
| "completions/min_terminated_length": 59.0, |
| "epoch": 0.0051333333333333335, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.8112568855285645, |
| "kl": 1.0676077604293823, |
| "learning_rate": 2.876714280623708e-06, |
| "loss": 0.0011, |
| "num_tokens": 312141.0, |
| "reward": 1.691666603088379, |
| "reward_std": 0.17343400418758392, |
| "rewards/correctness_reward_func/mean": 1.191666603088379, |
| "rewards/correctness_reward_func/std": 0.17343401908874512, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 77 |
| }, |
| { |
| "completion_length": 307.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 324.0, |
| "completions/max_terminated_length": 324.0, |
| "completions/mean_length": 307.0, |
| "completions/mean_terminated_length": 307.0, |
| "completions/min_length": 301.0, |
| "completions/min_terminated_length": 301.0, |
| "epoch": 0.0052, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8042576909065247, |
| "kl": 0.4135895371437073, |
| "learning_rate": 2.8191041196514874e-06, |
| "loss": 0.0004, |
| "num_tokens": 316581.0, |
| "reward": 1.5102040767669678, |
| "reward_std": 0.026180749759078026, |
| "rewards/correctness_reward_func/mean": 1.0102040767669678, |
| "rewards/correctness_reward_func/std": 0.026180710643529892, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 78 |
| }, |
| { |
| "completion_length": 445.125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 454.0, |
| "completions/max_terminated_length": 454.0, |
| "completions/mean_length": 445.125, |
| "completions/mean_terminated_length": 445.125, |
| "completions/min_length": 437.0, |
| "completions/min_terminated_length": 437.0, |
| "epoch": 0.005266666666666667, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3808617889881134, |
| "kl": 0.3184884190559387, |
| "learning_rate": 2.761321158169134e-06, |
| "loss": 0.0003, |
| "num_tokens": 322718.0, |
| "reward": 1.229629635810852, |
| "reward_std": 0.020091881975531578, |
| "rewards/correctness_reward_func/mean": 0.729629635810852, |
| "rewards/correctness_reward_func/std": 0.020091887563467026, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 79 |
| }, |
| { |
| "completion_length": 117.875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 150.0, |
| "completions/max_terminated_length": 150.0, |
| "completions/mean_length": 117.875, |
| "completions/mean_terminated_length": 117.875, |
| "completions/min_length": 103.0, |
| "completions/min_terminated_length": 103.0, |
| "epoch": 0.005333333333333333, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.203380584716797, |
| "kl": 0.9543490409851074, |
| "learning_rate": 2.703396686669646e-06, |
| "loss": 0.001, |
| "num_tokens": 325029.0, |
| "reward": 1.9249999523162842, |
| "reward_std": 0.13887302577495575, |
| "rewards/correctness_reward_func/mean": 1.4249999523162842, |
| "rewards/correctness_reward_func/std": 0.13887299597263336, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 80 |
| }, |
| { |
| "completion_length": 401.875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 466.0, |
| "completions/max_terminated_length": 466.0, |
| "completions/mean_length": 401.875, |
| "completions/mean_terminated_length": 401.875, |
| "completions/min_length": 387.0, |
| "completions/min_terminated_length": 387.0, |
| "epoch": 0.0054, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6207239031791687, |
| "kl": 0.3206217586994171, |
| "learning_rate": 2.6453620722761897e-06, |
| "loss": 0.0003, |
| "num_tokens": 330380.0, |
| "reward": 1.4375, |
| "reward_std": 0.034718237817287445, |
| "rewards/correctness_reward_func/mean": 0.9375, |
| "rewards/correctness_reward_func/std": 0.03471824526786804, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 81 |
| }, |
| { |
| "completion_length": 510.75, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 521.0, |
| "completions/max_terminated_length": 521.0, |
| "completions/mean_length": 510.75, |
| "completions/mean_terminated_length": 510.75, |
| "completions/min_length": 502.0, |
| "completions/min_terminated_length": 502.0, |
| "epoch": 0.0054666666666666665, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4737114906311035, |
| "kl": 0.22106648981571198, |
| "learning_rate": 2.587248741756253e-06, |
| "loss": 0.0002, |
| "num_tokens": 336954.0, |
| "reward": 1.440740704536438, |
| "reward_std": 0.06372092664241791, |
| "rewards/correctness_reward_func/mean": 0.940740704536438, |
| "rewards/correctness_reward_func/std": 0.06372092664241791, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 82 |
| }, |
| { |
| "completion_length": 226.125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 230.0, |
| "completions/max_terminated_length": 230.0, |
| "completions/mean_length": 226.125, |
| "completions/mean_terminated_length": 226.125, |
| "completions/min_length": 222.0, |
| "completions/min_terminated_length": 222.0, |
| "epoch": 0.005533333333333334, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6092992424964905, |
| "kl": 0.3960443437099457, |
| "learning_rate": 2.5290881645034932e-06, |
| "loss": 0.0004, |
| "num_tokens": 340379.0, |
| "reward": 1.5791666507720947, |
| "reward_std": 0.024800803512334824, |
| "rewards/correctness_reward_func/mean": 1.0791666507720947, |
| "rewards/correctness_reward_func/std": 0.024800801649689674, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 83 |
| }, |
| { |
| "completion_length": 393.375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 410.0, |
| "completions/max_terminated_length": 410.0, |
| "completions/mean_length": 393.375, |
| "completions/mean_terminated_length": 393.375, |
| "completions/min_length": 385.0, |
| "completions/min_terminated_length": 385.0, |
| "epoch": 0.0056, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7555404901504517, |
| "kl": 0.3491273820400238, |
| "learning_rate": 2.470911835496508e-06, |
| "loss": 0.0003, |
| "num_tokens": 345710.0, |
| "reward": 1.3624999523162842, |
| "reward_std": 0.036135926842689514, |
| "rewards/correctness_reward_func/mean": 0.8624999523162842, |
| "rewards/correctness_reward_func/std": 0.03613590821623802, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 84 |
| }, |
| { |
| "completion_length": 231.625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 243.0, |
| "completions/max_terminated_length": 243.0, |
| "completions/mean_length": 231.625, |
| "completions/mean_terminated_length": 231.625, |
| "completions/min_length": 221.0, |
| "completions/min_terminated_length": 221.0, |
| "epoch": 0.005666666666666667, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1376268863677979, |
| "kl": 0.4538520574569702, |
| "learning_rate": 2.4127512582437486e-06, |
| "loss": 0.0005, |
| "num_tokens": 349107.0, |
| "reward": 1.6541666984558105, |
| "reward_std": 0.0889756828546524, |
| "rewards/correctness_reward_func/mean": 1.1541666984558105, |
| "rewards/correctness_reward_func/std": 0.08897567540407181, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 85 |
| }, |
| { |
| "completion_length": 308.875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 321.0, |
| "completions/max_terminated_length": 321.0, |
| "completions/mean_length": 308.875, |
| "completions/mean_terminated_length": 308.875, |
| "completions/min_length": 304.0, |
| "completions/min_terminated_length": 304.0, |
| "epoch": 0.005733333333333333, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5613489747047424, |
| "kl": 0.33147352933883667, |
| "learning_rate": 2.3546379277238107e-06, |
| "loss": 0.0003, |
| "num_tokens": 353554.0, |
| "reward": 1.5193877220153809, |
| "reward_std": 0.018220985308289528, |
| "rewards/correctness_reward_func/mean": 1.0193877220153809, |
| "rewards/correctness_reward_func/std": 0.01822100207209587, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 86 |
| }, |
| { |
| "completion_length": 482.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 508.0, |
| "completions/max_terminated_length": 508.0, |
| "completions/mean_length": 482.0, |
| "completions/mean_terminated_length": 482.0, |
| "completions/min_length": 390.0, |
| "completions/min_terminated_length": 390.0, |
| "epoch": 0.0058, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7360633611679077, |
| "kl": 0.4801836907863617, |
| "learning_rate": 2.296603313330355e-06, |
| "loss": 0.0005, |
| "num_tokens": 359986.0, |
| "reward": 1.3534722328186035, |
| "reward_std": 0.1422896832227707, |
| "rewards/correctness_reward_func/mean": 0.8534722328186035, |
| "rewards/correctness_reward_func/std": 0.14228971302509308, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 87 |
| }, |
| { |
| "completion_length": 392.5, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 399.0, |
| "completions/max_terminated_length": 399.0, |
| "completions/mean_length": 392.5, |
| "completions/mean_terminated_length": 392.5, |
| "completions/min_length": 388.0, |
| "completions/min_terminated_length": 388.0, |
| "epoch": 0.005866666666666667, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.529614269733429, |
| "kl": 0.22710314393043518, |
| "learning_rate": 2.238678841830867e-06, |
| "loss": 0.0002, |
| "num_tokens": 365254.0, |
| "reward": 1.470312476158142, |
| "reward_std": 0.04101800173521042, |
| "rewards/correctness_reward_func/mean": 0.9703124761581421, |
| "rewards/correctness_reward_func/std": 0.04101802781224251, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 88 |
| }, |
| { |
| "completion_length": 410.5, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 488.0, |
| "completions/max_terminated_length": 488.0, |
| "completions/mean_length": 410.5, |
| "completions/mean_terminated_length": 410.5, |
| "completions/min_length": 388.0, |
| "completions/min_terminated_length": 388.0, |
| "epoch": 0.005933333333333333, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5982156991958618, |
| "kl": 0.3257940411567688, |
| "learning_rate": 2.1808958803485134e-06, |
| "loss": 0.0003, |
| "num_tokens": 370786.0, |
| "reward": 1.4328124523162842, |
| "reward_std": 0.026038672775030136, |
| "rewards/correctness_reward_func/mean": 0.932812511920929, |
| "rewards/correctness_reward_func/std": 0.02603868953883648, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 89 |
| }, |
| { |
| "completion_length": 110.375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 117.0, |
| "completions/max_terminated_length": 117.0, |
| "completions/mean_length": 110.375, |
| "completions/mean_terminated_length": 110.375, |
| "completions/min_length": 105.0, |
| "completions/min_terminated_length": 105.0, |
| "epoch": 0.006, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.7572274208068848, |
| "kl": 0.7833219766616821, |
| "learning_rate": 2.1232857193762923e-06, |
| "loss": 0.0008, |
| "num_tokens": 372957.0, |
| "reward": 1.774999976158142, |
| "reward_std": 0.0400891974568367, |
| "rewards/correctness_reward_func/mean": 1.274999976158142, |
| "rewards/correctness_reward_func/std": 0.04008918255567551, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 90 |
| }, |
| { |
| "completion_length": 229.5, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 237.0, |
| "completions/max_terminated_length": 237.0, |
| "completions/mean_length": 229.5, |
| "completions/mean_terminated_length": 229.5, |
| "completions/min_length": 217.0, |
| "completions/min_terminated_length": 217.0, |
| "epoch": 0.006066666666666666, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.748127281665802, |
| "kl": 0.7552896738052368, |
| "learning_rate": 2.0658795558326745e-06, |
| "loss": 0.0008, |
| "num_tokens": 376433.0, |
| "reward": 1.3583333492279053, |
| "reward_std": 0.04272470250725746, |
| "rewards/correctness_reward_func/mean": 0.8583333492279053, |
| "rewards/correctness_reward_func/std": 0.04272466525435448, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 91 |
| }, |
| { |
| "completion_length": 306.25, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 317.0, |
| "completions/max_terminated_length": 317.0, |
| "completions/mean_length": 306.25, |
| "completions/mean_terminated_length": 306.25, |
| "completions/min_length": 299.0, |
| "completions/min_terminated_length": 299.0, |
| "epoch": 0.0061333333333333335, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6332816481590271, |
| "kl": 0.35773783922195435, |
| "learning_rate": 2.0087084761679245e-06, |
| "loss": 0.0004, |
| "num_tokens": 380867.0, |
| "reward": 1.4734693765640259, |
| "reward_std": 0.050698716193437576, |
| "rewards/correctness_reward_func/mean": 0.9734693765640259, |
| "rewards/correctness_reward_func/std": 0.050698697566986084, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 92 |
| }, |
| { |
| "completion_length": 226.75, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 250.0, |
| "completions/max_terminated_length": 250.0, |
| "completions/mean_length": 226.75, |
| "completions/mean_terminated_length": 226.75, |
| "completions/min_length": 217.0, |
| "completions/min_terminated_length": 217.0, |
| "epoch": 0.0062, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8360308408737183, |
| "kl": 0.3975153863430023, |
| "learning_rate": 1.9518034395302413e-06, |
| "loss": 0.0004, |
| "num_tokens": 384409.0, |
| "reward": 1.566666603088379, |
| "reward_std": 0.04364359378814697, |
| "rewards/correctness_reward_func/mean": 1.066666603088379, |
| "rewards/correctness_reward_func/std": 0.043643590062856674, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 93 |
| }, |
| { |
| "completion_length": 705.25, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 1745.0, |
| "completions/max_terminated_length": 788.0, |
| "completions/mean_length": 705.25, |
| "completions/mean_terminated_length": 556.7142944335938, |
| "completions/min_length": 501.0, |
| "completions/min_terminated_length": 501.0, |
| "epoch": 0.006266666666666667, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3474324941635132, |
| "kl": 0.18562744557857513, |
| "learning_rate": 1.895195261000831e-06, |
| "loss": 0.0002, |
| "num_tokens": 392475.0, |
| "reward": 1.246759295463562, |
| "reward_std": 0.16773828864097595, |
| "rewards/correctness_reward_func/mean": 0.8092592358589172, |
| "rewards/correctness_reward_func/std": 0.04941357672214508, |
| "rewards/xmlcount_reward_func/mean": 0.4375, |
| "rewards/xmlcount_reward_func/std": 0.1767766922712326, |
| "step": 94 |
| }, |
| { |
| "completion_length": 66.875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 81.0, |
| "completions/max_terminated_length": 81.0, |
| "completions/mean_length": 66.875, |
| "completions/mean_terminated_length": 66.875, |
| "completions/min_length": 60.0, |
| "completions/min_terminated_length": 60.0, |
| "epoch": 0.006333333333333333, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.022137416526675224, |
| "kl": 1.1477253437042236, |
| "learning_rate": 1.8389145949069953e-06, |
| "loss": 0.0011, |
| "num_tokens": 394226.0, |
| "reward": 3.5, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func/mean": 3.0, |
| "rewards/correctness_reward_func/std": 0.0, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 95 |
| }, |
| { |
| "completion_length": 403.75, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 411.0, |
| "completions/max_terminated_length": 411.0, |
| "completions/mean_length": 403.75, |
| "completions/mean_terminated_length": 403.75, |
| "completions/min_length": 396.0, |
| "completions/min_terminated_length": 396.0, |
| "epoch": 0.0064, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.46710848808288574, |
| "kl": 0.2613021433353424, |
| "learning_rate": 1.7829919182222752e-06, |
| "loss": 0.0003, |
| "num_tokens": 399752.0, |
| "reward": 1.3953125476837158, |
| "reward_std": 0.03578673303127289, |
| "rewards/correctness_reward_func/mean": 0.8953125476837158, |
| "rewards/correctness_reward_func/std": 0.03578675538301468, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 96 |
| }, |
| { |
| "completion_length": 264.25, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 499.0, |
| "completions/max_terminated_length": 499.0, |
| "completions/mean_length": 264.25, |
| "completions/mean_terminated_length": 264.25, |
| "completions/min_length": 222.0, |
| "completions/min_terminated_length": 222.0, |
| "epoch": 0.006466666666666667, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8940762281417847, |
| "kl": 0.3712972402572632, |
| "learning_rate": 1.7274575140626318e-06, |
| "loss": 0.0004, |
| "num_tokens": 403538.0, |
| "reward": 1.5916666984558105, |
| "reward_std": 0.042724691331386566, |
| "rewards/correctness_reward_func/mean": 1.0916666984558105, |
| "rewards/correctness_reward_func/std": 0.042724668979644775, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 97 |
| }, |
| { |
| "completion_length": 304.375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 312.0, |
| "completions/max_terminated_length": 312.0, |
| "completions/mean_length": 304.375, |
| "completions/mean_terminated_length": 304.375, |
| "completions/min_length": 298.0, |
| "completions/min_terminated_length": 298.0, |
| "epoch": 0.006533333333333334, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7366277575492859, |
| "kl": 0.27435365319252014, |
| "learning_rate": 1.6723414552876052e-06, |
| "loss": 0.0003, |
| "num_tokens": 407853.0, |
| "reward": 1.4581632614135742, |
| "reward_std": 0.027575310319662094, |
| "rewards/correctness_reward_func/mean": 0.9581632614135742, |
| "rewards/correctness_reward_func/std": 0.02757529355585575, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 98 |
| }, |
| { |
| "completion_length": 385.875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 416.0, |
| "completions/max_terminated_length": 416.0, |
| "completions/mean_length": 385.875, |
| "completions/mean_terminated_length": 385.875, |
| "completions/min_length": 359.0, |
| "completions/min_terminated_length": 359.0, |
| "epoch": 0.0066, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7135072350502014, |
| "kl": 0.2686983048915863, |
| "learning_rate": 1.6176735882153284e-06, |
| "loss": 0.0003, |
| "num_tokens": 413020.0, |
| "reward": 1.3273437023162842, |
| "reward_std": 0.17168211936950684, |
| "rewards/correctness_reward_func/mean": 0.827343761920929, |
| "rewards/correctness_reward_func/std": 0.17168213427066803, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 99 |
| }, |
| { |
| "completion_length": 232.75, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 253.0, |
| "completions/max_terminated_length": 253.0, |
| "completions/mean_length": 232.75, |
| "completions/mean_terminated_length": 232.75, |
| "completions/min_length": 223.0, |
| "completions/min_terminated_length": 223.0, |
| "epoch": 0.006666666666666667, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6561140418052673, |
| "kl": 0.36341872811317444, |
| "learning_rate": 1.56348351646022e-06, |
| "loss": 0.0004, |
| "num_tokens": 416618.0, |
| "reward": 1.6083333492279053, |
| "reward_std": 0.046291012316942215, |
| "rewards/correctness_reward_func/mean": 1.1083333492279053, |
| "rewards/correctness_reward_func/std": 0.04629099741578102, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 100 |
| }, |
| { |
| "completion_length": 69.625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 79.0, |
| "completions/max_terminated_length": 79.0, |
| "completions/mean_length": 69.625, |
| "completions/mean_terminated_length": 69.625, |
| "completions/min_length": 64.0, |
| "completions/min_terminated_length": 64.0, |
| "epoch": 0.006733333333333333, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0018204136285930872, |
| "kl": 1.0169363021850586, |
| "learning_rate": 1.509800584902108e-06, |
| "loss": 0.001, |
| "num_tokens": 418319.0, |
| "reward": 3.5, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func/mean": 3.0, |
| "rewards/correctness_reward_func/std": 0.0, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 101 |
| }, |
| { |
| "completion_length": 305.5, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 319.0, |
| "completions/max_terminated_length": 319.0, |
| "completions/mean_length": 305.5, |
| "completions/mean_terminated_length": 305.5, |
| "completions/min_length": 294.0, |
| "completions/min_terminated_length": 294.0, |
| "epoch": 0.0068, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5131522417068481, |
| "kl": 0.3132336437702179, |
| "learning_rate": 1.4566538637954556e-06, |
| "loss": 0.0003, |
| "num_tokens": 422547.0, |
| "reward": 1.4489796161651611, |
| "reward_std": 0.03463377058506012, |
| "rewards/correctness_reward_func/mean": 0.9489796161651611, |
| "rewards/correctness_reward_func/std": 0.03463379293680191, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 102 |
| }, |
| { |
| "completion_length": 502.625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 530.0, |
| "completions/max_terminated_length": 530.0, |
| "completions/mean_length": 502.625, |
| "completions/mean_terminated_length": 502.625, |
| "completions/min_length": 488.0, |
| "completions/min_terminated_length": 488.0, |
| "epoch": 0.006866666666666667, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.40155088901519775, |
| "kl": 0.25858941674232483, |
| "learning_rate": 1.4040721330273063e-06, |
| "loss": 0.0003, |
| "num_tokens": 429144.0, |
| "reward": 1.3925926685333252, |
| "reward_std": 0.017707079648971558, |
| "rewards/correctness_reward_func/mean": 0.8925925493240356, |
| "rewards/correctness_reward_func/std": 0.017707087099552155, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 103 |
| }, |
| { |
| "completion_length": 304.75, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 312.0, |
| "completions/max_terminated_length": 312.0, |
| "completions/mean_length": 304.75, |
| "completions/mean_terminated_length": 304.75, |
| "completions/min_length": 296.0, |
| "completions/min_terminated_length": 296.0, |
| "epoch": 0.006933333333333333, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8429935574531555, |
| "kl": 0.3152122497558594, |
| "learning_rate": 1.3520838665324704e-06, |
| "loss": 0.0003, |
| "num_tokens": 433502.0, |
| "reward": 1.620408296585083, |
| "reward_std": 0.04719792306423187, |
| "rewards/correctness_reward_func/mean": 1.120408296585083, |
| "rewards/correctness_reward_func/std": 0.04719793051481247, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 104 |
| }, |
| { |
| "completion_length": 227.25, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 234.0, |
| "completions/max_terminated_length": 234.0, |
| "completions/mean_length": 227.25, |
| "completions/mean_terminated_length": 227.25, |
| "completions/min_length": 223.0, |
| "completions/min_terminated_length": 223.0, |
| "epoch": 0.007, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8614357113838196, |
| "kl": 0.37386342883110046, |
| "learning_rate": 1.3007172168743854e-06, |
| "loss": 0.0004, |
| "num_tokens": 436944.0, |
| "reward": 1.537500023841858, |
| "reward_std": 0.051754895597696304, |
| "rewards/correctness_reward_func/mean": 1.037500023841858, |
| "rewards/correctness_reward_func/std": 0.05175492912530899, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 105 |
| }, |
| { |
| "completion_length": 104.375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 114.0, |
| "completions/max_terminated_length": 114.0, |
| "completions/mean_length": 104.375, |
| "completions/mean_terminated_length": 104.375, |
| "completions/min_length": 100.0, |
| "completions/min_terminated_length": 100.0, |
| "epoch": 0.007066666666666666, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.7808598279953003, |
| "kl": 0.581811249256134, |
| "learning_rate": 1.2500000000000007e-06, |
| "loss": 0.0006, |
| "num_tokens": 439019.0, |
| "reward": 3.293750047683716, |
| "reward_std": 0.5833630561828613, |
| "rewards/correctness_reward_func/mean": 2.793750047683716, |
| "rewards/correctness_reward_func/std": 0.5833631157875061, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 106 |
| }, |
| { |
| "completion_length": 228.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 236.0, |
| "completions/max_terminated_length": 236.0, |
| "completions/mean_length": 228.0, |
| "completions/mean_terminated_length": 228.0, |
| "completions/min_length": 223.0, |
| "completions/min_terminated_length": 223.0, |
| "epoch": 0.0071333333333333335, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6257885694503784, |
| "kl": 0.40867361426353455, |
| "learning_rate": 1.1999596801769617e-06, |
| "loss": 0.0004, |
| "num_tokens": 442611.0, |
| "reward": 1.5999999046325684, |
| "reward_std": 0.04364356771111488, |
| "rewards/correctness_reward_func/mean": 1.0999999046325684, |
| "rewards/correctness_reward_func/std": 0.043643563985824585, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 107 |
| }, |
| { |
| "completion_length": 102.875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 108.0, |
| "completions/max_terminated_length": 108.0, |
| "completions/mean_length": 102.875, |
| "completions/mean_terminated_length": 102.875, |
| "completions/min_length": 98.0, |
| "completions/min_terminated_length": 98.0, |
| "epoch": 0.0072, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.187460422515869, |
| "kl": 0.6943673491477966, |
| "learning_rate": 1.1506233551212186e-06, |
| "loss": 0.0007, |
| "num_tokens": 444634.0, |
| "reward": 1.6156249046325684, |
| "reward_std": 0.13557912409305573, |
| "rewards/correctness_reward_func/mean": 1.1156249046325684, |
| "rewards/correctness_reward_func/std": 0.13557912409305573, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 108 |
| }, |
| { |
| "completion_length": 225.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 229.0, |
| "completions/max_terminated_length": 229.0, |
| "completions/mean_length": 225.0, |
| "completions/mean_terminated_length": 225.0, |
| "completions/min_length": 222.0, |
| "completions/min_terminated_length": 222.0, |
| "epoch": 0.007266666666666667, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7688984870910645, |
| "kl": 0.2997400760650635, |
| "learning_rate": 1.1020177413231334e-06, |
| "loss": 0.0003, |
| "num_tokens": 448178.0, |
| "reward": 1.9041666984558105, |
| "reward_std": 0.12010247260332108, |
| "rewards/correctness_reward_func/mean": 1.4041666984558105, |
| "rewards/correctness_reward_func/std": 0.12010248750448227, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 109 |
| }, |
| { |
| "completion_length": 511.375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 543.0, |
| "completions/max_terminated_length": 543.0, |
| "completions/mean_length": 511.375, |
| "completions/mean_terminated_length": 511.375, |
| "completions/min_length": 499.0, |
| "completions/min_terminated_length": 499.0, |
| "epoch": 0.007333333333333333, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5489156246185303, |
| "kl": 0.3106227219104767, |
| "learning_rate": 1.0541691595800338e-06, |
| "loss": 0.0003, |
| "num_tokens": 454669.0, |
| "reward": 1.2565972805023193, |
| "reward_std": 0.12646111845970154, |
| "rewards/correctness_reward_func/mean": 0.7722221612930298, |
| "rewards/correctness_reward_func/std": 0.09936108440160751, |
| "rewards/xmlcount_reward_func/mean": 0.484375, |
| "rewards/xmlcount_reward_func/std": 0.04419417306780815, |
| "step": 110 |
| }, |
| { |
| "completion_length": 105.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 109.0, |
| "completions/max_terminated_length": 109.0, |
| "completions/mean_length": 105.0, |
| "completions/mean_terminated_length": 105.0, |
| "completions/min_length": 100.0, |
| "completions/min_terminated_length": 100.0, |
| "epoch": 0.0074, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0009156708256341517, |
| "kl": 0.6540140509605408, |
| "learning_rate": 1.0071035207430352e-06, |
| "loss": 0.0007, |
| "num_tokens": 456733.0, |
| "reward": 1.850000023841858, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func/mean": 1.350000023841858, |
| "rewards/correctness_reward_func/std": 0.0, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 111 |
| }, |
| { |
| "completion_length": 305.625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 319.0, |
| "completions/max_terminated_length": 319.0, |
| "completions/mean_length": 305.625, |
| "completions/mean_terminated_length": 305.625, |
| "completions/min_length": 295.0, |
| "completions/min_terminated_length": 295.0, |
| "epoch": 0.007466666666666667, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6442992687225342, |
| "kl": 0.36618179082870483, |
| "learning_rate": 9.608463116858544e-07, |
| "loss": 0.0004, |
| "num_tokens": 461034.0, |
| "reward": 1.4826531410217285, |
| "reward_std": 0.04229113087058067, |
| "rewards/correctness_reward_func/mean": 0.982653021812439, |
| "rewards/correctness_reward_func/std": 0.04229113087058067, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 112 |
| }, |
| { |
| "completion_length": 144.5, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 232.0, |
| "completions/max_terminated_length": 232.0, |
| "completions/mean_length": 144.5, |
| "completions/mean_terminated_length": 144.5, |
| "completions/min_length": 102.0, |
| "completions/min_terminated_length": 102.0, |
| "epoch": 0.007533333333333334, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.6790902614593506, |
| "kl": 0.6071375012397766, |
| "learning_rate": 9.154225815032242e-07, |
| "loss": 0.0006, |
| "num_tokens": 463350.0, |
| "reward": 1.90625, |
| "reward_std": 0.03471820428967476, |
| "rewards/correctness_reward_func/mean": 1.40625, |
| "rewards/correctness_reward_func/std": 0.03471821919083595, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 113 |
| }, |
| { |
| "completion_length": 227.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 234.0, |
| "completions/max_terminated_length": 234.0, |
| "completions/mean_length": 227.0, |
| "completions/mean_terminated_length": 227.0, |
| "completions/min_length": 223.0, |
| "completions/min_terminated_length": 223.0, |
| "epoch": 0.0076, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0019490718841553, |
| "kl": 0.36533433198928833, |
| "learning_rate": 8.708569279463622e-07, |
| "loss": 0.0004, |
| "num_tokens": 466782.0, |
| "reward": 1.625, |
| "reward_std": 0.06842906773090363, |
| "rewards/correctness_reward_func/mean": 1.125, |
| "rewards/correctness_reward_func/std": 0.06842907518148422, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 114 |
| }, |
| { |
| "completion_length": 111.125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 120.0, |
| "completions/max_terminated_length": 120.0, |
| "completions/mean_length": 111.125, |
| "completions/mean_terminated_length": 111.125, |
| "completions/min_length": 105.0, |
| "completions/min_terminated_length": 105.0, |
| "epoch": 0.007666666666666666, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.1805026531219482, |
| "kl": 0.7322518825531006, |
| "learning_rate": 8.271734841028553e-07, |
| "loss": 0.0007, |
| "num_tokens": 468943.0, |
| "reward": 1.803125023841858, |
| "reward_std": 0.08908011764287949, |
| "rewards/correctness_reward_func/mean": 1.303125023841858, |
| "rewards/correctness_reward_func/std": 0.08908012509346008, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 115 |
| }, |
| { |
| "completion_length": 134.5, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 223.0, |
| "completions/max_terminated_length": 223.0, |
| "completions/mean_length": 134.5, |
| "completions/mean_terminated_length": 134.5, |
| "completions/min_length": 105.0, |
| "completions/min_terminated_length": 105.0, |
| "epoch": 0.007733333333333333, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.005333705805242062, |
| "kl": 0.5382705330848694, |
| "learning_rate": 7.843959053281663e-07, |
| "loss": 0.0005, |
| "num_tokens": 471387.0, |
| "reward": 3.5, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func/mean": 3.0, |
| "rewards/correctness_reward_func/std": 0.0, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 116 |
| }, |
| { |
| "completion_length": 388.5, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 401.0, |
| "completions/max_terminated_length": 401.0, |
| "completions/mean_length": 388.5, |
| "completions/mean_terminated_length": 388.5, |
| "completions/min_length": 381.0, |
| "completions/min_terminated_length": 381.0, |
| "epoch": 0.0078, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6017757654190063, |
| "kl": 0.38339361548423767, |
| "learning_rate": 7.425473564358457e-07, |
| "loss": 0.0004, |
| "num_tokens": 476759.0, |
| "reward": 1.30859375, |
| "reward_std": 0.03673892840743065, |
| "rewards/correctness_reward_func/mean": 0.80859375, |
| "rewards/correctness_reward_func/std": 0.03673893213272095, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 117 |
| }, |
| { |
| "completion_length": 72.25, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 108.0, |
| "completions/max_terminated_length": 108.0, |
| "completions/mean_length": 72.25, |
| "completions/mean_terminated_length": 72.25, |
| "completions/min_length": 56.0, |
| "completions/min_terminated_length": 56.0, |
| "epoch": 0.007866666666666666, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.01756666600704193, |
| "kl": 1.2723251581192017, |
| "learning_rate": 7.016504991533727e-07, |
| "loss": 0.0013, |
| "num_tokens": 478377.0, |
| "reward": 3.5, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func/mean": 3.0, |
| "rewards/correctness_reward_func/std": 0.0, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 118 |
| }, |
| { |
| "completion_length": 226.625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 237.0, |
| "completions/max_terminated_length": 237.0, |
| "completions/mean_length": 226.625, |
| "completions/mean_terminated_length": 226.625, |
| "completions/min_length": 219.0, |
| "completions/min_terminated_length": 219.0, |
| "epoch": 0.007933333333333334, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8822813034057617, |
| "kl": 0.5029420256614685, |
| "learning_rate": 6.617274798504286e-07, |
| "loss": 0.0005, |
| "num_tokens": 481918.0, |
| "reward": 1.566666603088379, |
| "reward_std": 0.025197675451636314, |
| "rewards/correctness_reward_func/mean": 1.066666603088379, |
| "rewards/correctness_reward_func/std": 0.025197653099894524, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 119 |
| }, |
| { |
| "completion_length": 401.5, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 411.0, |
| "completions/max_terminated_length": 411.0, |
| "completions/mean_length": 401.5, |
| "completions/mean_terminated_length": 401.5, |
| "completions/min_length": 393.0, |
| "completions/min_terminated_length": 393.0, |
| "epoch": 0.008, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8926092386245728, |
| "kl": 0.3140659034252167, |
| "learning_rate": 6.227999175462521e-07, |
| "loss": 0.0003, |
| "num_tokens": 487298.0, |
| "reward": 1.5382813215255737, |
| "reward_std": 0.049035049974918365, |
| "rewards/correctness_reward_func/mean": 1.0382813215255737, |
| "rewards/correctness_reward_func/std": 0.049035049974918365, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 120 |
| }, |
| { |
| "completion_length": 296.5, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 309.0, |
| "completions/max_terminated_length": 309.0, |
| "completions/mean_length": 296.5, |
| "completions/mean_terminated_length": 296.5, |
| "completions/min_length": 288.0, |
| "completions/min_terminated_length": 288.0, |
| "epoch": 0.008066666666666666, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1491118669509888, |
| "kl": 0.42884907126426697, |
| "learning_rate": 5.848888922025553e-07, |
| "loss": 0.0004, |
| "num_tokens": 491462.0, |
| "reward": 1.4948980808258057, |
| "reward_std": 0.04329225793480873, |
| "rewards/correctness_reward_func/mean": 0.9948980212211609, |
| "rewards/correctness_reward_func/std": 0.04329225420951843, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 121 |
| }, |
| { |
| "completion_length": 235.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 250.0, |
| "completions/max_terminated_length": 250.0, |
| "completions/mean_length": 235.0, |
| "completions/mean_terminated_length": 235.0, |
| "completions/min_length": 227.0, |
| "completions/min_terminated_length": 227.0, |
| "epoch": 0.008133333333333333, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8497012257575989, |
| "kl": 0.38695231080055237, |
| "learning_rate": 5.48014933308352e-07, |
| "loss": 0.0004, |
| "num_tokens": 494910.0, |
| "reward": 1.5625, |
| "reward_std": 0.02136235125362873, |
| "rewards/correctness_reward_func/mean": 1.0625, |
| "rewards/correctness_reward_func/std": 0.021362358704209328, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 122 |
| }, |
| { |
| "completion_length": 493.625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 515.0, |
| "completions/max_terminated_length": 515.0, |
| "completions/mean_length": 493.625, |
| "completions/mean_terminated_length": 493.625, |
| "completions/min_length": 458.0, |
| "completions/min_terminated_length": 458.0, |
| "epoch": 0.0082, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5217005610466003, |
| "kl": 0.29427477717399597, |
| "learning_rate": 5.121980087628802e-07, |
| "loss": 0.0003, |
| "num_tokens": 501307.0, |
| "reward": 1.3701388835906982, |
| "reward_std": 0.08480729162693024, |
| "rewards/correctness_reward_func/mean": 0.8701388835906982, |
| "rewards/correctness_reward_func/std": 0.08480728417634964, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 123 |
| }, |
| { |
| "completion_length": 300.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 305.0, |
| "completions/max_terminated_length": 305.0, |
| "completions/mean_length": 300.0, |
| "completions/mean_terminated_length": 300.0, |
| "completions/min_length": 290.0, |
| "completions/min_terminated_length": 290.0, |
| "epoch": 0.008266666666666667, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6252700090408325, |
| "kl": 0.3322438895702362, |
| "learning_rate": 4.774575140626317e-07, |
| "loss": 0.0003, |
| "num_tokens": 505563.0, |
| "reward": 1.5040816068649292, |
| "reward_std": 0.04487145319581032, |
| "rewards/correctness_reward_func/mean": 1.0040816068649292, |
| "rewards/correctness_reward_func/std": 0.04487145319581032, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 124 |
| }, |
| { |
| "completion_length": 398.375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 409.0, |
| "completions/max_terminated_length": 409.0, |
| "completions/mean_length": 398.375, |
| "completions/mean_terminated_length": 398.375, |
| "completions/min_length": 393.0, |
| "completions/min_terminated_length": 393.0, |
| "epoch": 0.008333333333333333, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5247986316680908, |
| "kl": 0.2880602180957794, |
| "learning_rate": 4.438122617983442e-07, |
| "loss": 0.0003, |
| "num_tokens": 510910.0, |
| "reward": 1.47265625, |
| "reward_std": 0.03534547612071037, |
| "rewards/correctness_reward_func/mean": 0.97265625, |
| "rewards/correctness_reward_func/std": 0.035345472395420074, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 125 |
| }, |
| { |
| "completion_length": 109.25, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 122.0, |
| "completions/max_terminated_length": 122.0, |
| "completions/mean_length": 109.25, |
| "completions/mean_terminated_length": 109.25, |
| "completions/min_length": 103.0, |
| "completions/min_terminated_length": 103.0, |
| "epoch": 0.0084, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.40777325630188, |
| "kl": 0.7852993607521057, |
| "learning_rate": 4.1128047146765936e-07, |
| "loss": 0.0008, |
| "num_tokens": 512952.0, |
| "reward": 1.8125, |
| "reward_std": 0.08017835021018982, |
| "rewards/correctness_reward_func/mean": 1.3125, |
| "rewards/correctness_reward_func/std": 0.08017835021018982, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 126 |
| }, |
| { |
| "completion_length": 113.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 140.0, |
| "completions/max_terminated_length": 140.0, |
| "completions/mean_length": 113.0, |
| "completions/mean_terminated_length": 113.0, |
| "completions/min_length": 107.0, |
| "completions/min_terminated_length": 107.0, |
| "epoch": 0.008466666666666667, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.6808403730392456, |
| "kl": 0.7311285734176636, |
| "learning_rate": 3.798797596089351e-07, |
| "loss": 0.0007, |
| "num_tokens": 515128.0, |
| "reward": 1.615625023841858, |
| "reward_std": 0.06258923560380936, |
| "rewards/correctness_reward_func/mean": 1.115625023841858, |
| "rewards/correctness_reward_func/std": 0.06258922815322876, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 127 |
| }, |
| { |
| "completion_length": 487.625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 530.0, |
| "completions/max_terminated_length": 530.0, |
| "completions/mean_length": 487.625, |
| "completions/mean_terminated_length": 487.625, |
| "completions/min_length": 437.0, |
| "completions/min_terminated_length": 437.0, |
| "epoch": 0.008533333333333334, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8878220319747925, |
| "kl": 0.2915445566177368, |
| "learning_rate": 3.4962713026158697e-07, |
| "loss": 0.0003, |
| "num_tokens": 521421.0, |
| "reward": 1.3439815044403076, |
| "reward_std": 0.11603187024593353, |
| "rewards/correctness_reward_func/mean": 0.8439815044403076, |
| "rewards/correctness_reward_func/std": 0.11603190749883652, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 128 |
| }, |
| { |
| "completion_length": 428.5, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 528.0, |
| "completions/max_terminated_length": 528.0, |
| "completions/mean_length": 428.5, |
| "completions/mean_terminated_length": 428.5, |
| "completions/min_length": 388.0, |
| "completions/min_terminated_length": 388.0, |
| "epoch": 0.0086, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4781865179538727, |
| "kl": 0.2633122205734253, |
| "learning_rate": 3.2053896575809426e-07, |
| "loss": 0.0003, |
| "num_tokens": 526913.0, |
| "reward": 1.40234375, |
| "reward_std": 0.038081441074609756, |
| "rewards/correctness_reward_func/mean": 0.90234375, |
| "rewards/correctness_reward_func/std": 0.03808142989873886, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 129 |
| }, |
| { |
| "completion_length": 119.5, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 173.0, |
| "completions/max_terminated_length": 173.0, |
| "completions/mean_length": 119.5, |
| "completions/mean_terminated_length": 119.5, |
| "completions/min_length": 105.0, |
| "completions/min_terminated_length": 105.0, |
| "epoch": 0.008666666666666666, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0010288404300808907, |
| "kl": 0.6225458979606628, |
| "learning_rate": 2.9263101785268253e-07, |
| "loss": 0.0006, |
| "num_tokens": 529037.0, |
| "reward": 1.774999976158142, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func/mean": 1.274999976158142, |
| "rewards/correctness_reward_func/std": 0.0, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 130 |
| }, |
| { |
| "completion_length": 474.5, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 499.0, |
| "completions/max_terminated_length": 499.0, |
| "completions/mean_length": 474.5, |
| "completions/mean_terminated_length": 474.5, |
| "completions/min_length": 431.0, |
| "completions/min_terminated_length": 431.0, |
| "epoch": 0.008733333333333333, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6302768588066101, |
| "kl": 0.34326815605163574, |
| "learning_rate": 2.6591839919146963e-07, |
| "loss": 0.0003, |
| "num_tokens": 535233.0, |
| "reward": 1.3050925731658936, |
| "reward_std": 0.12362809479236603, |
| "rewards/correctness_reward_func/mean": 0.8050925731658936, |
| "rewards/correctness_reward_func/std": 0.12362809479236603, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 131 |
| }, |
| { |
| "completion_length": 493.875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 499.0, |
| "completions/max_terminated_length": 499.0, |
| "completions/mean_length": 493.875, |
| "completions/mean_terminated_length": 493.875, |
| "completions/min_length": 484.0, |
| "completions/min_terminated_length": 484.0, |
| "epoch": 0.0088, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.7365954518318176, |
| "kl": 0.9036705493927002, |
| "learning_rate": 2.404155751286988e-07, |
| "loss": 0.0009, |
| "num_tokens": 541680.0, |
| "reward": 1.3611111640930176, |
| "reward_std": 0.05035635083913803, |
| "rewards/correctness_reward_func/mean": 0.8611111044883728, |
| "rewards/correctness_reward_func/std": 0.050356365740299225, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 132 |
| }, |
| { |
| "completion_length": 403.25, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 432.0, |
| "completions/max_terminated_length": 432.0, |
| "completions/mean_length": 403.25, |
| "completions/mean_terminated_length": 403.25, |
| "completions/min_length": 391.0, |
| "completions/min_terminated_length": 391.0, |
| "epoch": 0.008866666666666667, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5254420638084412, |
| "kl": 0.32496964931488037, |
| "learning_rate": 2.1613635589349756e-07, |
| "loss": 0.0003, |
| "num_tokens": 547090.0, |
| "reward": 1.4890625476837158, |
| "reward_std": 0.027900906279683113, |
| "rewards/correctness_reward_func/mean": 0.989062488079071, |
| "rewards/correctness_reward_func/std": 0.027900898829102516, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 133 |
| }, |
| { |
| "completion_length": 308.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 315.0, |
| "completions/max_terminated_length": 315.0, |
| "completions/mean_length": 308.0, |
| "completions/mean_terminated_length": 308.0, |
| "completions/min_length": 301.0, |
| "completions/min_terminated_length": 301.0, |
| "epoch": 0.008933333333333333, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.06843101978302, |
| "kl": 0.3652152419090271, |
| "learning_rate": 1.9309388911139427e-07, |
| "loss": 0.0004, |
| "num_tokens": 551474.0, |
| "reward": 1.485714316368103, |
| "reward_std": 0.04719790816307068, |
| "rewards/correctness_reward_func/mean": 0.985714316368103, |
| "rewards/correctness_reward_func/std": 0.04719791188836098, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 134 |
| }, |
| { |
| "completion_length": 511.875, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 560.0, |
| "completions/max_terminated_length": 515.0, |
| "completions/mean_length": 511.875, |
| "completions/mean_terminated_length": 505.0000305175781, |
| "completions/min_length": 489.0, |
| "completions/min_terminated_length": 489.0, |
| "epoch": 0.009, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.595820426940918, |
| "kl": 0.25694188475608826, |
| "learning_rate": 1.713006526846439e-07, |
| "loss": 0.0003, |
| "num_tokens": 558169.0, |
| "reward": 1.4703704118728638, |
| "reward_std": 0.046680908650159836, |
| "rewards/correctness_reward_func/mean": 0.970370352268219, |
| "rewards/correctness_reward_func/std": 0.04668092727661133, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 135 |
| }, |
| { |
| "completion_length": 311.25, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 325.0, |
| "completions/max_terminated_length": 325.0, |
| "completions/mean_length": 311.25, |
| "completions/mean_terminated_length": 311.25, |
| "completions/min_length": 305.0, |
| "completions/min_terminated_length": 305.0, |
| "epoch": 0.009066666666666667, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8594949841499329, |
| "kl": 0.34354594349861145, |
| "learning_rate": 1.507684480352292e-07, |
| "loss": 0.0003, |
| "num_tokens": 562531.0, |
| "reward": 1.4979591369628906, |
| "reward_std": 0.0392710380256176, |
| "rewards/correctness_reward_func/mean": 0.9979591369628906, |
| "rewards/correctness_reward_func/std": 0.03927105292677879, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 136 |
| }, |
| { |
| "completion_length": 312.25, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 327.0, |
| "completions/max_terminated_length": 327.0, |
| "completions/mean_length": 312.25, |
| "completions/mean_terminated_length": 312.25, |
| "completions/min_length": 300.0, |
| "completions/min_terminated_length": 300.0, |
| "epoch": 0.009133333333333334, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.516212821006775, |
| "kl": 0.3029043972492218, |
| "learning_rate": 1.31508393714177e-07, |
| "loss": 0.0003, |
| "num_tokens": 566933.0, |
| "reward": 1.5897959470748901, |
| "reward_std": 0.031389568001031876, |
| "rewards/correctness_reward_func/mean": 1.0897959470748901, |
| "rewards/correctness_reward_func/std": 0.03138954937458038, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 137 |
| }, |
| { |
| "completion_length": 64.125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 70.0, |
| "completions/max_terminated_length": 70.0, |
| "completions/mean_length": 64.125, |
| "completions/mean_terminated_length": 64.125, |
| "completions/min_length": 58.0, |
| "completions/min_terminated_length": 58.0, |
| "epoch": 0.0092, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.05483343079686165, |
| "kl": 1.4965819120407104, |
| "learning_rate": 1.1353091938067024e-07, |
| "loss": 0.0015, |
| "num_tokens": 568670.0, |
| "reward": 3.5, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func/mean": 3.0, |
| "rewards/correctness_reward_func/std": 0.0, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 138 |
| }, |
| { |
| "completion_length": 303.25, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 310.0, |
| "completions/max_terminated_length": 310.0, |
| "completions/mean_length": 303.25, |
| "completions/mean_terminated_length": 303.25, |
| "completions/min_length": 298.0, |
| "completions/min_terminated_length": 298.0, |
| "epoch": 0.009266666666666666, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6266666650772095, |
| "kl": 0.35942456126213074, |
| "learning_rate": 9.684576015420277e-08, |
| "loss": 0.0004, |
| "num_tokens": 572984.0, |
| "reward": 1.430612325668335, |
| "reward_std": 0.03644194081425667, |
| "rewards/correctness_reward_func/mean": 0.9306122064590454, |
| "rewards/correctness_reward_func/std": 0.03644197806715965, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 139 |
| }, |
| { |
| "completion_length": 494.375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 517.0, |
| "completions/max_terminated_length": 517.0, |
| "completions/mean_length": 494.375, |
| "completions/mean_terminated_length": 494.375, |
| "completions/min_length": 444.0, |
| "completions/min_terminated_length": 444.0, |
| "epoch": 0.009333333333333334, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5252565145492554, |
| "kl": 0.28363481163978577, |
| "learning_rate": 8.146195134284052e-08, |
| "loss": 0.0003, |
| "num_tokens": 579523.0, |
| "reward": 1.3534722328186035, |
| "reward_std": 0.10252274572849274, |
| "rewards/correctness_reward_func/mean": 0.8534722328186035, |
| "rewards/correctness_reward_func/std": 0.10252274572849274, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 140 |
| }, |
| { |
| "completion_length": 229.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 239.0, |
| "completions/max_terminated_length": 239.0, |
| "completions/mean_length": 229.0, |
| "completions/mean_terminated_length": 229.0, |
| "completions/min_length": 221.0, |
| "completions/min_terminated_length": 221.0, |
| "epoch": 0.0094, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1325074434280396, |
| "kl": 0.38521093130111694, |
| "learning_rate": 6.738782355044048e-08, |
| "loss": 0.0004, |
| "num_tokens": 583011.0, |
| "reward": 1.5833333730697632, |
| "reward_std": 0.043643586337566376, |
| "rewards/correctness_reward_func/mean": 1.0833333730697632, |
| "rewards/correctness_reward_func/std": 0.04364357516169548, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 141 |
| }, |
| { |
| "completion_length": 312.625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 376.0, |
| "completions/max_terminated_length": 376.0, |
| "completions/mean_length": 312.625, |
| "completions/mean_terminated_length": 312.625, |
| "completions/min_length": 300.0, |
| "completions/min_terminated_length": 300.0, |
| "epoch": 0.009466666666666667, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6133989691734314, |
| "kl": 0.3078048825263977, |
| "learning_rate": 5.463099816548578e-08, |
| "loss": 0.0003, |
| "num_tokens": 587368.0, |
| "reward": 1.4489796161651611, |
| "reward_std": 0.03463380038738251, |
| "rewards/correctness_reward_func/mean": 0.9489796161651611, |
| "rewards/correctness_reward_func/std": 0.03463379293680191, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 142 |
| }, |
| { |
| "completion_length": 400.375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 416.0, |
| "completions/max_terminated_length": 416.0, |
| "completions/mean_length": 400.375, |
| "completions/mean_terminated_length": 400.375, |
| "completions/min_length": 394.0, |
| "completions/min_terminated_length": 394.0, |
| "epoch": 0.009533333333333333, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5225932002067566, |
| "kl": 0.2902478873729706, |
| "learning_rate": 4.319838323396691e-08, |
| "loss": 0.0003, |
| "num_tokens": 592715.0, |
| "reward": 1.4421875476837158, |
| "reward_std": 0.0357866995036602, |
| "rewards/correctness_reward_func/mean": 0.942187488079071, |
| "rewards/correctness_reward_func/std": 0.03578675165772438, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 143 |
| }, |
| { |
| "completion_length": 401.375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 411.0, |
| "completions/max_terminated_length": 411.0, |
| "completions/mean_length": 401.375, |
| "completions/mean_terminated_length": 401.375, |
| "completions/min_length": 391.0, |
| "completions/min_terminated_length": 391.0, |
| "epoch": 0.0096, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5049280524253845, |
| "kl": 0.27406245470046997, |
| "learning_rate": 3.309616971855195e-08, |
| "loss": 0.0003, |
| "num_tokens": 598118.0, |
| "reward": 1.5125000476837158, |
| "reward_std": 0.056694649159908295, |
| "rewards/correctness_reward_func/mean": 1.0125000476837158, |
| "rewards/correctness_reward_func/std": 0.05669466406106949, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 144 |
| }, |
| { |
| "completion_length": 68.5, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 95.0, |
| "completions/max_terminated_length": 95.0, |
| "completions/mean_length": 68.5, |
| "completions/mean_terminated_length": 68.5, |
| "completions/min_length": 51.0, |
| "completions/min_terminated_length": 51.0, |
| "epoch": 0.009666666666666667, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.006095044314861298, |
| "kl": 1.1216628551483154, |
| "learning_rate": 2.4329828146074096e-08, |
| "loss": 0.0011, |
| "num_tokens": 599706.0, |
| "reward": 2.0333333015441895, |
| "reward_std": 0.0, |
| "rewards/correctness_reward_func/mean": 1.5333333015441895, |
| "rewards/correctness_reward_func/std": 0.0, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 145 |
| }, |
| { |
| "completion_length": 60.625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 70.0, |
| "completions/max_terminated_length": 70.0, |
| "completions/mean_length": 60.625, |
| "completions/mean_terminated_length": 60.625, |
| "completions/min_length": 57.0, |
| "completions/min_terminated_length": 57.0, |
| "epoch": 0.009733333333333333, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.0240890979766846, |
| "kl": 1.1488873958587646, |
| "learning_rate": 1.6904105645142443e-08, |
| "loss": 0.0011, |
| "num_tokens": 601295.0, |
| "reward": 2.308333396911621, |
| "reward_std": 0.7535629868507385, |
| "rewards/correctness_reward_func/mean": 1.808333396911621, |
| "rewards/correctness_reward_func/std": 0.7535629868507385, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 146 |
| }, |
| { |
| "completion_length": 228.5, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 236.0, |
| "completions/max_terminated_length": 236.0, |
| "completions/mean_length": 228.5, |
| "completions/mean_terminated_length": 228.5, |
| "completions/min_length": 222.0, |
| "completions/min_terminated_length": 222.0, |
| "epoch": 0.0098, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8666861653327942, |
| "kl": 0.8305746912956238, |
| "learning_rate": 1.0823023375489128e-08, |
| "loss": 0.0008, |
| "num_tokens": 604659.0, |
| "reward": 1.379166603088379, |
| "reward_std": 0.017251623794436455, |
| "rewards/correctness_reward_func/mean": 0.8791666626930237, |
| "rewards/correctness_reward_func/std": 0.017251623794436455, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 147 |
| }, |
| { |
| "completion_length": 226.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 237.0, |
| "completions/max_terminated_length": 237.0, |
| "completions/mean_length": 226.0, |
| "completions/mean_terminated_length": 226.0, |
| "completions/min_length": 221.0, |
| "completions/min_terminated_length": 221.0, |
| "epoch": 0.009866666666666666, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6299178600311279, |
| "kl": 0.3639201819896698, |
| "learning_rate": 6.089874350439507e-09, |
| "loss": 0.0004, |
| "num_tokens": 608107.0, |
| "reward": 1.629166603088379, |
| "reward_std": 0.05473604425787926, |
| "rewards/correctness_reward_func/mean": 1.129166603088379, |
| "rewards/correctness_reward_func/std": 0.054736021906137466, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 148 |
| }, |
| { |
| "completion_length": 103.375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 107.0, |
| "completions/max_terminated_length": 107.0, |
| "completions/mean_length": 103.375, |
| "completions/mean_terminated_length": 103.375, |
| "completions/min_length": 99.0, |
| "completions/min_terminated_length": 99.0, |
| "epoch": 0.009933333333333334, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.348041534423828, |
| "kl": 0.7296304702758789, |
| "learning_rate": 2.7072216536885855e-09, |
| "loss": 0.0007, |
| "num_tokens": 610142.0, |
| "reward": 1.896875023841858, |
| "reward_std": 0.03881613910198212, |
| "rewards/correctness_reward_func/mean": 1.396875023841858, |
| "rewards/correctness_reward_func/std": 0.03881615400314331, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 149 |
| }, |
| { |
| "completion_length": 74.25, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 95.0, |
| "completions/max_terminated_length": 95.0, |
| "completions/mean_length": 74.25, |
| "completions/mean_terminated_length": 74.25, |
| "completions/min_length": 65.0, |
| "completions/min_terminated_length": 65.0, |
| "epoch": 0.01, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.000831127166748, |
| "kl": 1.3544789552688599, |
| "learning_rate": 6.768970513457151e-10, |
| "loss": 0.0014, |
| "num_tokens": 611768.0, |
| "reward": 1.7000000476837158, |
| "reward_std": 0.0617213174700737, |
| "rewards/correctness_reward_func/mean": 1.2000000476837158, |
| "rewards/correctness_reward_func/std": 0.06172133609652519, |
| "rewards/xmlcount_reward_func/mean": 0.5, |
| "rewards/xmlcount_reward_func/std": 0.0, |
| "step": 150 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 150, |
| "num_input_tokens_seen": 611768, |
| "num_train_epochs": 1, |
| "save_steps": 150, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|