| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.16146393972012918, |
| "eval_steps": 500, |
| "global_step": 150, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1674.0, |
| "completions/max_terminated_length": 1674.0, |
| "completions/mean_length": 871.875, |
| "completions/mean_terminated_length": 871.875, |
| "completions/min_length": 399.0, |
| "completions/min_terminated_length": 399.0, |
| "entropy": 0.18132147192955017, |
| "epoch": 0.001076426264800861, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.6484375, |
| "learning_rate": 1e-05, |
| "loss": -0.1149, |
| "num_tokens": 90882.0, |
| "reward": 42.04616928100586, |
| "reward_std": 51.792091369628906, |
| "rewards/Rewards/mean": 42.046173095703125, |
| "rewards/Rewards/std": 144.7001190185547, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9934591054916382, |
| "sampling/importance_sampling_ratio/min": 0.08985913544893265, |
| "sampling/sampling_logp_difference/max": 2.4095120429992676, |
| "sampling/sampling_logp_difference/mean": 0.02156674861907959, |
| "step": 1 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1762.0, |
| "completions/max_terminated_length": 1762.0, |
| "completions/mean_length": 896.2291870117188, |
| "completions/mean_terminated_length": 896.2291870117188, |
| "completions/min_length": 468.0, |
| "completions/min_terminated_length": 468.0, |
| "entropy": 0.15218639373779297, |
| "epoch": 0.002152852529601722, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.9296875, |
| "learning_rate": 9.989235737351993e-06, |
| "loss": -0.1743, |
| "num_tokens": 176615.0, |
| "reward": 168.61300659179688, |
| "reward_std": 52.48692321777344, |
| "rewards/Rewards/mean": 168.61297607421875, |
| "rewards/Rewards/std": 160.77197265625, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9949066638946533, |
| "sampling/importance_sampling_ratio/min": 0.19706952571868896, |
| "sampling/sampling_logp_difference/max": 1.6241986751556396, |
| "sampling/sampling_logp_difference/mean": 0.020902985706925392, |
| "step": 2 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1708.0, |
| "completions/max_terminated_length": 1708.0, |
| "completions/mean_length": 674.625, |
| "completions/mean_terminated_length": 674.625, |
| "completions/min_length": 297.0, |
| "completions/min_terminated_length": 297.0, |
| "entropy": 0.10972679406404495, |
| "epoch": 0.0032292787944025836, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.546875, |
| "learning_rate": 9.978471474703984e-06, |
| "loss": -0.1742, |
| "num_tokens": 255089.0, |
| "reward": 114.97002410888672, |
| "reward_std": 28.174999237060547, |
| "rewards/Rewards/mean": 114.97003173828125, |
| "rewards/Rewards/std": 152.3834228515625, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9959638714790344, |
| "sampling/importance_sampling_ratio/min": 0.11370240896940231, |
| "sampling/sampling_logp_difference/max": 2.174170732498169, |
| "sampling/sampling_logp_difference/mean": 0.01859966665506363, |
| "step": 3 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1724.0, |
| "completions/max_terminated_length": 1724.0, |
| "completions/mean_length": 789.8541870117188, |
| "completions/mean_terminated_length": 789.8541870117188, |
| "completions/min_length": 484.0, |
| "completions/min_terminated_length": 484.0, |
| "entropy": 0.10646244883537292, |
| "epoch": 0.004305705059203444, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.359375, |
| "learning_rate": 9.967707212055974e-06, |
| "loss": -0.1154, |
| "num_tokens": 338194.0, |
| "reward": 225.43746948242188, |
| "reward_std": 24.845914840698242, |
| "rewards/Rewards/mean": 225.4374542236328, |
| "rewards/Rewards/std": 109.95954132080078, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9960993528366089, |
| "sampling/importance_sampling_ratio/min": 0.10363264381885529, |
| "sampling/sampling_logp_difference/max": 2.2669029235839844, |
| "sampling/sampling_logp_difference/mean": 0.01849844679236412, |
| "step": 4 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.02083333395421505, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2045.0, |
| "completions/mean_length": 1027.5208740234375, |
| "completions/mean_terminated_length": 1005.8084716796875, |
| "completions/min_length": 355.0, |
| "completions/min_terminated_length": 355.0, |
| "entropy": 0.11067882180213928, |
| "epoch": 0.005382131324004306, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.953125, |
| "learning_rate": 9.956942949407966e-06, |
| "loss": -0.1189, |
| "num_tokens": 433241.0, |
| "reward": 120.4049072265625, |
| "reward_std": 54.20726013183594, |
| "rewards/Rewards/mean": 120.4049072265625, |
| "rewards/Rewards/std": 154.53668212890625, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9961210489273071, |
| "sampling/importance_sampling_ratio/min": 0.1546708047389984, |
| "sampling/sampling_logp_difference/max": 1.8664562702178955, |
| "sampling/sampling_logp_difference/mean": 0.01938852109014988, |
| "step": 5 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1719.0, |
| "completions/max_terminated_length": 1719.0, |
| "completions/mean_length": 1006.4791870117188, |
| "completions/mean_terminated_length": 1006.4791870117188, |
| "completions/min_length": 622.0, |
| "completions/min_terminated_length": 622.0, |
| "entropy": 0.11143378913402557, |
| "epoch": 0.006458557588805167, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.109375, |
| "learning_rate": 9.946178686759958e-06, |
| "loss": -0.0849, |
| "num_tokens": 527308.0, |
| "reward": 137.47239685058594, |
| "reward_std": 92.79216003417969, |
| "rewards/Rewards/mean": 137.47238159179688, |
| "rewards/Rewards/std": 149.94862365722656, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9963271021842957, |
| "sampling/importance_sampling_ratio/min": 0.09281564503908157, |
| "sampling/sampling_logp_difference/max": 2.3771400451660156, |
| "sampling/sampling_logp_difference/mean": 0.019732967019081116, |
| "step": 6 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0416666679084301, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1742.0, |
| "completions/mean_length": 1053.125, |
| "completions/mean_terminated_length": 1009.8695678710938, |
| "completions/min_length": 555.0, |
| "completions/min_terminated_length": 555.0, |
| "entropy": 0.10790788382291794, |
| "epoch": 0.007534983853606028, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.125, |
| "learning_rate": 9.93541442411195e-06, |
| "loss": -0.1188, |
| "num_tokens": 625750.0, |
| "reward": 123.31615447998047, |
| "reward_std": 75.22834014892578, |
| "rewards/Rewards/mean": 123.3161392211914, |
| "rewards/Rewards/std": 152.19955444335938, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9965001344680786, |
| "sampling/importance_sampling_ratio/min": 0.10315173864364624, |
| "sampling/sampling_logp_difference/max": 2.2715542316436768, |
| "sampling/sampling_logp_difference/mean": 0.01938672922551632, |
| "step": 7 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1666666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1997.0, |
| "completions/mean_length": 1300.5, |
| "completions/mean_terminated_length": 1151.0, |
| "completions/min_length": 540.0, |
| "completions/min_terminated_length": 540.0, |
| "entropy": 0.11116605252027512, |
| "epoch": 0.008611410118406888, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 1.515625, |
| "learning_rate": 9.92465016146394e-06, |
| "loss": -0.008, |
| "num_tokens": 735160.0, |
| "reward": 127.77455139160156, |
| "reward_std": 100.4245834350586, |
| "rewards/Rewards/mean": 127.77454376220703, |
| "rewards/Rewards/std": 161.16799926757812, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.996357798576355, |
| "sampling/importance_sampling_ratio/min": 0.07584778964519501, |
| "sampling/sampling_logp_difference/max": 2.579026699066162, |
| "sampling/sampling_logp_difference/mean": 0.01950543373823166, |
| "step": 8 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1982.0, |
| "completions/mean_length": 1292.666748046875, |
| "completions/mean_terminated_length": 1242.3111572265625, |
| "completions/min_length": 615.0, |
| "completions/min_terminated_length": 615.0, |
| "entropy": 0.10612079501152039, |
| "epoch": 0.00968783638320775, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.8203125, |
| "learning_rate": 9.913885898815931e-06, |
| "loss": -0.0727, |
| "num_tokens": 843408.0, |
| "reward": 78.78534698486328, |
| "reward_std": 78.33949279785156, |
| "rewards/Rewards/mean": 78.78535461425781, |
| "rewards/Rewards/std": 138.2502899169922, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9955548048019409, |
| "sampling/importance_sampling_ratio/min": 0.007099959533661604, |
| "sampling/sampling_logp_difference/max": 4.947666168212891, |
| "sampling/sampling_logp_difference/mean": 0.0185045525431633, |
| "step": 9 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2291666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2004.0, |
| "completions/mean_length": 1513.8958740234375, |
| "completions/mean_terminated_length": 1355.108154296875, |
| "completions/min_length": 587.0, |
| "completions/min_terminated_length": 587.0, |
| "entropy": 0.10934104025363922, |
| "epoch": 0.010764262648008612, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.6640625, |
| "learning_rate": 9.903121636167923e-06, |
| "loss": 0.0008, |
| "num_tokens": 964243.0, |
| "reward": 142.08834838867188, |
| "reward_std": 111.20391082763672, |
| "rewards/Rewards/mean": 142.08831787109375, |
| "rewards/Rewards/std": 154.59608459472656, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9958118200302124, |
| "sampling/importance_sampling_ratio/min": 0.05715308338403702, |
| "sampling/sampling_logp_difference/max": 2.8620219230651855, |
| "sampling/sampling_logp_difference/mean": 0.018251175060868263, |
| "step": 10 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2043.0, |
| "completions/mean_length": 1362.8333740234375, |
| "completions/mean_terminated_length": 1264.952392578125, |
| "completions/min_length": 560.0, |
| "completions/min_terminated_length": 560.0, |
| "entropy": 0.1081463098526001, |
| "epoch": 0.011840688912809472, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.765625, |
| "learning_rate": 9.892357373519915e-06, |
| "loss": -0.0131, |
| "num_tokens": 1071719.0, |
| "reward": 195.29440307617188, |
| "reward_std": 109.97010803222656, |
| "rewards/Rewards/mean": 195.29437255859375, |
| "rewards/Rewards/std": 153.7947540283203, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9960513114929199, |
| "sampling/importance_sampling_ratio/min": 0.009758302941918373, |
| "sampling/sampling_logp_difference/max": 4.629636764526367, |
| "sampling/sampling_logp_difference/mean": 0.019215960055589676, |
| "step": 11 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2916666865348816, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2043.0, |
| "completions/mean_length": 1750.354248046875, |
| "completions/mean_terminated_length": 1627.7940673828125, |
| "completions/min_length": 831.0, |
| "completions/min_terminated_length": 831.0, |
| "entropy": 0.11442270129919052, |
| "epoch": 0.012917115177610334, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.4375, |
| "learning_rate": 9.881593110871906e-06, |
| "loss": 0.0184, |
| "num_tokens": 1211050.0, |
| "reward": 164.88296508789062, |
| "reward_std": 126.29165649414062, |
| "rewards/Rewards/mean": 164.88294982910156, |
| "rewards/Rewards/std": 163.17112731933594, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9958729147911072, |
| "sampling/importance_sampling_ratio/min": 0.13337160646915436, |
| "sampling/sampling_logp_difference/max": 2.014616012573242, |
| "sampling/sampling_logp_difference/mean": 0.018926220014691353, |
| "step": 12 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3333333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1879.0, |
| "completions/mean_length": 1591.3958740234375, |
| "completions/mean_terminated_length": 1363.09375, |
| "completions/min_length": 665.0, |
| "completions/min_terminated_length": 665.0, |
| "entropy": 0.10113458335399628, |
| "epoch": 0.013993541442411194, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.4921875, |
| "learning_rate": 9.870828848223898e-06, |
| "loss": 0.0255, |
| "num_tokens": 1332917.0, |
| "reward": 112.35260009765625, |
| "reward_std": 93.94807434082031, |
| "rewards/Rewards/mean": 112.35260009765625, |
| "rewards/Rewards/std": 141.7664337158203, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9962171912193298, |
| "sampling/importance_sampling_ratio/min": 0.04543043673038483, |
| "sampling/sampling_logp_difference/max": 3.0915729999542236, |
| "sampling/sampling_logp_difference/mean": 0.017922332510352135, |
| "step": 13 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1997.0, |
| "completions/mean_length": 1511.2708740234375, |
| "completions/mean_terminated_length": 1387.4102783203125, |
| "completions/min_length": 651.0, |
| "completions/min_terminated_length": 651.0, |
| "entropy": 0.10830745100975037, |
| "epoch": 0.015069967707212056, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.53125, |
| "learning_rate": 9.86006458557589e-06, |
| "loss": -0.0334, |
| "num_tokens": 1449234.0, |
| "reward": 123.8992919921875, |
| "reward_std": 88.19905853271484, |
| "rewards/Rewards/mean": 123.8992691040039, |
| "rewards/Rewards/std": 146.86688232421875, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9959671497344971, |
| "sampling/importance_sampling_ratio/min": 0.18372225761413574, |
| "sampling/sampling_logp_difference/max": 1.694330096244812, |
| "sampling/sampling_logp_difference/mean": 0.01905238628387451, |
| "step": 14 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2916666865348816, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2016.0, |
| "completions/mean_length": 1455.4583740234375, |
| "completions/mean_terminated_length": 1211.4705810546875, |
| "completions/min_length": 693.0, |
| "completions/min_terminated_length": 693.0, |
| "entropy": 0.10178375989198685, |
| "epoch": 0.016146393972012917, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.7109375, |
| "learning_rate": 9.84930032292788e-06, |
| "loss": -0.0285, |
| "num_tokens": 1566238.0, |
| "reward": 104.54838562011719, |
| "reward_std": 63.17893600463867, |
| "rewards/Rewards/mean": 104.54837036132812, |
| "rewards/Rewards/std": 155.53961181640625, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9962810277938843, |
| "sampling/importance_sampling_ratio/min": 0.1905871480703354, |
| "sampling/sampling_logp_difference/max": 1.6576457023620605, |
| "sampling/sampling_logp_difference/mean": 0.018221702426671982, |
| "step": 15 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.4583333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2045.0, |
| "completions/mean_length": 1741.9583740234375, |
| "completions/mean_terminated_length": 1483.0, |
| "completions/min_length": 720.0, |
| "completions/min_terminated_length": 720.0, |
| "entropy": 0.10251286625862122, |
| "epoch": 0.017222820236813777, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 1.2421875, |
| "learning_rate": 9.838536060279871e-06, |
| "loss": 0.0534, |
| "num_tokens": 1704008.0, |
| "reward": 89.67097473144531, |
| "reward_std": 84.6678695678711, |
| "rewards/Rewards/mean": 89.67096710205078, |
| "rewards/Rewards/std": 131.33070373535156, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9964886903762817, |
| "sampling/importance_sampling_ratio/min": 0.035292141139507294, |
| "sampling/sampling_logp_difference/max": 3.34409499168396, |
| "sampling/sampling_logp_difference/mean": 0.01839987002313137, |
| "step": 16 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.4583333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2025.0, |
| "completions/mean_length": 1682.666748046875, |
| "completions/mean_terminated_length": 1373.5384521484375, |
| "completions/min_length": 472.0, |
| "completions/min_terminated_length": 472.0, |
| "entropy": 0.10447351634502411, |
| "epoch": 0.01829924650161464, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3984375, |
| "learning_rate": 9.827771797631863e-06, |
| "loss": 0.0515, |
| "num_tokens": 1833718.0, |
| "reward": 132.6174774169922, |
| "reward_std": 95.32293701171875, |
| "rewards/Rewards/mean": 132.6174774169922, |
| "rewards/Rewards/std": 155.01036071777344, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9960591197013855, |
| "sampling/importance_sampling_ratio/min": 0.11370313912630081, |
| "sampling/sampling_logp_difference/max": 2.174164295196533, |
| "sampling/sampling_logp_difference/mean": 0.01829143613576889, |
| "step": 17 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2916666865348816, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2029.0, |
| "completions/mean_length": 1645.479248046875, |
| "completions/mean_terminated_length": 1479.7353515625, |
| "completions/min_length": 807.0, |
| "completions/min_terminated_length": 807.0, |
| "entropy": 0.10255686193704605, |
| "epoch": 0.0193756727664155, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5234375, |
| "learning_rate": 9.817007534983855e-06, |
| "loss": -0.0021, |
| "num_tokens": 1959159.0, |
| "reward": 118.39625549316406, |
| "reward_std": 70.67788696289062, |
| "rewards/Rewards/mean": 118.396240234375, |
| "rewards/Rewards/std": 147.87017822265625, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.995873749256134, |
| "sampling/importance_sampling_ratio/min": 0.04542801156640053, |
| "sampling/sampling_logp_difference/max": 3.0916264057159424, |
| "sampling/sampling_logp_difference/mean": 0.018819186836481094, |
| "step": 18 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2708333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1975.0, |
| "completions/mean_length": 1593.291748046875, |
| "completions/mean_terminated_length": 1424.4000244140625, |
| "completions/min_length": 788.0, |
| "completions/min_terminated_length": 788.0, |
| "entropy": 0.10415040701627731, |
| "epoch": 0.02045209903121636, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.6796875, |
| "learning_rate": 9.806243272335847e-06, |
| "loss": 0.02, |
| "num_tokens": 2082473.0, |
| "reward": 192.97549438476562, |
| "reward_std": 106.2840805053711, |
| "rewards/Rewards/mean": 192.9755096435547, |
| "rewards/Rewards/std": 158.14520263671875, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9962199330329895, |
| "sampling/importance_sampling_ratio/min": 0.15721529722213745, |
| "sampling/sampling_logp_difference/max": 1.8501391410827637, |
| "sampling/sampling_logp_difference/mean": 0.018406182527542114, |
| "step": 19 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1458333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2025.0, |
| "completions/mean_length": 1522.75, |
| "completions/mean_terminated_length": 1433.0731201171875, |
| "completions/min_length": 720.0, |
| "completions/min_terminated_length": 720.0, |
| "entropy": 0.09845548868179321, |
| "epoch": 0.021528525296017224, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.625, |
| "learning_rate": 9.795479009687837e-06, |
| "loss": -0.0126, |
| "num_tokens": 2202803.0, |
| "reward": 203.66549682617188, |
| "reward_std": 74.4580078125, |
| "rewards/Rewards/mean": 203.6654815673828, |
| "rewards/Rewards/std": 151.41436767578125, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9963464736938477, |
| "sampling/importance_sampling_ratio/min": 0.025838343426585197, |
| "sampling/sampling_logp_difference/max": 3.655895709991455, |
| "sampling/sampling_logp_difference/mean": 0.017935417592525482, |
| "step": 20 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2015.0, |
| "completions/mean_length": 1649.7708740234375, |
| "completions/mean_terminated_length": 1468.757568359375, |
| "completions/min_length": 834.0, |
| "completions/min_terminated_length": 834.0, |
| "entropy": 0.10193191468715668, |
| "epoch": 0.022604951560818085, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.640625, |
| "learning_rate": 9.784714747039828e-06, |
| "loss": 0.0792, |
| "num_tokens": 2327952.0, |
| "reward": 170.25717163085938, |
| "reward_std": 125.09175872802734, |
| "rewards/Rewards/mean": 170.2571563720703, |
| "rewards/Rewards/std": 162.55516052246094, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9963719844818115, |
| "sampling/importance_sampling_ratio/min": 0.15727432072162628, |
| "sampling/sampling_logp_difference/max": 1.8497637510299683, |
| "sampling/sampling_logp_difference/mean": 0.018050983548164368, |
| "step": 21 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2291666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2012.0, |
| "completions/mean_length": 1579.729248046875, |
| "completions/mean_terminated_length": 1440.5135498046875, |
| "completions/min_length": 643.0, |
| "completions/min_terminated_length": 643.0, |
| "entropy": 0.10178973525762558, |
| "epoch": 0.023681377825618945, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.7109375, |
| "learning_rate": 9.77395048439182e-06, |
| "loss": -0.0035, |
| "num_tokens": 2447459.0, |
| "reward": 188.00665283203125, |
| "reward_std": 95.58663177490234, |
| "rewards/Rewards/mean": 188.00665283203125, |
| "rewards/Rewards/std": 158.27713012695312, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9962644577026367, |
| "sampling/importance_sampling_ratio/min": 0.1137038916349411, |
| "sampling/sampling_logp_difference/max": 2.1741576194763184, |
| "sampling/sampling_logp_difference/mean": 0.018942642956972122, |
| "step": 22 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2291666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2047.0, |
| "completions/mean_length": 1589.791748046875, |
| "completions/mean_terminated_length": 1453.567626953125, |
| "completions/min_length": 630.0, |
| "completions/min_terminated_length": 630.0, |
| "entropy": 0.10466985404491425, |
| "epoch": 0.024757804090419805, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.59375, |
| "learning_rate": 9.763186221743812e-06, |
| "loss": 0.0422, |
| "num_tokens": 2565853.0, |
| "reward": 163.7479248046875, |
| "reward_std": 122.3080062866211, |
| "rewards/Rewards/mean": 163.7479248046875, |
| "rewards/Rewards/std": 170.48175048828125, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9957739114761353, |
| "sampling/importance_sampling_ratio/min": 0.057831499725580215, |
| "sampling/sampling_logp_difference/max": 2.850221633911133, |
| "sampling/sampling_logp_difference/mean": 0.019438397139310837, |
| "step": 23 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.25, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2021.0, |
| "completions/mean_length": 1572.8125, |
| "completions/mean_terminated_length": 1414.4166259765625, |
| "completions/min_length": 722.0, |
| "completions/min_terminated_length": 722.0, |
| "entropy": 0.1026575118303299, |
| "epoch": 0.02583423035522067, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.390625, |
| "learning_rate": 9.752421959095802e-06, |
| "loss": -0.0083, |
| "num_tokens": 2690938.0, |
| "reward": 131.00326538085938, |
| "reward_std": 95.41658020019531, |
| "rewards/Rewards/mean": 131.00328063964844, |
| "rewards/Rewards/std": 156.06686401367188, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9963216781616211, |
| "sampling/importance_sampling_ratio/min": 0.20426322519779205, |
| "sampling/sampling_logp_difference/max": 1.5883457660675049, |
| "sampling/sampling_logp_difference/mean": 0.018099797889590263, |
| "step": 24 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3958333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2013.0, |
| "completions/mean_length": 1594.0, |
| "completions/mean_terminated_length": 1296.5517578125, |
| "completions/min_length": 548.0, |
| "completions/min_terminated_length": 548.0, |
| "entropy": 0.10098929703235626, |
| "epoch": 0.02691065662002153, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 1.375, |
| "learning_rate": 9.741657696447793e-06, |
| "loss": 0.0231, |
| "num_tokens": 2830276.0, |
| "reward": 183.90872192382812, |
| "reward_std": 96.02549743652344, |
| "rewards/Rewards/mean": 183.90869140625, |
| "rewards/Rewards/std": 162.77676391601562, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9963605403900146, |
| "sampling/importance_sampling_ratio/min": 0.06367813050746918, |
| "sampling/sampling_logp_difference/max": 2.7539141178131104, |
| "sampling/sampling_logp_difference/mean": 0.01794135943055153, |
| "step": 25 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1666666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1964.0, |
| "completions/mean_length": 1354.9375, |
| "completions/mean_terminated_length": 1216.3250732421875, |
| "completions/min_length": 513.0, |
| "completions/min_terminated_length": 513.0, |
| "entropy": 0.09489695727825165, |
| "epoch": 0.02798708288482239, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.8046875, |
| "learning_rate": 9.730893433799785e-06, |
| "loss": -0.0773, |
| "num_tokens": 2943301.0, |
| "reward": 75.51364135742188, |
| "reward_std": 65.03620910644531, |
| "rewards/Rewards/mean": 75.51363372802734, |
| "rewards/Rewards/std": 119.27247619628906, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9964057207107544, |
| "sampling/importance_sampling_ratio/min": 0.03296323120594025, |
| "sampling/sampling_logp_difference/max": 3.412362575531006, |
| "sampling/sampling_logp_difference/mean": 0.018709205090999603, |
| "step": 26 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2291666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2031.0, |
| "completions/mean_length": 1607.979248046875, |
| "completions/mean_terminated_length": 1477.1622314453125, |
| "completions/min_length": 870.0, |
| "completions/min_terminated_length": 870.0, |
| "entropy": 0.10251633822917938, |
| "epoch": 0.02906350914962325, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.4921875, |
| "learning_rate": 9.720129171151777e-06, |
| "loss": -0.0425, |
| "num_tokens": 3064410.0, |
| "reward": 191.07308959960938, |
| "reward_std": 90.68852233886719, |
| "rewards/Rewards/mean": 191.0730743408203, |
| "rewards/Rewards/std": 165.14541625976562, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9957290291786194, |
| "sampling/importance_sampling_ratio/min": 0.002294244710355997, |
| "sampling/sampling_logp_difference/max": 6.0773515701293945, |
| "sampling/sampling_logp_difference/mean": 0.018729567527770996, |
| "step": 27 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2708333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2009.0, |
| "completions/mean_length": 1671.2083740234375, |
| "completions/mean_terminated_length": 1531.2572021484375, |
| "completions/min_length": 905.0, |
| "completions/min_terminated_length": 905.0, |
| "entropy": 0.1027567908167839, |
| "epoch": 0.030139935414424113, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5703125, |
| "learning_rate": 9.709364908503769e-06, |
| "loss": 0.0418, |
| "num_tokens": 3188380.0, |
| "reward": 136.59169006347656, |
| "reward_std": 62.256492614746094, |
| "rewards/Rewards/mean": 136.5916748046875, |
| "rewards/Rewards/std": 151.45384216308594, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9961684942245483, |
| "sampling/importance_sampling_ratio/min": 0.0541137270629406, |
| "sampling/sampling_logp_difference/max": 2.9166674613952637, |
| "sampling/sampling_logp_difference/mean": 0.018425792455673218, |
| "step": 28 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3541666865348816, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2048.0, |
| "completions/mean_length": 1587.7083740234375, |
| "completions/mean_terminated_length": 1335.290283203125, |
| "completions/min_length": 547.0, |
| "completions/min_terminated_length": 547.0, |
| "entropy": 0.10235518962144852, |
| "epoch": 0.031216361679224973, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 1.5859375, |
| "learning_rate": 9.69860064585576e-06, |
| "loss": -0.0208, |
| "num_tokens": 3326762.0, |
| "reward": 143.9744873046875, |
| "reward_std": 42.498687744140625, |
| "rewards/Rewards/mean": 143.97447204589844, |
| "rewards/Rewards/std": 149.9228973388672, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9966323971748352, |
| "sampling/importance_sampling_ratio/min": 0.12920065224170685, |
| "sampling/sampling_logp_difference/max": 2.046388626098633, |
| "sampling/sampling_logp_difference/mean": 0.018576383590698242, |
| "step": 29 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2291666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2032.0, |
| "completions/mean_length": 1538.7083740234375, |
| "completions/mean_terminated_length": 1387.29736328125, |
| "completions/min_length": 705.0, |
| "completions/min_terminated_length": 705.0, |
| "entropy": 0.10081931948661804, |
| "epoch": 0.03229278794402583, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.6640625, |
| "learning_rate": 9.687836383207752e-06, |
| "loss": 0.0021, |
| "num_tokens": 3450126.0, |
| "reward": 155.98631286621094, |
| "reward_std": 135.351806640625, |
| "rewards/Rewards/mean": 155.98631286621094, |
| "rewards/Rewards/std": 177.09036254882812, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9961260557174683, |
| "sampling/importance_sampling_ratio/min": 0.0660436749458313, |
| "sampling/sampling_logp_difference/max": 2.7174389362335205, |
| "sampling/sampling_logp_difference/mean": 0.018216893076896667, |
| "step": 30 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2042.0, |
| "completions/mean_length": 1672.3125, |
| "completions/mean_terminated_length": 1585.6153564453125, |
| "completions/min_length": 804.0, |
| "completions/min_terminated_length": 804.0, |
| "entropy": 0.0999954417347908, |
| "epoch": 0.03336921420882669, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.677072120559744e-06, |
| "loss": 0.0166, |
| "num_tokens": 3577095.0, |
| "reward": 157.2335205078125, |
| "reward_std": 114.414306640625, |
| "rewards/Rewards/mean": 157.23350524902344, |
| "rewards/Rewards/std": 159.78680419921875, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9964659214019775, |
| "sampling/importance_sampling_ratio/min": 0.019857462495565414, |
| "sampling/sampling_logp_difference/max": 3.919175386428833, |
| "sampling/sampling_logp_difference/mean": 0.018892712891101837, |
| "step": 31 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2023.0, |
| "completions/mean_length": 1509.9583740234375, |
| "completions/mean_terminated_length": 1433.09521484375, |
| "completions/min_length": 661.0, |
| "completions/min_terminated_length": 661.0, |
| "entropy": 0.10067085921764374, |
| "epoch": 0.03444564047362755, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.7734375, |
| "learning_rate": 9.666307857911734e-06, |
| "loss": 0.0353, |
| "num_tokens": 3701461.0, |
| "reward": 224.95938110351562, |
| "reward_std": 124.48177337646484, |
| "rewards/Rewards/mean": 224.9593505859375, |
| "rewards/Rewards/std": 151.01626586914062, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9962152242660522, |
| "sampling/importance_sampling_ratio/min": 0.14445149898529053, |
| "sampling/sampling_logp_difference/max": 1.9348114728927612, |
| "sampling/sampling_logp_difference/mean": 0.019468065351247787, |
| "step": 32 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2083333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2021.0, |
| "completions/mean_length": 1636.104248046875, |
| "completions/mean_terminated_length": 1527.7105712890625, |
| "completions/min_length": 636.0, |
| "completions/min_terminated_length": 636.0, |
| "entropy": 0.10358748584985733, |
| "epoch": 0.03552206673842842, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.578125, |
| "learning_rate": 9.655543595263725e-06, |
| "loss": 0.0059, |
| "num_tokens": 3847320.0, |
| "reward": 138.1477813720703, |
| "reward_std": 55.14875411987305, |
| "rewards/Rewards/mean": 138.14776611328125, |
| "rewards/Rewards/std": 136.7897186279297, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9959272146224976, |
| "sampling/importance_sampling_ratio/min": 0.12920619547367096, |
| "sampling/sampling_logp_difference/max": 2.0463457107543945, |
| "sampling/sampling_logp_difference/mean": 0.018886592239141464, |
| "step": 33 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2044.0, |
| "completions/mean_length": 1700.0, |
| "completions/mean_terminated_length": 1491.2000732421875, |
| "completions/min_length": 777.0, |
| "completions/min_terminated_length": 777.0, |
| "entropy": 0.10209772735834122, |
| "epoch": 0.03659849300322928, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.65625, |
| "learning_rate": 9.644779332615717e-06, |
| "loss": -0.0092, |
| "num_tokens": 3980100.0, |
| "reward": 121.0146713256836, |
| "reward_std": 102.0250244140625, |
| "rewards/Rewards/mean": 121.01465606689453, |
| "rewards/Rewards/std": 146.56837463378906, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9963019490242004, |
| "sampling/importance_sampling_ratio/min": 0.085022933781147, |
| "sampling/sampling_logp_difference/max": 2.6755123138427734, |
| "sampling/sampling_logp_difference/mean": 0.018831949681043625, |
| "step": 34 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2045.0, |
| "completions/mean_length": 1679.541748046875, |
| "completions/mean_terminated_length": 1512.0606689453125, |
| "completions/min_length": 943.0, |
| "completions/min_terminated_length": 943.0, |
| "entropy": 0.1014847606420517, |
| "epoch": 0.03767491926803014, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.6015625, |
| "learning_rate": 9.634015069967709e-06, |
| "loss": -0.0016, |
| "num_tokens": 4113914.0, |
| "reward": 79.2403564453125, |
| "reward_std": 42.394386291503906, |
| "rewards/Rewards/mean": 79.2403564453125, |
| "rewards/Rewards/std": 115.3869400024414, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9961296319961548, |
| "sampling/importance_sampling_ratio/min": 0.02546442113816738, |
| "sampling/sampling_logp_difference/max": 3.670473098754883, |
| "sampling/sampling_logp_difference/mean": 0.0189987625926733, |
| "step": 35 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2708333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2014.0, |
| "completions/mean_length": 1772.6458740234375, |
| "completions/mean_terminated_length": 1670.3714599609375, |
| "completions/min_length": 1016.0, |
| "completions/min_terminated_length": 1016.0, |
| "entropy": 0.1009664312005043, |
| "epoch": 0.038751345532831, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.8671875, |
| "learning_rate": 9.623250807319699e-06, |
| "loss": 0.0061, |
| "num_tokens": 4249329.0, |
| "reward": 142.5376434326172, |
| "reward_std": 83.20094299316406, |
| "rewards/Rewards/mean": 142.5376434326172, |
| "rewards/Rewards/std": 157.02310180664062, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.996181845664978, |
| "sampling/importance_sampling_ratio/min": 0.00419560307636857, |
| "sampling/sampling_logp_difference/max": 5.473718166351318, |
| "sampling/sampling_logp_difference/mean": 0.01883939653635025, |
| "step": 36 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1458333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1978.0, |
| "completions/mean_length": 1526.25, |
| "completions/mean_terminated_length": 1437.170654296875, |
| "completions/min_length": 756.0, |
| "completions/min_terminated_length": 756.0, |
| "entropy": 0.09632325172424316, |
| "epoch": 0.03982777179763186, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.84375, |
| "learning_rate": 9.61248654467169e-06, |
| "loss": -0.0497, |
| "num_tokens": 4368357.0, |
| "reward": 153.72262573242188, |
| "reward_std": 64.56527709960938, |
| "rewards/Rewards/mean": 153.72262573242188, |
| "rewards/Rewards/std": 167.7002716064453, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9960031509399414, |
| "sampling/importance_sampling_ratio/min": 0.013016091659665108, |
| "sampling/sampling_logp_difference/max": 4.341568946838379, |
| "sampling/sampling_logp_difference/mean": 0.01888095960021019, |
| "step": 37 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2039.0, |
| "completions/mean_length": 1351.5833740234375, |
| "completions/mean_terminated_length": 1190.871826171875, |
| "completions/min_length": 622.0, |
| "completions/min_terminated_length": 622.0, |
| "entropy": 0.09179762005805969, |
| "epoch": 0.04090419806243272, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.8671875, |
| "learning_rate": 9.601722282023682e-06, |
| "loss": -0.024, |
| "num_tokens": 4475905.0, |
| "reward": 105.28202819824219, |
| "reward_std": 40.796363830566406, |
| "rewards/Rewards/mean": 105.28202056884766, |
| "rewards/Rewards/std": 128.94845581054688, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9963054656982422, |
| "sampling/importance_sampling_ratio/min": 0.042128000408411026, |
| "sampling/sampling_logp_difference/max": 3.1670427322387695, |
| "sampling/sampling_logp_difference/mean": 0.018371229991316795, |
| "step": 38 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1987.0, |
| "completions/mean_length": 1484.2083740234375, |
| "completions/mean_terminated_length": 1227.939453125, |
| "completions/min_length": 615.0, |
| "completions/min_terminated_length": 615.0, |
| "entropy": 0.0977960154414177, |
| "epoch": 0.04198062432723358, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 1.671875, |
| "learning_rate": 9.590958019375674e-06, |
| "loss": -0.0293, |
| "num_tokens": 4597715.0, |
| "reward": 184.84185791015625, |
| "reward_std": 77.36622619628906, |
| "rewards/Rewards/mean": 184.8418426513672, |
| "rewards/Rewards/std": 149.9651336669922, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9964928030967712, |
| "sampling/importance_sampling_ratio/min": 0.0114639513194561, |
| "sampling/sampling_logp_difference/max": 4.468547821044922, |
| "sampling/sampling_logp_difference/mean": 0.01864359900355339, |
| "step": 39 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2291666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2040.0, |
| "completions/mean_length": 1668.7083740234375, |
| "completions/mean_terminated_length": 1555.9459228515625, |
| "completions/min_length": 728.0, |
| "completions/min_terminated_length": 728.0, |
| "entropy": 0.09704931080341339, |
| "epoch": 0.04305705059203445, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.7421875, |
| "learning_rate": 9.580193756727666e-06, |
| "loss": -0.047, |
| "num_tokens": 4724073.0, |
| "reward": 123.34667205810547, |
| "reward_std": 45.43848419189453, |
| "rewards/Rewards/mean": 123.34667205810547, |
| "rewards/Rewards/std": 150.5157012939453, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.996220052242279, |
| "sampling/importance_sampling_ratio/min": 0.09609924256801605, |
| "sampling/sampling_logp_difference/max": 2.342373847961426, |
| "sampling/sampling_logp_difference/mean": 0.01860324665904045, |
| "step": 40 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.25, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1991.0, |
| "completions/mean_length": 1652.25, |
| "completions/mean_terminated_length": 1520.3333740234375, |
| "completions/min_length": 1028.0, |
| "completions/min_terminated_length": 1028.0, |
| "entropy": 0.09834767878055573, |
| "epoch": 0.04413347685683531, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.671875, |
| "learning_rate": 9.569429494079656e-06, |
| "loss": 0.066, |
| "num_tokens": 4858017.0, |
| "reward": 152.06527709960938, |
| "reward_std": 94.49375915527344, |
| "rewards/Rewards/mean": 152.0652618408203, |
| "rewards/Rewards/std": 155.73765563964844, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9955779314041138, |
| "sampling/importance_sampling_ratio/min": 0.016845213249325752, |
| "sampling/sampling_logp_difference/max": 4.083688735961914, |
| "sampling/sampling_logp_difference/mean": 0.018709905445575714, |
| "step": 41 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0833333358168602, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2035.0, |
| "completions/mean_length": 1488.25, |
| "completions/mean_terminated_length": 1437.3636474609375, |
| "completions/min_length": 923.0, |
| "completions/min_terminated_length": 923.0, |
| "entropy": 0.09486360847949982, |
| "epoch": 0.04520990312163617, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.828125, |
| "learning_rate": 9.558665231431647e-06, |
| "loss": -0.0715, |
| "num_tokens": 4982931.0, |
| "reward": 171.48448181152344, |
| "reward_std": 81.55918884277344, |
| "rewards/Rewards/mean": 171.48448181152344, |
| "rewards/Rewards/std": 155.43109130859375, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9963211417198181, |
| "sampling/importance_sampling_ratio/min": 0.007348766550421715, |
| "sampling/sampling_logp_difference/max": 4.913222789764404, |
| "sampling/sampling_logp_difference/mean": 0.018740836530923843, |
| "step": 42 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2003.0, |
| "completions/mean_length": 1424.3125, |
| "completions/mean_terminated_length": 1382.7333984375, |
| "completions/min_length": 729.0, |
| "completions/min_terminated_length": 729.0, |
| "entropy": 0.09741601347923279, |
| "epoch": 0.04628632938643703, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.859375, |
| "learning_rate": 9.547900968783639e-06, |
| "loss": -0.0045, |
| "num_tokens": 5097222.0, |
| "reward": 211.81884765625, |
| "reward_std": 101.86795043945312, |
| "rewards/Rewards/mean": 211.81884765625, |
| "rewards/Rewards/std": 150.73831176757812, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9959251284599304, |
| "sampling/importance_sampling_ratio/min": 0.03727293387055397, |
| "sampling/sampling_logp_difference/max": 3.289487838745117, |
| "sampling/sampling_logp_difference/mean": 0.019588638097047806, |
| "step": 43 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1041666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1989.0, |
| "completions/mean_length": 1534.8333740234375, |
| "completions/mean_terminated_length": 1475.162841796875, |
| "completions/min_length": 738.0, |
| "completions/min_terminated_length": 738.0, |
| "entropy": 0.09020587801933289, |
| "epoch": 0.04736275565123789, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.578125, |
| "learning_rate": 9.53713670613563e-06, |
| "loss": -0.0717, |
| "num_tokens": 5224438.0, |
| "reward": 110.83538818359375, |
| "reward_std": 79.80099487304688, |
| "rewards/Rewards/mean": 110.83538818359375, |
| "rewards/Rewards/std": 144.04507446289062, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9962780475616455, |
| "sampling/importance_sampling_ratio/min": 0.04941500723361969, |
| "sampling/sampling_logp_difference/max": 3.0075011253356934, |
| "sampling/sampling_logp_difference/mean": 0.017612557858228683, |
| "step": 44 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.25, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2024.0, |
| "completions/mean_length": 1616.25, |
| "completions/mean_terminated_length": 1472.3333740234375, |
| "completions/min_length": 673.0, |
| "completions/min_terminated_length": 673.0, |
| "entropy": 0.09505733847618103, |
| "epoch": 0.04843918191603875, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.75, |
| "learning_rate": 9.52637244348762e-06, |
| "loss": -0.0156, |
| "num_tokens": 5360650.0, |
| "reward": 135.80593872070312, |
| "reward_std": 77.96055603027344, |
| "rewards/Rewards/mean": 135.80592346191406, |
| "rewards/Rewards/std": 162.7121124267578, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9965173006057739, |
| "sampling/importance_sampling_ratio/min": 0.09018614143133163, |
| "sampling/sampling_logp_difference/max": 2.405879497528076, |
| "sampling/sampling_logp_difference/mean": 0.018498722463846207, |
| "step": 45 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2708333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2027.0, |
| "completions/mean_length": 1686.229248046875, |
| "completions/mean_terminated_length": 1551.857177734375, |
| "completions/min_length": 947.0, |
| "completions/min_terminated_length": 947.0, |
| "entropy": 0.09737753868103027, |
| "epoch": 0.04951560818083961, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.9140625, |
| "learning_rate": 9.515608180839613e-06, |
| "loss": -0.0086, |
| "num_tokens": 5487141.0, |
| "reward": 54.04819869995117, |
| "reward_std": 46.59992218017578, |
| "rewards/Rewards/mean": 54.04819869995117, |
| "rewards/Rewards/std": 70.76473999023438, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9959502220153809, |
| "sampling/importance_sampling_ratio/min": 2.4299904907820746e-05, |
| "sampling/sampling_logp_difference/max": 10.625038146972656, |
| "sampling/sampling_logp_difference/mean": 0.01930040866136551, |
| "step": 46 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1981.0, |
| "completions/mean_length": 1558.2708740234375, |
| "completions/mean_terminated_length": 1488.3095703125, |
| "completions/min_length": 770.0, |
| "completions/min_terminated_length": 770.0, |
| "entropy": 0.10088413953781128, |
| "epoch": 0.05059203444564048, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.75, |
| "learning_rate": 9.504843918191604e-06, |
| "loss": -0.0587, |
| "num_tokens": 5617066.0, |
| "reward": 110.54548645019531, |
| "reward_std": 60.73766326904297, |
| "rewards/Rewards/mean": 110.54547882080078, |
| "rewards/Rewards/std": 137.8954620361328, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9965789318084717, |
| "sampling/importance_sampling_ratio/min": 0.030534274876117706, |
| "sampling/sampling_logp_difference/max": 3.488905429840088, |
| "sampling/sampling_logp_difference/mean": 0.018777910619974136, |
| "step": 47 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1458333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2047.0, |
| "completions/mean_length": 1724.7708740234375, |
| "completions/mean_terminated_length": 1669.5853271484375, |
| "completions/min_length": 1103.0, |
| "completions/min_terminated_length": 1103.0, |
| "entropy": 0.0975833609700203, |
| "epoch": 0.05166846071044134, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5859375, |
| "learning_rate": 9.494079655543596e-06, |
| "loss": 0.0125, |
| "num_tokens": 5746925.0, |
| "reward": 165.01998901367188, |
| "reward_std": 72.62326049804688, |
| "rewards/Rewards/mean": 165.01995849609375, |
| "rewards/Rewards/std": 155.69970703125, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9964895248413086, |
| "sampling/importance_sampling_ratio/min": 0.0012031460646539927, |
| "sampling/sampling_logp_difference/max": 6.72281551361084, |
| "sampling/sampling_logp_difference/mean": 0.018396669998764992, |
| "step": 48 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1952.0, |
| "completions/mean_length": 1696.4375, |
| "completions/mean_terminated_length": 1615.3077392578125, |
| "completions/min_length": 1103.0, |
| "completions/min_terminated_length": 1103.0, |
| "entropy": 0.10078628361225128, |
| "epoch": 0.0527448869752422, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.7890625, |
| "learning_rate": 9.483315392895588e-06, |
| "loss": -0.0133, |
| "num_tokens": 5888714.0, |
| "reward": 121.20083618164062, |
| "reward_std": 103.41592407226562, |
| "rewards/Rewards/mean": 121.2008285522461, |
| "rewards/Rewards/std": 139.4957733154297, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9960814714431763, |
| "sampling/importance_sampling_ratio/min": 0.01684529334306717, |
| "sampling/sampling_logp_difference/max": 4.083683967590332, |
| "sampling/sampling_logp_difference/mean": 0.019092433154582977, |
| "step": 49 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2036.0, |
| "completions/mean_length": 1389.416748046875, |
| "completions/mean_terminated_length": 1345.5111083984375, |
| "completions/min_length": 643.0, |
| "completions/min_terminated_length": 643.0, |
| "entropy": 0.09157487750053406, |
| "epoch": 0.05382131324004306, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.953125, |
| "learning_rate": 9.47255113024758e-06, |
| "loss": -0.0557, |
| "num_tokens": 5997592.0, |
| "reward": 144.40060424804688, |
| "reward_std": 57.521636962890625, |
| "rewards/Rewards/mean": 144.4005889892578, |
| "rewards/Rewards/std": 166.4522247314453, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9964755773544312, |
| "sampling/importance_sampling_ratio/min": 0.038425933569669724, |
| "sampling/sampling_logp_difference/max": 3.2590227127075195, |
| "sampling/sampling_logp_difference/mean": 0.018797706812620163, |
| "step": 50 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1458333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2037.0, |
| "completions/mean_length": 1519.3958740234375, |
| "completions/mean_terminated_length": 1429.146240234375, |
| "completions/min_length": 624.0, |
| "completions/min_terminated_length": 624.0, |
| "entropy": 0.09402786195278168, |
| "epoch": 0.05489773950484392, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.6875, |
| "learning_rate": 9.461786867599571e-06, |
| "loss": -0.0112, |
| "num_tokens": 6117167.0, |
| "reward": 212.0793914794922, |
| "reward_std": 130.76829528808594, |
| "rewards/Rewards/mean": 212.0793914794922, |
| "rewards/Rewards/std": 167.76551818847656, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9960064888000488, |
| "sampling/importance_sampling_ratio/min": 0.0003944017516914755, |
| "sampling/sampling_logp_difference/max": 7.838140487670898, |
| "sampling/sampling_logp_difference/mean": 0.018857020884752274, |
| "step": 51 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1041666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2031.0, |
| "completions/mean_length": 1518.0, |
| "completions/mean_terminated_length": 1456.3720703125, |
| "completions/min_length": 671.0, |
| "completions/min_terminated_length": 671.0, |
| "entropy": 0.10275686532258987, |
| "epoch": 0.05597416576964478, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.6484375, |
| "learning_rate": 9.451022604951561e-06, |
| "loss": -0.0197, |
| "num_tokens": 6234137.0, |
| "reward": 163.3297882080078, |
| "reward_std": 112.94554138183594, |
| "rewards/Rewards/mean": 163.3297882080078, |
| "rewards/Rewards/std": 149.05389404296875, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9961435794830322, |
| "sampling/importance_sampling_ratio/min": 0.020013727247714996, |
| "sampling/sampling_logp_difference/max": 3.911336898803711, |
| "sampling/sampling_logp_difference/mean": 0.01944819837808609, |
| "step": 52 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2017.0, |
| "completions/mean_length": 1657.875, |
| "completions/mean_terminated_length": 1567.84619140625, |
| "completions/min_length": 663.0, |
| "completions/min_terminated_length": 663.0, |
| "entropy": 0.09955208003520966, |
| "epoch": 0.05705059203444564, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5859375, |
| "learning_rate": 9.440258342303553e-06, |
| "loss": 0.0054, |
| "num_tokens": 6360593.0, |
| "reward": 186.9022216796875, |
| "reward_std": 92.23899841308594, |
| "rewards/Rewards/mean": 186.90220642089844, |
| "rewards/Rewards/std": 152.16409301757812, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9962495565414429, |
| "sampling/importance_sampling_ratio/min": 0.03338921442627907, |
| "sampling/sampling_logp_difference/max": 3.399522304534912, |
| "sampling/sampling_logp_difference/mean": 0.018768608570098877, |
| "step": 53 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0416666679084301, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1974.0, |
| "completions/mean_length": 1413.104248046875, |
| "completions/mean_terminated_length": 1385.5, |
| "completions/min_length": 738.0, |
| "completions/min_terminated_length": 738.0, |
| "entropy": 0.10132155567407608, |
| "epoch": 0.0581270182992465, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.9609375, |
| "learning_rate": 9.429494079655545e-06, |
| "loss": -0.026, |
| "num_tokens": 6476044.0, |
| "reward": 105.79423522949219, |
| "reward_std": 78.7714614868164, |
| "rewards/Rewards/mean": 105.7942123413086, |
| "rewards/Rewards/std": 127.42036437988281, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.995726466178894, |
| "sampling/importance_sampling_ratio/min": 0.02481662854552269, |
| "sampling/sampling_logp_difference/max": 3.6962413787841797, |
| "sampling/sampling_logp_difference/mean": 0.020124241709709167, |
| "step": 54 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2029.0, |
| "completions/mean_length": 1417.166748046875, |
| "completions/mean_terminated_length": 1375.111083984375, |
| "completions/min_length": 668.0, |
| "completions/min_terminated_length": 668.0, |
| "entropy": 0.09215769916772842, |
| "epoch": 0.059203444564047365, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.96875, |
| "learning_rate": 9.418729817007536e-06, |
| "loss": -0.0679, |
| "num_tokens": 6588270.0, |
| "reward": 135.9485626220703, |
| "reward_std": 112.82146453857422, |
| "rewards/Rewards/mean": 135.9485626220703, |
| "rewards/Rewards/std": 160.89108276367188, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9962545037269592, |
| "sampling/importance_sampling_ratio/min": 0.030534042045474052, |
| "sampling/sampling_logp_difference/max": 3.488913059234619, |
| "sampling/sampling_logp_difference/mean": 0.0190061517059803, |
| "step": 55 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2916666865348816, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2028.0, |
| "completions/mean_length": 1753.0208740234375, |
| "completions/mean_terminated_length": 1631.558837890625, |
| "completions/min_length": 1058.0, |
| "completions/min_terminated_length": 1058.0, |
| "entropy": 0.10360866785049438, |
| "epoch": 0.060279870828848225, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.6875, |
| "learning_rate": 9.407965554359528e-06, |
| "loss": 0.0544, |
| "num_tokens": 6725311.0, |
| "reward": 150.8729705810547, |
| "reward_std": 118.97310638427734, |
| "rewards/Rewards/mean": 150.87294006347656, |
| "rewards/Rewards/std": 161.175537109375, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9956259727478027, |
| "sampling/importance_sampling_ratio/min": 0.015444685705006123, |
| "sampling/sampling_logp_difference/max": 4.170490264892578, |
| "sampling/sampling_logp_difference/mean": 0.019745338708162308, |
| "step": 56 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1458333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2029.0, |
| "completions/mean_length": 1575.666748046875, |
| "completions/mean_terminated_length": 1495.0242919921875, |
| "completions/min_length": 802.0, |
| "completions/min_terminated_length": 802.0, |
| "entropy": 0.09608377516269684, |
| "epoch": 0.061356297093649086, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.6953125, |
| "learning_rate": 9.397201291711518e-06, |
| "loss": -0.0215, |
| "num_tokens": 6850545.0, |
| "reward": 178.48880004882812, |
| "reward_std": 125.83811950683594, |
| "rewards/Rewards/mean": 178.4888153076172, |
| "rewards/Rewards/std": 155.5189971923828, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9961973428726196, |
| "sampling/importance_sampling_ratio/min": 0.06855442374944687, |
| "sampling/sampling_logp_difference/max": 2.6801273822784424, |
| "sampling/sampling_logp_difference/mean": 0.018560823053121567, |
| "step": 57 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2026.0, |
| "completions/mean_length": 1514.7708740234375, |
| "completions/mean_terminated_length": 1438.59521484375, |
| "completions/min_length": 665.0, |
| "completions/min_terminated_length": 665.0, |
| "entropy": 0.09563355147838593, |
| "epoch": 0.062432723358449946, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.9296875, |
| "learning_rate": 9.38643702906351e-06, |
| "loss": -0.0427, |
| "num_tokens": 6978388.0, |
| "reward": 117.72795104980469, |
| "reward_std": 99.36922454833984, |
| "rewards/Rewards/mean": 117.72793579101562, |
| "rewards/Rewards/std": 152.76295471191406, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9963783025741577, |
| "sampling/importance_sampling_ratio/min": 0.023247025907039642, |
| "sampling/sampling_logp_difference/max": 3.76157808303833, |
| "sampling/sampling_logp_difference/mean": 0.019177014008164406, |
| "step": 58 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1458333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1973.0, |
| "completions/mean_length": 1525.479248046875, |
| "completions/mean_terminated_length": 1436.2681884765625, |
| "completions/min_length": 863.0, |
| "completions/min_terminated_length": 863.0, |
| "entropy": 0.09957332164049149, |
| "epoch": 0.06350914962325081, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.59375, |
| "learning_rate": 9.375672766415501e-06, |
| "loss": -0.033, |
| "num_tokens": 7098075.0, |
| "reward": 209.320068359375, |
| "reward_std": 75.28262329101562, |
| "rewards/Rewards/mean": 209.320068359375, |
| "rewards/Rewards/std": 161.8440399169922, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9965529441833496, |
| "sampling/importance_sampling_ratio/min": 0.07584026455879211, |
| "sampling/sampling_logp_difference/max": 2.5791258811950684, |
| "sampling/sampling_logp_difference/mean": 0.019289657473564148, |
| "step": 59 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1666666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2004.0, |
| "completions/mean_length": 1673.8125, |
| "completions/mean_terminated_length": 1598.9749755859375, |
| "completions/min_length": 869.0, |
| "completions/min_terminated_length": 869.0, |
| "entropy": 0.09496274590492249, |
| "epoch": 0.06458557588805167, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.65625, |
| "learning_rate": 9.364908503767493e-06, |
| "loss": 0.0237, |
| "num_tokens": 7235586.0, |
| "reward": 95.792724609375, |
| "reward_std": 94.22042083740234, |
| "rewards/Rewards/mean": 95.79271697998047, |
| "rewards/Rewards/std": 115.19368743896484, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9958664774894714, |
| "sampling/importance_sampling_ratio/min": 0.026444947347044945, |
| "sampling/sampling_logp_difference/max": 3.632690191268921, |
| "sampling/sampling_logp_difference/mean": 0.019015762954950333, |
| "step": 60 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1992.0, |
| "completions/mean_length": 1550.666748046875, |
| "completions/mean_terminated_length": 1479.6190185546875, |
| "completions/min_length": 767.0, |
| "completions/min_terminated_length": 767.0, |
| "entropy": 0.097409687936306, |
| "epoch": 0.06566200215285253, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.7109375, |
| "learning_rate": 9.354144241119483e-06, |
| "loss": 0.0211, |
| "num_tokens": 7361786.0, |
| "reward": 134.22329711914062, |
| "reward_std": 93.79661560058594, |
| "rewards/Rewards/mean": 134.22328186035156, |
| "rewards/Rewards/std": 143.19869995117188, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9962544441223145, |
| "sampling/importance_sampling_ratio/min": 0.030053241178393364, |
| "sampling/sampling_logp_difference/max": 3.5047848224639893, |
| "sampling/sampling_logp_difference/mean": 0.01930471695959568, |
| "step": 61 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2028.0, |
| "completions/mean_length": 1623.479248046875, |
| "completions/mean_terminated_length": 1562.8333740234375, |
| "completions/min_length": 1080.0, |
| "completions/min_terminated_length": 1080.0, |
| "entropy": 0.09310100972652435, |
| "epoch": 0.06673842841765339, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.296875, |
| "learning_rate": 9.343379978471475e-06, |
| "loss": 0.0072, |
| "num_tokens": 7488655.0, |
| "reward": 212.15054321289062, |
| "reward_std": 121.80223846435547, |
| "rewards/Rewards/mean": 212.15052795410156, |
| "rewards/Rewards/std": 165.37892150878906, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9959063529968262, |
| "sampling/importance_sampling_ratio/min": 0.02023465186357498, |
| "sampling/sampling_logp_difference/max": 3.9003586769104004, |
| "sampling/sampling_logp_difference/mean": 0.01847892440855503, |
| "step": 62 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2041.0, |
| "completions/mean_length": 1498.416748046875, |
| "completions/mean_terminated_length": 1419.90478515625, |
| "completions/min_length": 833.0, |
| "completions/min_terminated_length": 833.0, |
| "entropy": 0.09334105253219604, |
| "epoch": 0.06781485468245425, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.8046875, |
| "learning_rate": 9.332615715823467e-06, |
| "loss": -0.0597, |
| "num_tokens": 7605093.0, |
| "reward": 122.33708953857422, |
| "reward_std": 82.52408599853516, |
| "rewards/Rewards/mean": 122.33708953857422, |
| "rewards/Rewards/std": 139.99977111816406, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9959535002708435, |
| "sampling/importance_sampling_ratio/min": 0.012991238385438919, |
| "sampling/sampling_logp_difference/max": 4.343480110168457, |
| "sampling/sampling_logp_difference/mean": 0.01902196742594242, |
| "step": 63 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2083333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1966.0, |
| "completions/mean_length": 1604.5833740234375, |
| "completions/mean_terminated_length": 1487.894775390625, |
| "completions/min_length": 762.0, |
| "completions/min_terminated_length": 762.0, |
| "entropy": 0.09379036724567413, |
| "epoch": 0.0688912809472551, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.7265625, |
| "learning_rate": 9.321851453175458e-06, |
| "loss": 0.027, |
| "num_tokens": 7734385.0, |
| "reward": 152.49241638183594, |
| "reward_std": 91.75732421875, |
| "rewards/Rewards/mean": 152.49241638183594, |
| "rewards/Rewards/std": 155.2235107421875, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9959923028945923, |
| "sampling/importance_sampling_ratio/min": 0.0454302616417408, |
| "sampling/sampling_logp_difference/max": 3.0915768146514893, |
| "sampling/sampling_logp_difference/mean": 0.019177217036485672, |
| "step": 64 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1458333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1992.0, |
| "completions/mean_length": 1404.854248046875, |
| "completions/mean_terminated_length": 1295.0487060546875, |
| "completions/min_length": 602.0, |
| "completions/min_terminated_length": 602.0, |
| "entropy": 0.09235261380672455, |
| "epoch": 0.06996770721205597, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.31108719052745e-06, |
| "loss": 0.0104, |
| "num_tokens": 7854288.0, |
| "reward": 142.0937957763672, |
| "reward_std": 94.76582336425781, |
| "rewards/Rewards/mean": 142.09378051757812, |
| "rewards/Rewards/std": 150.158447265625, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9962871074676514, |
| "sampling/importance_sampling_ratio/min": 0.016520341858267784, |
| "sampling/sampling_logp_difference/max": 4.10316276550293, |
| "sampling/sampling_logp_difference/mean": 0.019058652222156525, |
| "step": 65 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2291666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2039.0, |
| "completions/mean_length": 1712.75, |
| "completions/mean_terminated_length": 1613.0811767578125, |
| "completions/min_length": 964.0, |
| "completions/min_terminated_length": 964.0, |
| "entropy": 0.09805743396282196, |
| "epoch": 0.07104413347685684, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5625, |
| "learning_rate": 9.30032292787944e-06, |
| "loss": 0.0247, |
| "num_tokens": 7991970.0, |
| "reward": 109.45711517333984, |
| "reward_std": 96.96968841552734, |
| "rewards/Rewards/mean": 109.45711517333984, |
| "rewards/Rewards/std": 129.8672637939453, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9963368773460388, |
| "sampling/importance_sampling_ratio/min": 0.03675702214241028, |
| "sampling/sampling_logp_difference/max": 3.3034260272979736, |
| "sampling/sampling_logp_difference/mean": 0.018985003232955933, |
| "step": 66 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0833333358168602, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2042.0, |
| "completions/mean_length": 1568.0833740234375, |
| "completions/mean_terminated_length": 1524.45458984375, |
| "completions/min_length": 920.0, |
| "completions/min_terminated_length": 920.0, |
| "entropy": 0.09471037983894348, |
| "epoch": 0.0721205597416577, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.625, |
| "learning_rate": 9.289558665231433e-06, |
| "loss": -0.0555, |
| "num_tokens": 8115184.0, |
| "reward": 217.14907836914062, |
| "reward_std": 111.93682861328125, |
| "rewards/Rewards/mean": 217.1490478515625, |
| "rewards/Rewards/std": 165.21253967285156, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9962868690490723, |
| "sampling/importance_sampling_ratio/min": 0.025223759934306145, |
| "sampling/sampling_logp_difference/max": 3.67996883392334, |
| "sampling/sampling_logp_difference/mean": 0.018947413191199303, |
| "step": 67 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3541666865348816, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2040.0, |
| "completions/mean_length": 1741.041748046875, |
| "completions/mean_terminated_length": 1572.7095947265625, |
| "completions/min_length": 599.0, |
| "completions/min_terminated_length": 599.0, |
| "entropy": 0.09793736040592194, |
| "epoch": 0.07319698600645856, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.6953125, |
| "learning_rate": 9.278794402583423e-06, |
| "loss": -0.0115, |
| "num_tokens": 8246124.0, |
| "reward": 127.42999267578125, |
| "reward_std": 106.5477294921875, |
| "rewards/Rewards/mean": 127.42999267578125, |
| "rewards/Rewards/std": 152.0902099609375, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.995684027671814, |
| "sampling/importance_sampling_ratio/min": 0.07607259601354599, |
| "sampling/sampling_logp_difference/max": 2.5760672092437744, |
| "sampling/sampling_logp_difference/mean": 0.019500732421875, |
| "step": 68 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1458333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1925.0, |
| "completions/mean_length": 1410.291748046875, |
| "completions/mean_terminated_length": 1301.41455078125, |
| "completions/min_length": 671.0, |
| "completions/min_terminated_length": 671.0, |
| "entropy": 0.09403970837593079, |
| "epoch": 0.07427341227125941, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.90625, |
| "learning_rate": 9.268030139935415e-06, |
| "loss": -0.0468, |
| "num_tokens": 8355194.0, |
| "reward": 137.93809509277344, |
| "reward_std": 79.21134948730469, |
| "rewards/Rewards/mean": 137.93809509277344, |
| "rewards/Rewards/std": 153.5146484375, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9967970252037048, |
| "sampling/importance_sampling_ratio/min": 0.017168184742331505, |
| "sampling/sampling_logp_difference/max": 4.064697265625, |
| "sampling/sampling_logp_difference/mean": 0.01961737498641014, |
| "step": 69 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3333333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2012.0, |
| "completions/mean_length": 1782.0208740234375, |
| "completions/mean_terminated_length": 1649.03125, |
| "completions/min_length": 1140.0, |
| "completions/min_terminated_length": 1140.0, |
| "entropy": 0.09640659391880035, |
| "epoch": 0.07534983853606028, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.6640625, |
| "learning_rate": 9.257265877287407e-06, |
| "loss": 0.0284, |
| "num_tokens": 8492217.0, |
| "reward": 138.16677856445312, |
| "reward_std": 84.71370697021484, |
| "rewards/Rewards/mean": 138.16676330566406, |
| "rewards/Rewards/std": 155.8728790283203, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9962026476860046, |
| "sampling/importance_sampling_ratio/min": 0.08669225126504898, |
| "sampling/sampling_logp_difference/max": 2.4453907012939453, |
| "sampling/sampling_logp_difference/mean": 0.018790341913700104, |
| "step": 70 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1666666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2040.0, |
| "completions/mean_length": 1621.8333740234375, |
| "completions/mean_terminated_length": 1536.5999755859375, |
| "completions/min_length": 657.0, |
| "completions/min_terminated_length": 657.0, |
| "entropy": 0.09194771200418472, |
| "epoch": 0.07642626480086114, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.96875, |
| "learning_rate": 9.246501614639399e-06, |
| "loss": -0.01, |
| "num_tokens": 8614717.0, |
| "reward": 162.2823944091797, |
| "reward_std": 151.30052185058594, |
| "rewards/Rewards/mean": 162.28236389160156, |
| "rewards/Rewards/std": 162.62136840820312, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9964730143547058, |
| "sampling/importance_sampling_ratio/min": 0.10786180943250656, |
| "sampling/sampling_logp_difference/max": 2.2269043922424316, |
| "sampling/sampling_logp_difference/mean": 0.018355056643486023, |
| "step": 71 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1990.0, |
| "completions/mean_length": 1564.0833740234375, |
| "completions/mean_terminated_length": 1494.952392578125, |
| "completions/min_length": 832.0, |
| "completions/min_terminated_length": 832.0, |
| "entropy": 0.09191286563873291, |
| "epoch": 0.077502691065662, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.7421875, |
| "learning_rate": 9.23573735199139e-06, |
| "loss": -0.0089, |
| "num_tokens": 8739977.0, |
| "reward": 202.41168212890625, |
| "reward_std": 113.517822265625, |
| "rewards/Rewards/mean": 202.41168212890625, |
| "rewards/Rewards/std": 167.29736328125, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9957892894744873, |
| "sampling/importance_sampling_ratio/min": 0.08669490367174149, |
| "sampling/sampling_logp_difference/max": 2.4453601837158203, |
| "sampling/sampling_logp_difference/mean": 0.018984520807862282, |
| "step": 72 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0833333358168602, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1973.0, |
| "completions/mean_length": 1552.041748046875, |
| "completions/mean_terminated_length": 1506.95458984375, |
| "completions/min_length": 943.0, |
| "completions/min_terminated_length": 943.0, |
| "entropy": 0.09602253884077072, |
| "epoch": 0.07857911733046287, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.828125, |
| "learning_rate": 9.22497308934338e-06, |
| "loss": 0.0344, |
| "num_tokens": 8861137.0, |
| "reward": 157.66354370117188, |
| "reward_std": 144.0146942138672, |
| "rewards/Rewards/mean": 157.66354370117188, |
| "rewards/Rewards/std": 143.1285858154297, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9959803819656372, |
| "sampling/importance_sampling_ratio/min": 0.00499701825901866, |
| "sampling/sampling_logp_difference/max": 5.298913955688477, |
| "sampling/sampling_logp_difference/mean": 0.01990203559398651, |
| "step": 73 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2083333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2012.0, |
| "completions/mean_length": 1609.041748046875, |
| "completions/mean_terminated_length": 1493.5263671875, |
| "completions/min_length": 942.0, |
| "completions/min_terminated_length": 942.0, |
| "entropy": 0.08647584915161133, |
| "epoch": 0.07965554359526372, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.65625, |
| "learning_rate": 9.214208826695372e-06, |
| "loss": -0.0107, |
| "num_tokens": 8989995.0, |
| "reward": 197.0958709716797, |
| "reward_std": 68.34449768066406, |
| "rewards/Rewards/mean": 197.09584045410156, |
| "rewards/Rewards/std": 159.35365295410156, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9968025088310242, |
| "sampling/importance_sampling_ratio/min": 0.0007289634668268263, |
| "sampling/sampling_logp_difference/max": 7.223886966705322, |
| "sampling/sampling_logp_difference/mean": 0.018994908779859543, |
| "step": 74 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1041666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1995.0, |
| "completions/mean_length": 1519.0, |
| "completions/mean_terminated_length": 1457.4884033203125, |
| "completions/min_length": 799.0, |
| "completions/min_terminated_length": 799.0, |
| "entropy": 0.09478063881397247, |
| "epoch": 0.08073196986006459, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.8359375, |
| "learning_rate": 9.203444564047364e-06, |
| "loss": 0.0352, |
| "num_tokens": 9115845.0, |
| "reward": 179.7698211669922, |
| "reward_std": 126.33210754394531, |
| "rewards/Rewards/mean": 179.7698211669922, |
| "rewards/Rewards/std": 156.64407348632812, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9962257146835327, |
| "sampling/importance_sampling_ratio/min": 0.06889042258262634, |
| "sampling/sampling_logp_difference/max": 2.6752381324768066, |
| "sampling/sampling_logp_difference/mean": 0.01957097090780735, |
| "step": 75 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2708333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2014.0, |
| "completions/mean_length": 1643.166748046875, |
| "completions/mean_terminated_length": 1492.800048828125, |
| "completions/min_length": 802.0, |
| "completions/min_terminated_length": 802.0, |
| "entropy": 0.09344817698001862, |
| "epoch": 0.08180839612486544, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.734375, |
| "learning_rate": 9.192680301399355e-06, |
| "loss": 0.0195, |
| "num_tokens": 9247301.0, |
| "reward": 147.28268432617188, |
| "reward_std": 124.65312957763672, |
| "rewards/Rewards/mean": 147.2826690673828, |
| "rewards/Rewards/std": 159.5615997314453, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9967602491378784, |
| "sampling/importance_sampling_ratio/min": 0.030053241178393364, |
| "sampling/sampling_logp_difference/max": 3.5047848224639893, |
| "sampling/sampling_logp_difference/mean": 0.018918678164482117, |
| "step": 76 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0833333358168602, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1947.0, |
| "completions/mean_length": 1344.479248046875, |
| "completions/mean_terminated_length": 1280.522705078125, |
| "completions/min_length": 529.0, |
| "completions/min_terminated_length": 529.0, |
| "entropy": 0.09060220420360565, |
| "epoch": 0.08288482238966631, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.96875, |
| "learning_rate": 9.181916038751345e-06, |
| "loss": -0.0746, |
| "num_tokens": 9364396.0, |
| "reward": 199.82870483398438, |
| "reward_std": 49.06147003173828, |
| "rewards/Rewards/mean": 199.8286895751953, |
| "rewards/Rewards/std": 142.2600555419922, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9962332844734192, |
| "sampling/importance_sampling_ratio/min": 0.021995970979332924, |
| "sampling/sampling_logp_difference/max": 3.8168959617614746, |
| "sampling/sampling_logp_difference/mean": 0.018977638334035873, |
| "step": 77 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.02083333395421505, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2015.0, |
| "completions/mean_length": 1400.1875, |
| "completions/mean_terminated_length": 1386.4041748046875, |
| "completions/min_length": 782.0, |
| "completions/min_terminated_length": 782.0, |
| "entropy": 0.09271110594272614, |
| "epoch": 0.08396124865446716, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.921875, |
| "learning_rate": 9.171151776103337e-06, |
| "loss": -0.0685, |
| "num_tokens": 9483805.0, |
| "reward": 157.17391967773438, |
| "reward_std": 84.85456848144531, |
| "rewards/Rewards/mean": 157.1739044189453, |
| "rewards/Rewards/std": 143.22023010253906, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9963641166687012, |
| "sampling/importance_sampling_ratio/min": 0.022093014791607857, |
| "sampling/sampling_logp_difference/max": 3.8124938011169434, |
| "sampling/sampling_logp_difference/mean": 0.019313348457217216, |
| "step": 78 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2020.0, |
| "completions/mean_length": 1403.2708740234375, |
| "completions/mean_terminated_length": 1360.2889404296875, |
| "completions/min_length": 620.0, |
| "completions/min_terminated_length": 620.0, |
| "entropy": 0.0901971235871315, |
| "epoch": 0.08503767491926803, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.875, |
| "learning_rate": 9.160387513455329e-06, |
| "loss": -0.0746, |
| "num_tokens": 9596264.0, |
| "reward": 170.26272583007812, |
| "reward_std": 78.15837097167969, |
| "rewards/Rewards/mean": 170.26271057128906, |
| "rewards/Rewards/std": 155.24510192871094, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9961694478988647, |
| "sampling/importance_sampling_ratio/min": 0.1316412091255188, |
| "sampling/sampling_logp_difference/max": 2.027675151824951, |
| "sampling/sampling_logp_difference/mean": 0.019312690943479538, |
| "step": 79 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2024.0, |
| "completions/mean_length": 1519.3125, |
| "completions/mean_terminated_length": 1443.7857666015625, |
| "completions/min_length": 915.0, |
| "completions/min_terminated_length": 915.0, |
| "entropy": 0.08893078565597534, |
| "epoch": 0.0861141011840689, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.9140625, |
| "learning_rate": 9.14962325080732e-06, |
| "loss": 0.0103, |
| "num_tokens": 9717743.0, |
| "reward": 170.59756469726562, |
| "reward_std": 144.99819946289062, |
| "rewards/Rewards/mean": 170.59754943847656, |
| "rewards/Rewards/std": 163.2159423828125, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9962536096572876, |
| "sampling/importance_sampling_ratio/min": 0.0001781879982445389, |
| "sampling/sampling_logp_difference/max": 8.632671356201172, |
| "sampling/sampling_logp_difference/mean": 0.019303126260638237, |
| "step": 80 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1666666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1984.0, |
| "completions/mean_length": 1579.3125, |
| "completions/mean_terminated_length": 1485.5750732421875, |
| "completions/min_length": 716.0, |
| "completions/min_terminated_length": 716.0, |
| "entropy": 0.09119614958763123, |
| "epoch": 0.08719052744886975, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.8828125, |
| "learning_rate": 9.138858988159312e-06, |
| "loss": 0.0078, |
| "num_tokens": 9840362.0, |
| "reward": 166.75074768066406, |
| "reward_std": 94.51995086669922, |
| "rewards/Rewards/mean": 166.75071716308594, |
| "rewards/Rewards/std": 144.9541015625, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9964168071746826, |
| "sampling/importance_sampling_ratio/min": 0.019641246646642685, |
| "sampling/sampling_logp_difference/max": 3.9301235675811768, |
| "sampling/sampling_logp_difference/mean": 0.019442304968833923, |
| "step": 81 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2291666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2005.0, |
| "completions/mean_length": 1590.25, |
| "completions/mean_terminated_length": 1454.1622314453125, |
| "completions/min_length": 837.0, |
| "completions/min_terminated_length": 837.0, |
| "entropy": 0.0953579992055893, |
| "epoch": 0.08826695371367062, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.796875, |
| "learning_rate": 9.128094725511302e-06, |
| "loss": -0.0446, |
| "num_tokens": 9969788.0, |
| "reward": 145.84396362304688, |
| "reward_std": 74.80790710449219, |
| "rewards/Rewards/mean": 145.8439483642578, |
| "rewards/Rewards/std": 152.19375610351562, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9961199760437012, |
| "sampling/importance_sampling_ratio/min": 0.04095093160867691, |
| "sampling/sampling_logp_difference/max": 3.195380687713623, |
| "sampling/sampling_logp_difference/mean": 0.0189397893846035, |
| "step": 82 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0416666679084301, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2033.0, |
| "completions/mean_length": 1368.604248046875, |
| "completions/mean_terminated_length": 1339.065185546875, |
| "completions/min_length": 714.0, |
| "completions/min_terminated_length": 714.0, |
| "entropy": 0.09253405779600143, |
| "epoch": 0.08934337997847147, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.9765625, |
| "learning_rate": 9.117330462863294e-06, |
| "loss": -0.0991, |
| "num_tokens": 10077061.0, |
| "reward": 180.68536376953125, |
| "reward_std": 85.10310363769531, |
| "rewards/Rewards/mean": 180.68536376953125, |
| "rewards/Rewards/std": 155.8038330078125, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9964330792427063, |
| "sampling/importance_sampling_ratio/min": 0.03296323120594025, |
| "sampling/sampling_logp_difference/max": 3.412362575531006, |
| "sampling/sampling_logp_difference/mean": 0.019271500408649445, |
| "step": 83 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1041666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1923.0, |
| "completions/mean_length": 1404.5208740234375, |
| "completions/mean_terminated_length": 1329.6976318359375, |
| "completions/min_length": 874.0, |
| "completions/min_terminated_length": 874.0, |
| "entropy": 0.0913185402750969, |
| "epoch": 0.09041980624327234, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.96875, |
| "learning_rate": 9.106566200215286e-06, |
| "loss": -0.0419, |
| "num_tokens": 10195082.0, |
| "reward": 99.5012435913086, |
| "reward_std": 77.8438720703125, |
| "rewards/Rewards/mean": 99.50122833251953, |
| "rewards/Rewards/std": 125.01537322998047, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9956685304641724, |
| "sampling/importance_sampling_ratio/min": 0.12451466172933578, |
| "sampling/sampling_logp_difference/max": 2.083331823348999, |
| "sampling/sampling_logp_difference/mean": 0.019282877445220947, |
| "step": 84 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0416666679084301, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1941.0, |
| "completions/mean_length": 1464.5833740234375, |
| "completions/mean_terminated_length": 1439.2174072265625, |
| "completions/min_length": 893.0, |
| "completions/min_terminated_length": 893.0, |
| "entropy": 0.09081118553876877, |
| "epoch": 0.09149623250807319, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.890625, |
| "learning_rate": 9.095801937567277e-06, |
| "loss": -0.0636, |
| "num_tokens": 10310478.0, |
| "reward": 220.60836791992188, |
| "reward_std": 109.03794860839844, |
| "rewards/Rewards/mean": 220.60838317871094, |
| "rewards/Rewards/std": 164.03514099121094, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9961858987808228, |
| "sampling/importance_sampling_ratio/min": 0.09935268014669418, |
| "sampling/sampling_logp_difference/max": 2.30907940864563, |
| "sampling/sampling_logp_difference/mean": 0.018993422389030457, |
| "step": 85 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.02083333395421505, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2024.0, |
| "completions/mean_length": 1472.6458740234375, |
| "completions/mean_terminated_length": 1460.4041748046875, |
| "completions/min_length": 949.0, |
| "completions/min_terminated_length": 949.0, |
| "entropy": 0.08875827491283417, |
| "epoch": 0.09257265877287406, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.5625, |
| "learning_rate": 9.085037674919269e-06, |
| "loss": -0.0453, |
| "num_tokens": 10435897.0, |
| "reward": 150.59307861328125, |
| "reward_std": 126.29867553710938, |
| "rewards/Rewards/mean": 150.59304809570312, |
| "rewards/Rewards/std": 154.72206115722656, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9966999292373657, |
| "sampling/importance_sampling_ratio/min": 0.01632927730679512, |
| "sampling/sampling_logp_difference/max": 4.114795684814453, |
| "sampling/sampling_logp_difference/mean": 0.018780577927827835, |
| "step": 86 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0833333358168602, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1889.0, |
| "completions/mean_length": 1367.166748046875, |
| "completions/mean_terminated_length": 1305.2728271484375, |
| "completions/min_length": 905.0, |
| "completions/min_terminated_length": 905.0, |
| "entropy": 0.08919569849967957, |
| "epoch": 0.09364908503767493, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.0, |
| "learning_rate": 9.07427341227126e-06, |
| "loss": 0.0005, |
| "num_tokens": 10543251.0, |
| "reward": 135.57723999023438, |
| "reward_std": 127.69003295898438, |
| "rewards/Rewards/mean": 135.5772247314453, |
| "rewards/Rewards/std": 148.4595184326172, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9962979555130005, |
| "sampling/importance_sampling_ratio/min": 0.04543057829141617, |
| "sampling/sampling_logp_difference/max": 3.0915699005126953, |
| "sampling/sampling_logp_difference/mean": 0.018866896629333496, |
| "step": 87 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2041.0, |
| "completions/mean_length": 1396.3958740234375, |
| "completions/mean_terminated_length": 1352.95556640625, |
| "completions/min_length": 837.0, |
| "completions/min_terminated_length": 837.0, |
| "entropy": 0.08955030888319016, |
| "epoch": 0.09472551130247578, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.9140625, |
| "learning_rate": 9.063509149623253e-06, |
| "loss": -0.0019, |
| "num_tokens": 10660648.0, |
| "reward": 151.53836059570312, |
| "reward_std": 118.24325561523438, |
| "rewards/Rewards/mean": 151.53834533691406, |
| "rewards/Rewards/std": 150.38818359375, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9959496259689331, |
| "sampling/importance_sampling_ratio/min": 0.03812497481703758, |
| "sampling/sampling_logp_difference/max": 3.266885757446289, |
| "sampling/sampling_logp_difference/mean": 0.019105281680822372, |
| "step": 88 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1041666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2016.0, |
| "completions/mean_length": 1637.625, |
| "completions/mean_terminated_length": 1589.906982421875, |
| "completions/min_length": 779.0, |
| "completions/min_terminated_length": 779.0, |
| "entropy": 0.09204195439815521, |
| "epoch": 0.09580193756727665, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.71875, |
| "learning_rate": 9.052744886975243e-06, |
| "loss": -0.0224, |
| "num_tokens": 10794700.0, |
| "reward": 168.03155517578125, |
| "reward_std": 66.89749908447266, |
| "rewards/Rewards/mean": 168.0315399169922, |
| "rewards/Rewards/std": 136.70693969726562, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9961581230163574, |
| "sampling/importance_sampling_ratio/min": 0.013110673055052757, |
| "sampling/sampling_logp_difference/max": 4.334328651428223, |
| "sampling/sampling_logp_difference/mean": 0.01949831284582615, |
| "step": 89 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0833333358168602, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2044.0, |
| "completions/mean_length": 1513.25, |
| "completions/mean_terminated_length": 1464.6363525390625, |
| "completions/min_length": 1076.0, |
| "completions/min_terminated_length": 1076.0, |
| "entropy": 0.09165016561746597, |
| "epoch": 0.0968783638320775, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 3.015625, |
| "learning_rate": 9.041980624327234e-06, |
| "loss": -0.0242, |
| "num_tokens": 10915138.0, |
| "reward": 186.45489501953125, |
| "reward_std": 97.13003540039062, |
| "rewards/Rewards/mean": 186.4548797607422, |
| "rewards/Rewards/std": 155.18177795410156, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9963523149490356, |
| "sampling/importance_sampling_ratio/min": 0.029388200491666794, |
| "sampling/sampling_logp_difference/max": 3.5271620750427246, |
| "sampling/sampling_logp_difference/mean": 0.019088543951511383, |
| "step": 90 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0833333358168602, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2022.0, |
| "completions/mean_length": 1557.125, |
| "completions/mean_terminated_length": 1512.5, |
| "completions/min_length": 840.0, |
| "completions/min_terminated_length": 840.0, |
| "entropy": 0.09386443346738815, |
| "epoch": 0.09795479009687837, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.90625, |
| "learning_rate": 9.031216361679226e-06, |
| "loss": -0.0759, |
| "num_tokens": 11036938.0, |
| "reward": 171.71578979492188, |
| "reward_std": 80.66743469238281, |
| "rewards/Rewards/mean": 171.71575927734375, |
| "rewards/Rewards/std": 146.78236389160156, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9958236217498779, |
| "sampling/importance_sampling_ratio/min": 0.07584737241268158, |
| "sampling/sampling_logp_difference/max": 2.5790321826934814, |
| "sampling/sampling_logp_difference/mean": 0.019366014748811722, |
| "step": 91 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1956.0, |
| "completions/mean_length": 1487.229248046875, |
| "completions/mean_terminated_length": 1407.1190185546875, |
| "completions/min_length": 661.0, |
| "completions/min_terminated_length": 661.0, |
| "entropy": 0.09276638925075531, |
| "epoch": 0.09903121636167922, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.9140625, |
| "learning_rate": 9.020452099031218e-06, |
| "loss": -0.0159, |
| "num_tokens": 11153955.0, |
| "reward": 125.07907104492188, |
| "reward_std": 71.97328186035156, |
| "rewards/Rewards/mean": 125.07907104492188, |
| "rewards/Rewards/std": 151.87110900878906, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9967538118362427, |
| "sampling/importance_sampling_ratio/min": 0.027835814282298088, |
| "sampling/sampling_logp_difference/max": 3.5814318656921387, |
| "sampling/sampling_logp_difference/mean": 0.018818777054548264, |
| "step": 92 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1922.0, |
| "completions/max_terminated_length": 1922.0, |
| "completions/mean_length": 1341.75, |
| "completions/mean_terminated_length": 1341.75, |
| "completions/min_length": 740.0, |
| "completions/min_terminated_length": 740.0, |
| "entropy": 0.09067563712596893, |
| "epoch": 0.10010764262648009, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.8984375, |
| "learning_rate": 9.00968783638321e-06, |
| "loss": -0.0657, |
| "num_tokens": 11265063.0, |
| "reward": 160.42709350585938, |
| "reward_std": 79.6560287475586, |
| "rewards/Rewards/mean": 160.4270782470703, |
| "rewards/Rewards/std": 163.8791961669922, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9966100454330444, |
| "sampling/importance_sampling_ratio/min": 0.10168613493442535, |
| "sampling/sampling_logp_difference/max": 2.2858643531799316, |
| "sampling/sampling_logp_difference/mean": 0.018616581335663795, |
| "step": 93 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1458333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1956.0, |
| "completions/mean_length": 1559.2083740234375, |
| "completions/mean_terminated_length": 1475.756103515625, |
| "completions/min_length": 922.0, |
| "completions/min_terminated_length": 922.0, |
| "entropy": 0.09171372652053833, |
| "epoch": 0.10118406889128095, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 1.6953125, |
| "learning_rate": 8.9989235737352e-06, |
| "loss": -0.0215, |
| "num_tokens": 11388661.0, |
| "reward": 180.605712890625, |
| "reward_std": 80.27830505371094, |
| "rewards/Rewards/mean": 180.60569763183594, |
| "rewards/Rewards/std": 162.41473388671875, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9959461688995361, |
| "sampling/importance_sampling_ratio/min": 0.05554138496518135, |
| "sampling/sampling_logp_difference/max": 2.890626907348633, |
| "sampling/sampling_logp_difference/mean": 0.019266359508037567, |
| "step": 94 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1966.0, |
| "completions/mean_length": 1645.0, |
| "completions/mean_terminated_length": 1552.0, |
| "completions/min_length": 981.0, |
| "completions/min_terminated_length": 981.0, |
| "entropy": 0.09587572515010834, |
| "epoch": 0.10226049515608181, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.6796875, |
| "learning_rate": 8.988159311087191e-06, |
| "loss": -0.0386, |
| "num_tokens": 11516587.0, |
| "reward": 217.25149536132812, |
| "reward_std": 120.60152435302734, |
| "rewards/Rewards/mean": 217.25148010253906, |
| "rewards/Rewards/std": 168.1180419921875, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9957050085067749, |
| "sampling/importance_sampling_ratio/min": 5.652933850797126e-06, |
| "sampling/sampling_logp_difference/max": 12.083335876464844, |
| "sampling/sampling_logp_difference/mean": 0.019527170807123184, |
| "step": 95 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2291666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2038.0, |
| "completions/mean_length": 1637.8125, |
| "completions/mean_terminated_length": 1515.8648681640625, |
| "completions/min_length": 920.0, |
| "completions/min_terminated_length": 920.0, |
| "entropy": 0.09848017990589142, |
| "epoch": 0.10333692142088267, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.6875, |
| "learning_rate": 8.977395048439183e-06, |
| "loss": 0.0114, |
| "num_tokens": 11647348.0, |
| "reward": 197.4237060546875, |
| "reward_std": 117.47871398925781, |
| "rewards/Rewards/mean": 197.4237060546875, |
| "rewards/Rewards/std": 161.23277282714844, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9960770606994629, |
| "sampling/importance_sampling_ratio/min": 0.04941500723361969, |
| "sampling/sampling_logp_difference/max": 3.0075011253356934, |
| "sampling/sampling_logp_difference/mean": 0.019542451947927475, |
| "step": 96 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0416666679084301, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2029.0, |
| "completions/mean_length": 1499.0625, |
| "completions/mean_terminated_length": 1475.1956787109375, |
| "completions/min_length": 908.0, |
| "completions/min_terminated_length": 908.0, |
| "entropy": 0.09346417337656021, |
| "epoch": 0.10441334768568353, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.7734375, |
| "learning_rate": 8.966630785791175e-06, |
| "loss": -0.0408, |
| "num_tokens": 11764441.0, |
| "reward": 278.59368896484375, |
| "reward_std": 64.89412689208984, |
| "rewards/Rewards/mean": 278.5936584472656, |
| "rewards/Rewards/std": 126.29338073730469, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9967272281646729, |
| "sampling/importance_sampling_ratio/min": 0.004661599174141884, |
| "sampling/sampling_logp_difference/max": 5.368396759033203, |
| "sampling/sampling_logp_difference/mean": 0.01916693150997162, |
| "step": 97 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2006.0, |
| "completions/mean_length": 1479.6875, |
| "completions/mean_terminated_length": 1441.800048828125, |
| "completions/min_length": 986.0, |
| "completions/min_terminated_length": 986.0, |
| "entropy": 0.09089180082082748, |
| "epoch": 0.1054897739504844, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.9296875, |
| "learning_rate": 8.955866523143165e-06, |
| "loss": -0.0522, |
| "num_tokens": 11879656.0, |
| "reward": 149.2887725830078, |
| "reward_std": 110.22410583496094, |
| "rewards/Rewards/mean": 149.28875732421875, |
| "rewards/Rewards/std": 156.80722045898438, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9960883855819702, |
| "sampling/importance_sampling_ratio/min": 0.06272123008966446, |
| "sampling/sampling_logp_difference/max": 2.7690553665161133, |
| "sampling/sampling_logp_difference/mean": 0.019509928300976753, |
| "step": 98 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1041666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1982.0, |
| "completions/mean_length": 1593.229248046875, |
| "completions/mean_terminated_length": 1540.348876953125, |
| "completions/min_length": 1045.0, |
| "completions/min_terminated_length": 1045.0, |
| "entropy": 0.09180265665054321, |
| "epoch": 0.10656620021528525, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.78125, |
| "learning_rate": 8.945102260495156e-06, |
| "loss": -0.0186, |
| "num_tokens": 12014289.0, |
| "reward": 265.54229736328125, |
| "reward_std": 133.96429443359375, |
| "rewards/Rewards/mean": 265.5422668457031, |
| "rewards/Rewards/std": 150.7986297607422, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9964120984077454, |
| "sampling/importance_sampling_ratio/min": 0.0691010132431984, |
| "sampling/sampling_logp_difference/max": 2.6721858978271484, |
| "sampling/sampling_logp_difference/mean": 0.019088027998805046, |
| "step": 99 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.25, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2021.0, |
| "completions/mean_length": 1706.375, |
| "completions/mean_terminated_length": 1592.5, |
| "completions/min_length": 991.0, |
| "completions/min_terminated_length": 991.0, |
| "entropy": 0.093202605843544, |
| "epoch": 0.10764262648008611, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.6953125, |
| "learning_rate": 8.934337997847148e-06, |
| "loss": 0.0266, |
| "num_tokens": 12145557.0, |
| "reward": 247.9436492919922, |
| "reward_std": 136.01905822753906, |
| "rewards/Rewards/mean": 247.94361877441406, |
| "rewards/Rewards/std": 162.2523651123047, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9966781139373779, |
| "sampling/importance_sampling_ratio/min": 0.05028429627418518, |
| "sampling/sampling_logp_difference/max": 2.9900624752044678, |
| "sampling/sampling_logp_difference/mean": 0.018772196024656296, |
| "step": 100 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1041666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2029.0, |
| "completions/mean_length": 1523.0, |
| "completions/mean_terminated_length": 1461.9534912109375, |
| "completions/min_length": 1013.0, |
| "completions/min_terminated_length": 1013.0, |
| "entropy": 0.08810561150312424, |
| "epoch": 0.10871905274488698, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.7421875, |
| "learning_rate": 8.92357373519914e-06, |
| "loss": -0.0335, |
| "num_tokens": 12264243.0, |
| "reward": 94.70065307617188, |
| "reward_std": 89.24530029296875, |
| "rewards/Rewards/mean": 94.70065307617188, |
| "rewards/Rewards/std": 126.46951293945312, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9960950016975403, |
| "sampling/importance_sampling_ratio/min": 0.006951752584427595, |
| "sampling/sampling_logp_difference/max": 4.968761444091797, |
| "sampling/sampling_logp_difference/mean": 0.018199002370238304, |
| "step": 101 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.25, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2034.0, |
| "completions/mean_length": 1659.354248046875, |
| "completions/mean_terminated_length": 1529.8055419921875, |
| "completions/min_length": 1055.0, |
| "completions/min_terminated_length": 1055.0, |
| "entropy": 0.0948924720287323, |
| "epoch": 0.10979547900968784, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.8671875, |
| "learning_rate": 8.91280947255113e-06, |
| "loss": 0.0178, |
| "num_tokens": 12395852.0, |
| "reward": 133.68142700195312, |
| "reward_std": 94.77001190185547, |
| "rewards/Rewards/mean": 133.68141174316406, |
| "rewards/Rewards/std": 151.79200744628906, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9962817430496216, |
| "sampling/importance_sampling_ratio/min": 0.04595468193292618, |
| "sampling/sampling_logp_difference/max": 3.080099582672119, |
| "sampling/sampling_logp_difference/mean": 0.01914885640144348, |
| "step": 102 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1666666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2039.0, |
| "completions/mean_length": 1663.416748046875, |
| "completions/mean_terminated_length": 1586.5, |
| "completions/min_length": 1071.0, |
| "completions/min_terminated_length": 1071.0, |
| "entropy": 0.09139697253704071, |
| "epoch": 0.1108719052744887, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.7890625, |
| "learning_rate": 8.902045209903121e-06, |
| "loss": 0.0123, |
| "num_tokens": 12527806.0, |
| "reward": 232.5415496826172, |
| "reward_std": 101.91683197021484, |
| "rewards/Rewards/mean": 232.5415496826172, |
| "rewards/Rewards/std": 157.9981689453125, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9964860677719116, |
| "sampling/importance_sampling_ratio/min": 0.10882613807916641, |
| "sampling/sampling_logp_difference/max": 2.218003749847412, |
| "sampling/sampling_logp_difference/mean": 0.018667157739400864, |
| "step": 103 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1930.0, |
| "completions/mean_length": 1590.5208740234375, |
| "completions/mean_terminated_length": 1484.94873046875, |
| "completions/min_length": 849.0, |
| "completions/min_terminated_length": 849.0, |
| "entropy": 0.0869491845369339, |
| "epoch": 0.11194833153928956, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.90625, |
| "learning_rate": 8.891280947255115e-06, |
| "loss": -0.0273, |
| "num_tokens": 12652877.0, |
| "reward": 154.14407348632812, |
| "reward_std": 89.59563446044922, |
| "rewards/Rewards/mean": 154.14405822753906, |
| "rewards/Rewards/std": 153.5826873779297, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9963630437850952, |
| "sampling/importance_sampling_ratio/min": 0.02814963459968567, |
| "sampling/sampling_logp_difference/max": 3.570220947265625, |
| "sampling/sampling_logp_difference/mean": 0.018812354654073715, |
| "step": 104 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1041666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1974.0, |
| "completions/mean_length": 1532.1875, |
| "completions/mean_terminated_length": 1472.2093505859375, |
| "completions/min_length": 865.0, |
| "completions/min_terminated_length": 865.0, |
| "entropy": 0.0883987694978714, |
| "epoch": 0.11302475780409042, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.9140625, |
| "learning_rate": 8.880516684607105e-06, |
| "loss": -0.0401, |
| "num_tokens": 12775958.0, |
| "reward": 93.6148681640625, |
| "reward_std": 73.73970794677734, |
| "rewards/Rewards/mean": 93.6148681640625, |
| "rewards/Rewards/std": 122.41616821289062, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9962837100028992, |
| "sampling/importance_sampling_ratio/min": 0.05158340558409691, |
| "sampling/sampling_logp_difference/max": 2.964555263519287, |
| "sampling/sampling_logp_difference/mean": 0.019515471532940865, |
| "step": 105 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0416666679084301, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1987.0, |
| "completions/mean_length": 1416.25, |
| "completions/mean_terminated_length": 1388.7825927734375, |
| "completions/min_length": 630.0, |
| "completions/min_terminated_length": 630.0, |
| "entropy": 0.08332017064094543, |
| "epoch": 0.11410118406889128, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.171875, |
| "learning_rate": 8.869752421959097e-06, |
| "loss": -0.0793, |
| "num_tokens": 12886424.0, |
| "reward": 168.2129669189453, |
| "reward_std": 76.80313110351562, |
| "rewards/Rewards/mean": 168.21295166015625, |
| "rewards/Rewards/std": 158.4163818359375, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9966524243354797, |
| "sampling/importance_sampling_ratio/min": 0.007026932667940855, |
| "sampling/sampling_logp_difference/max": 4.958004951477051, |
| "sampling/sampling_logp_difference/mean": 0.018798114731907845, |
| "step": 106 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1666666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1885.0, |
| "completions/mean_length": 1549.7708740234375, |
| "completions/mean_terminated_length": 1450.125, |
| "completions/min_length": 920.0, |
| "completions/min_terminated_length": 920.0, |
| "entropy": 0.08735213428735733, |
| "epoch": 0.11517761033369214, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.09375, |
| "learning_rate": 8.858988159311088e-06, |
| "loss": -0.0218, |
| "num_tokens": 13005801.0, |
| "reward": 143.79306030273438, |
| "reward_std": 135.1495361328125, |
| "rewards/Rewards/mean": 143.7930450439453, |
| "rewards/Rewards/std": 151.38134765625, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9966720938682556, |
| "sampling/importance_sampling_ratio/min": 0.06602096557617188, |
| "sampling/sampling_logp_difference/max": 2.717782974243164, |
| "sampling/sampling_logp_difference/mean": 0.019426580518484116, |
| "step": 107 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1666666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1987.0, |
| "completions/mean_length": 1655.666748046875, |
| "completions/mean_terminated_length": 1577.2000732421875, |
| "completions/min_length": 1086.0, |
| "completions/min_terminated_length": 1086.0, |
| "entropy": 0.08660842478275299, |
| "epoch": 0.116254036598493, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.7578125, |
| "learning_rate": 8.84822389666308e-06, |
| "loss": -0.0023, |
| "num_tokens": 13134245.0, |
| "reward": 124.0144271850586, |
| "reward_std": 115.87239837646484, |
| "rewards/Rewards/mean": 124.0144271850586, |
| "rewards/Rewards/std": 147.93455505371094, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9962751865386963, |
| "sampling/importance_sampling_ratio/min": 0.07169795781373978, |
| "sampling/sampling_logp_difference/max": 2.6352930068969727, |
| "sampling/sampling_logp_difference/mean": 0.01898413896560669, |
| "step": 108 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2042.0, |
| "completions/mean_length": 1540.375, |
| "completions/mean_terminated_length": 1506.5333251953125, |
| "completions/min_length": 834.0, |
| "completions/min_terminated_length": 834.0, |
| "entropy": 0.08225899934768677, |
| "epoch": 0.11733046286329386, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.9140625, |
| "learning_rate": 8.837459634015072e-06, |
| "loss": -0.0635, |
| "num_tokens": 13252091.0, |
| "reward": 172.61041259765625, |
| "reward_std": 81.08895874023438, |
| "rewards/Rewards/mean": 172.61041259765625, |
| "rewards/Rewards/std": 158.3402099609375, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9964280128479004, |
| "sampling/importance_sampling_ratio/min": 0.024863438680768013, |
| "sampling/sampling_logp_difference/max": 3.694356918334961, |
| "sampling/sampling_logp_difference/mean": 0.018688606098294258, |
| "step": 109 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1041666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2005.0, |
| "completions/mean_length": 1607.8125, |
| "completions/mean_terminated_length": 1556.6279296875, |
| "completions/min_length": 957.0, |
| "completions/min_terminated_length": 957.0, |
| "entropy": 0.08497870713472366, |
| "epoch": 0.11840688912809473, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.8515625, |
| "learning_rate": 8.826695371367062e-06, |
| "loss": 0.0312, |
| "num_tokens": 13378280.0, |
| "reward": 159.2774658203125, |
| "reward_std": 63.95246505737305, |
| "rewards/Rewards/mean": 159.2774658203125, |
| "rewards/Rewards/std": 141.60948181152344, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9966553449630737, |
| "sampling/importance_sampling_ratio/min": 0.05783185735344887, |
| "sampling/sampling_logp_difference/max": 2.850215435028076, |
| "sampling/sampling_logp_difference/mean": 0.01872144639492035, |
| "step": 110 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.5, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2044.0, |
| "completions/mean_length": 1874.041748046875, |
| "completions/mean_terminated_length": 1700.0833740234375, |
| "completions/min_length": 1228.0, |
| "completions/min_terminated_length": 1228.0, |
| "entropy": 0.09004805237054825, |
| "epoch": 0.11948331539289558, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.578125, |
| "learning_rate": 8.815931108719053e-06, |
| "loss": 0.0354, |
| "num_tokens": 13519228.0, |
| "reward": 84.75015258789062, |
| "reward_std": 103.92143249511719, |
| "rewards/Rewards/mean": 84.75015258789062, |
| "rewards/Rewards/std": 132.39300537109375, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9962224960327148, |
| "sampling/importance_sampling_ratio/min": 0.06855442374944687, |
| "sampling/sampling_logp_difference/max": 2.6801273822784424, |
| "sampling/sampling_logp_difference/mean": 0.01885523647069931, |
| "step": 111 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1993.0, |
| "completions/mean_length": 1600.479248046875, |
| "completions/mean_terminated_length": 1536.547607421875, |
| "completions/min_length": 807.0, |
| "completions/min_terminated_length": 807.0, |
| "entropy": 0.08414789289236069, |
| "epoch": 0.12055974165769645, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.734375, |
| "learning_rate": 8.805166846071045e-06, |
| "loss": -0.0281, |
| "num_tokens": 13643043.0, |
| "reward": 231.875732421875, |
| "reward_std": 110.37709045410156, |
| "rewards/Rewards/mean": 231.875732421875, |
| "rewards/Rewards/std": 160.89878845214844, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9967577457427979, |
| "sampling/importance_sampling_ratio/min": 0.12920556962490082, |
| "sampling/sampling_logp_difference/max": 2.0463504791259766, |
| "sampling/sampling_logp_difference/mean": 0.017805946990847588, |
| "step": 112 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2708333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2035.0, |
| "completions/mean_length": 1683.8958740234375, |
| "completions/mean_terminated_length": 1548.6571044921875, |
| "completions/min_length": 779.0, |
| "completions/min_terminated_length": 779.0, |
| "entropy": 0.08192656934261322, |
| "epoch": 0.1216361679224973, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.703125, |
| "learning_rate": 8.794402583423037e-06, |
| "loss": -0.0205, |
| "num_tokens": 13764814.0, |
| "reward": 157.37353515625, |
| "reward_std": 125.2473373413086, |
| "rewards/Rewards/mean": 157.37355041503906, |
| "rewards/Rewards/std": 171.18760681152344, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9961519241333008, |
| "sampling/importance_sampling_ratio/min": 0.004844507202506065, |
| "sampling/sampling_logp_difference/max": 5.329909801483154, |
| "sampling/sampling_logp_difference/mean": 0.01832774095237255, |
| "step": 113 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2038.0, |
| "completions/mean_length": 1677.666748046875, |
| "completions/mean_terminated_length": 1624.761962890625, |
| "completions/min_length": 842.0, |
| "completions/min_terminated_length": 842.0, |
| "entropy": 0.08480958640575409, |
| "epoch": 0.12271259418729817, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.7265625, |
| "learning_rate": 8.783638320775027e-06, |
| "loss": -0.0138, |
| "num_tokens": 13891836.0, |
| "reward": 152.83729553222656, |
| "reward_std": 89.38969421386719, |
| "rewards/Rewards/mean": 152.8372802734375, |
| "rewards/Rewards/std": 149.14773559570312, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9965521097183228, |
| "sampling/importance_sampling_ratio/min": 0.019641246646642685, |
| "sampling/sampling_logp_difference/max": 3.9301235675811768, |
| "sampling/sampling_logp_difference/mean": 0.017855927348136902, |
| "step": 114 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2708333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2032.0, |
| "completions/mean_length": 1767.354248046875, |
| "completions/mean_terminated_length": 1663.1142578125, |
| "completions/min_length": 1045.0, |
| "completions/min_terminated_length": 1045.0, |
| "entropy": 0.08714842796325684, |
| "epoch": 0.12378902045209902, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.7421875, |
| "learning_rate": 8.772874058127019e-06, |
| "loss": 0.0045, |
| "num_tokens": 14024063.0, |
| "reward": 144.72988891601562, |
| "reward_std": 145.175048828125, |
| "rewards/Rewards/mean": 144.72987365722656, |
| "rewards/Rewards/std": 159.03140258789062, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.996498703956604, |
| "sampling/importance_sampling_ratio/min": 0.016073117032647133, |
| "sampling/sampling_logp_difference/max": 4.1306071281433105, |
| "sampling/sampling_logp_difference/mean": 0.018725711852312088, |
| "step": 115 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2291666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2044.0, |
| "completions/mean_length": 1498.25, |
| "completions/mean_terminated_length": 1334.810791015625, |
| "completions/min_length": 576.0, |
| "completions/min_terminated_length": 576.0, |
| "entropy": 0.08243508636951447, |
| "epoch": 0.12486544671689989, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.921875, |
| "learning_rate": 8.76210979547901e-06, |
| "loss": -0.0135, |
| "num_tokens": 14142359.0, |
| "reward": 68.60581970214844, |
| "reward_std": 65.1133041381836, |
| "rewards/Rewards/mean": 68.6058120727539, |
| "rewards/Rewards/std": 118.82546997070312, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9966591596603394, |
| "sampling/importance_sampling_ratio/min": 0.06388171017169952, |
| "sampling/sampling_logp_difference/max": 2.7507221698760986, |
| "sampling/sampling_logp_difference/mean": 0.018525827676057816, |
| "step": 116 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.4791666865348816, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2029.0, |
| "completions/mean_length": 1731.3958740234375, |
| "completions/mean_terminated_length": 1440.1199951171875, |
| "completions/min_length": 1004.0, |
| "completions/min_terminated_length": 1004.0, |
| "entropy": 0.08880011737346649, |
| "epoch": 0.12594187298170076, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 1.75, |
| "learning_rate": 8.751345532831002e-06, |
| "loss": 0.0262, |
| "num_tokens": 14271606.0, |
| "reward": 134.94076538085938, |
| "reward_std": 99.54734802246094, |
| "rewards/Rewards/mean": 134.9407501220703, |
| "rewards/Rewards/std": 162.60922241210938, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9966139793395996, |
| "sampling/importance_sampling_ratio/min": 0.030053241178393364, |
| "sampling/sampling_logp_difference/max": 3.5047848224639893, |
| "sampling/sampling_logp_difference/mean": 0.018653348088264465, |
| "step": 117 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2291666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1988.0, |
| "completions/mean_length": 1765.791748046875, |
| "completions/mean_terminated_length": 1681.8919677734375, |
| "completions/min_length": 1199.0, |
| "completions/min_terminated_length": 1199.0, |
| "entropy": 0.08772322535514832, |
| "epoch": 0.12701829924650163, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.796875, |
| "learning_rate": 8.740581270182994e-06, |
| "loss": 0.0118, |
| "num_tokens": 14417078.0, |
| "reward": 165.54493713378906, |
| "reward_std": 86.4625244140625, |
| "rewards/Rewards/mean": 165.54493713378906, |
| "rewards/Rewards/std": 162.67767333984375, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9964956045150757, |
| "sampling/importance_sampling_ratio/min": 0.038425639271736145, |
| "sampling/sampling_logp_difference/max": 3.259030342102051, |
| "sampling/sampling_logp_difference/mean": 0.018552402034401894, |
| "step": 118 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2010.0, |
| "completions/mean_length": 1704.0833740234375, |
| "completions/mean_terminated_length": 1547.757568359375, |
| "completions/min_length": 743.0, |
| "completions/min_terminated_length": 743.0, |
| "entropy": 0.0828559622168541, |
| "epoch": 0.12809472551130247, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.796875, |
| "learning_rate": 8.729817007534984e-06, |
| "loss": 0.0237, |
| "num_tokens": 14543238.0, |
| "reward": 206.37078857421875, |
| "reward_std": 107.7475814819336, |
| "rewards/Rewards/mean": 206.3707733154297, |
| "rewards/Rewards/std": 168.4974822998047, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9965498447418213, |
| "sampling/importance_sampling_ratio/min": 0.03842545673251152, |
| "sampling/sampling_logp_difference/max": 3.259035110473633, |
| "sampling/sampling_logp_difference/mean": 0.018624860793352127, |
| "step": 119 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2708333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1990.0, |
| "completions/mean_length": 1640.2708740234375, |
| "completions/mean_terminated_length": 1488.82861328125, |
| "completions/min_length": 929.0, |
| "completions/min_terminated_length": 929.0, |
| "entropy": 0.08398817479610443, |
| "epoch": 0.12917115177610333, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.84375, |
| "learning_rate": 8.719052744886975e-06, |
| "loss": 0.0249, |
| "num_tokens": 14670391.0, |
| "reward": 110.42204284667969, |
| "reward_std": 105.4375228881836, |
| "rewards/Rewards/mean": 110.42205047607422, |
| "rewards/Rewards/std": 140.33175659179688, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9965760707855225, |
| "sampling/importance_sampling_ratio/min": 0.00865168683230877, |
| "sampling/sampling_logp_difference/max": 4.750000953674316, |
| "sampling/sampling_logp_difference/mean": 0.018549980595707893, |
| "step": 120 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2083333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2018.0, |
| "completions/mean_length": 1704.5208740234375, |
| "completions/mean_terminated_length": 1614.131591796875, |
| "completions/min_length": 1057.0, |
| "completions/min_terminated_length": 1057.0, |
| "entropy": 0.0853036642074585, |
| "epoch": 0.1302475780409042, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.8125, |
| "learning_rate": 8.708288482238967e-06, |
| "loss": 0.0115, |
| "num_tokens": 14794730.0, |
| "reward": 211.4768524169922, |
| "reward_std": 144.12379455566406, |
| "rewards/Rewards/mean": 211.4768524169922, |
| "rewards/Rewards/std": 166.9752197265625, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9965263605117798, |
| "sampling/importance_sampling_ratio/min": 0.016329152509570122, |
| "sampling/sampling_logp_difference/max": 4.114803314208984, |
| "sampling/sampling_logp_difference/mean": 0.018672306090593338, |
| "step": 121 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2291666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2014.0, |
| "completions/mean_length": 1783.541748046875, |
| "completions/mean_terminated_length": 1704.9189453125, |
| "completions/min_length": 1418.0, |
| "completions/min_terminated_length": 1418.0, |
| "entropy": 0.08703920990228653, |
| "epoch": 0.13132400430570507, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.9609375, |
| "learning_rate": 8.697524219590959e-06, |
| "loss": 0.0198, |
| "num_tokens": 14931316.0, |
| "reward": 102.9444580078125, |
| "reward_std": 77.51891326904297, |
| "rewards/Rewards/mean": 102.9444580078125, |
| "rewards/Rewards/std": 117.40369415283203, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9962136149406433, |
| "sampling/importance_sampling_ratio/min": 0.03993840888142586, |
| "sampling/sampling_logp_difference/max": 3.220416784286499, |
| "sampling/sampling_logp_difference/mean": 0.018898990005254745, |
| "step": 122 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.25, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2024.0, |
| "completions/mean_length": 1753.291748046875, |
| "completions/mean_terminated_length": 1655.0555419921875, |
| "completions/min_length": 1161.0, |
| "completions/min_terminated_length": 1161.0, |
| "entropy": 0.08477212488651276, |
| "epoch": 0.13240043057050593, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.8125, |
| "learning_rate": 8.68675995694295e-06, |
| "loss": 0.0348, |
| "num_tokens": 15060072.0, |
| "reward": 185.78619384765625, |
| "reward_std": 137.3616943359375, |
| "rewards/Rewards/mean": 185.7861785888672, |
| "rewards/Rewards/std": 168.95420837402344, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9964303970336914, |
| "sampling/importance_sampling_ratio/min": 6.512909749289975e-05, |
| "sampling/sampling_logp_difference/max": 9.639139175415039, |
| "sampling/sampling_logp_difference/mean": 0.01922321878373623, |
| "step": 123 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.4791666865348816, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2028.0, |
| "completions/mean_length": 1849.8333740234375, |
| "completions/mean_terminated_length": 1667.52001953125, |
| "completions/min_length": 1176.0, |
| "completions/min_terminated_length": 1176.0, |
| "entropy": 0.08716564625501633, |
| "epoch": 0.13347685683530677, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.71875, |
| "learning_rate": 8.675995694294942e-06, |
| "loss": 0.0449, |
| "num_tokens": 15202438.0, |
| "reward": 113.38813781738281, |
| "reward_std": 110.96601867675781, |
| "rewards/Rewards/mean": 113.38814544677734, |
| "rewards/Rewards/std": 153.09771728515625, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9958086013793945, |
| "sampling/importance_sampling_ratio/min": 0.0003843386366497725, |
| "sampling/sampling_logp_difference/max": 7.863986492156982, |
| "sampling/sampling_logp_difference/mean": 0.01934371329843998, |
| "step": 124 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2007.0, |
| "completions/mean_length": 1751.979248046875, |
| "completions/mean_terminated_length": 1617.42431640625, |
| "completions/min_length": 1142.0, |
| "completions/min_terminated_length": 1142.0, |
| "entropy": 0.08353154361248016, |
| "epoch": 0.13455328310010764, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.109375, |
| "learning_rate": 8.665231431646934e-06, |
| "loss": 0.0237, |
| "num_tokens": 15342189.0, |
| "reward": 117.08355712890625, |
| "reward_std": 117.69966888427734, |
| "rewards/Rewards/mean": 117.08355712890625, |
| "rewards/Rewards/std": 142.62982177734375, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9963948130607605, |
| "sampling/importance_sampling_ratio/min": 0.012470904737710953, |
| "sampling/sampling_logp_difference/max": 4.38435697555542, |
| "sampling/sampling_logp_difference/mean": 0.01883932203054428, |
| "step": 125 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1041666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2043.0, |
| "completions/mean_length": 1634.6875, |
| "completions/mean_terminated_length": 1586.6279296875, |
| "completions/min_length": 1052.0, |
| "completions/min_terminated_length": 1052.0, |
| "entropy": 0.08093823492527008, |
| "epoch": 0.1356297093649085, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.875, |
| "learning_rate": 8.654467168998924e-06, |
| "loss": 0.0178, |
| "num_tokens": 15462714.0, |
| "reward": 167.71546936035156, |
| "reward_std": 93.82933044433594, |
| "rewards/Rewards/mean": 167.71546936035156, |
| "rewards/Rewards/std": 150.2532501220703, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9966925382614136, |
| "sampling/importance_sampling_ratio/min": 0.010767911560833454, |
| "sampling/sampling_logp_difference/max": 4.531184673309326, |
| "sampling/sampling_logp_difference/mean": 0.018742073327302933, |
| "step": 126 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1666666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2019.0, |
| "completions/mean_length": 1666.604248046875, |
| "completions/mean_terminated_length": 1590.3250732421875, |
| "completions/min_length": 1007.0, |
| "completions/min_terminated_length": 1007.0, |
| "entropy": 0.08408204466104507, |
| "epoch": 0.13670613562970937, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.84375, |
| "learning_rate": 8.643702906350916e-06, |
| "loss": 0.0025, |
| "num_tokens": 15589055.0, |
| "reward": 123.41200256347656, |
| "reward_std": 110.46016693115234, |
| "rewards/Rewards/mean": 123.4119873046875, |
| "rewards/Rewards/std": 133.6590576171875, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.996285080909729, |
| "sampling/importance_sampling_ratio/min": 0.0035241865552961826, |
| "sampling/sampling_logp_difference/max": 5.648105621337891, |
| "sampling/sampling_logp_difference/mean": 0.019644845277071, |
| "step": 127 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2291666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1874.0, |
| "completions/mean_length": 1431.5, |
| "completions/mean_terminated_length": 1248.2161865234375, |
| "completions/min_length": 842.0, |
| "completions/min_terminated_length": 842.0, |
| "entropy": 0.07925313711166382, |
| "epoch": 0.1377825618945102, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.03125, |
| "learning_rate": 8.632938643702907e-06, |
| "loss": 0.0074, |
| "num_tokens": 15705359.0, |
| "reward": 74.46238708496094, |
| "reward_std": 54.4752197265625, |
| "rewards/Rewards/mean": 74.4623794555664, |
| "rewards/Rewards/std": 118.78749084472656, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9974421262741089, |
| "sampling/importance_sampling_ratio/min": 0.009600045159459114, |
| "sampling/sampling_logp_difference/max": 4.645987510681152, |
| "sampling/sampling_logp_difference/mean": 0.018069487065076828, |
| "step": 128 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2083333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2023.0, |
| "completions/mean_length": 1754.041748046875, |
| "completions/mean_terminated_length": 1676.6842041015625, |
| "completions/min_length": 1185.0, |
| "completions/min_terminated_length": 1185.0, |
| "entropy": 0.08643808960914612, |
| "epoch": 0.13885898815931108, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.8125, |
| "learning_rate": 8.622174381054899e-06, |
| "loss": 0.0347, |
| "num_tokens": 15837865.0, |
| "reward": 207.1958465576172, |
| "reward_std": 120.06201171875, |
| "rewards/Rewards/mean": 207.19581604003906, |
| "rewards/Rewards/std": 165.83035278320312, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.996021568775177, |
| "sampling/importance_sampling_ratio/min": 0.016845213249325752, |
| "sampling/sampling_logp_difference/max": 4.083688735961914, |
| "sampling/sampling_logp_difference/mean": 0.019307805225253105, |
| "step": 129 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2083333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2020.0, |
| "completions/mean_length": 1653.5833740234375, |
| "completions/mean_terminated_length": 1549.7894287109375, |
| "completions/min_length": 1166.0, |
| "completions/min_terminated_length": 1166.0, |
| "entropy": 0.08368074148893356, |
| "epoch": 0.13993541442411195, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.84375, |
| "learning_rate": 8.611410118406889e-06, |
| "loss": 0.0144, |
| "num_tokens": 15970955.0, |
| "reward": 145.67257690429688, |
| "reward_std": 118.706787109375, |
| "rewards/Rewards/mean": 145.6725616455078, |
| "rewards/Rewards/std": 148.27821350097656, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9966948628425598, |
| "sampling/importance_sampling_ratio/min": 0.028941864147782326, |
| "sampling/sampling_logp_difference/max": 3.542466163635254, |
| "sampling/sampling_logp_difference/mean": 0.019158201292157173, |
| "step": 130 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2011.0, |
| "completions/mean_length": 1522.604248046875, |
| "completions/mean_terminated_length": 1447.547607421875, |
| "completions/min_length": 705.0, |
| "completions/min_terminated_length": 705.0, |
| "entropy": 0.08337804675102234, |
| "epoch": 0.14101184068891282, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.96875, |
| "learning_rate": 8.60064585575888e-06, |
| "loss": -0.0715, |
| "num_tokens": 16102654.0, |
| "reward": 183.8970947265625, |
| "reward_std": 76.86428833007812, |
| "rewards/Rewards/mean": 183.8970947265625, |
| "rewards/Rewards/std": 149.77781677246094, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9960099458694458, |
| "sampling/importance_sampling_ratio/min": 0.0454302616417408, |
| "sampling/sampling_logp_difference/max": 3.0915768146514893, |
| "sampling/sampling_logp_difference/mean": 0.01908412016928196, |
| "step": 131 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2000.0, |
| "completions/mean_length": 1695.1875, |
| "completions/mean_terminated_length": 1613.769287109375, |
| "completions/min_length": 1024.0, |
| "completions/min_terminated_length": 1024.0, |
| "entropy": 0.08442177623510361, |
| "epoch": 0.14208826695371368, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 7.53125, |
| "learning_rate": 8.589881593110873e-06, |
| "loss": 0.0203, |
| "num_tokens": 16228399.0, |
| "reward": 132.07305908203125, |
| "reward_std": 133.1316680908203, |
| "rewards/Rewards/mean": 132.07305908203125, |
| "rewards/Rewards/std": 145.59878540039062, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9966284036636353, |
| "sampling/importance_sampling_ratio/min": 0.025022760033607483, |
| "sampling/sampling_logp_difference/max": 3.687969446182251, |
| "sampling/sampling_logp_difference/mean": 0.01852625235915184, |
| "step": 132 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1458333432674408, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2005.0, |
| "completions/mean_length": 1667.0208740234375, |
| "completions/mean_terminated_length": 1601.9755859375, |
| "completions/min_length": 965.0, |
| "completions/min_terminated_length": 965.0, |
| "entropy": 0.07948024570941925, |
| "epoch": 0.14316469321851452, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.0, |
| "learning_rate": 8.579117330462864e-06, |
| "loss": -0.0127, |
| "num_tokens": 16360934.0, |
| "reward": 179.4247283935547, |
| "reward_std": 117.30535888671875, |
| "rewards/Rewards/mean": 179.4247283935547, |
| "rewards/Rewards/std": 155.9678192138672, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9964487552642822, |
| "sampling/importance_sampling_ratio/min": 0.0016366546042263508, |
| "sampling/sampling_logp_difference/max": 6.415101051330566, |
| "sampling/sampling_logp_difference/mean": 0.01843268796801567, |
| "step": 133 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1041666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2005.0, |
| "completions/mean_length": 1544.5833740234375, |
| "completions/mean_terminated_length": 1486.0465087890625, |
| "completions/min_length": 883.0, |
| "completions/min_terminated_length": 883.0, |
| "entropy": 0.08017030358314514, |
| "epoch": 0.1442411194833154, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.8671875, |
| "learning_rate": 8.568353067814856e-06, |
| "loss": -0.0251, |
| "num_tokens": 16483980.0, |
| "reward": 170.14761352539062, |
| "reward_std": 108.76873779296875, |
| "rewards/Rewards/mean": 170.14759826660156, |
| "rewards/Rewards/std": 149.59971618652344, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9965035319328308, |
| "sampling/importance_sampling_ratio/min": 0.06854788213968277, |
| "sampling/sampling_logp_difference/max": 2.680222749710083, |
| "sampling/sampling_logp_difference/mean": 0.018077414482831955, |
| "step": 134 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1880.0, |
| "completions/mean_length": 1547.979248046875, |
| "completions/mean_terminated_length": 1432.5897216796875, |
| "completions/min_length": 712.0, |
| "completions/min_terminated_length": 712.0, |
| "entropy": 0.08219125866889954, |
| "epoch": 0.14531754574811626, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.046875, |
| "learning_rate": 8.557588805166846e-06, |
| "loss": 0.0168, |
| "num_tokens": 16601699.0, |
| "reward": 226.5089111328125, |
| "reward_std": 113.33275604248047, |
| "rewards/Rewards/mean": 226.5089111328125, |
| "rewards/Rewards/std": 153.14732360839844, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9965484142303467, |
| "sampling/importance_sampling_ratio/min": 0.0002600400475785136, |
| "sampling/sampling_logp_difference/max": 8.254674911499023, |
| "sampling/sampling_logp_difference/mean": 0.019097905606031418, |
| "step": 135 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0833333358168602, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2021.0, |
| "completions/mean_length": 1570.8958740234375, |
| "completions/mean_terminated_length": 1527.5228271484375, |
| "completions/min_length": 839.0, |
| "completions/min_terminated_length": 839.0, |
| "entropy": 0.08367850631475449, |
| "epoch": 0.14639397201291712, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.0, |
| "learning_rate": 8.546824542518838e-06, |
| "loss": -0.0181, |
| "num_tokens": 16727862.0, |
| "reward": 169.14260864257812, |
| "reward_std": 109.80867767333984, |
| "rewards/Rewards/mean": 169.14259338378906, |
| "rewards/Rewards/std": 164.3988800048828, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.996330976486206, |
| "sampling/importance_sampling_ratio/min": 0.02022353932261467, |
| "sampling/sampling_logp_difference/max": 3.9009079933166504, |
| "sampling/sampling_logp_difference/mean": 0.01995905488729477, |
| "step": 136 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0833333358168602, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1986.0, |
| "completions/mean_length": 1430.875, |
| "completions/mean_terminated_length": 1374.7728271484375, |
| "completions/min_length": 825.0, |
| "completions/min_terminated_length": 825.0, |
| "entropy": 0.07976692914962769, |
| "epoch": 0.147470398277718, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.078125, |
| "learning_rate": 8.53606027987083e-06, |
| "loss": -0.0286, |
| "num_tokens": 16840806.0, |
| "reward": 150.56967163085938, |
| "reward_std": 123.42110443115234, |
| "rewards/Rewards/mean": 150.5696563720703, |
| "rewards/Rewards/std": 152.9371337890625, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9966201782226562, |
| "sampling/importance_sampling_ratio/min": 0.007347543723881245, |
| "sampling/sampling_logp_difference/max": 4.913389205932617, |
| "sampling/sampling_logp_difference/mean": 0.018820632249116898, |
| "step": 137 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0416666679084301, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2018.0, |
| "completions/mean_length": 1563.604248046875, |
| "completions/mean_terminated_length": 1542.54345703125, |
| "completions/min_length": 681.0, |
| "completions/min_terminated_length": 681.0, |
| "entropy": 0.08118607103824615, |
| "epoch": 0.14854682454251883, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.9453125, |
| "learning_rate": 8.525296017222821e-06, |
| "loss": -0.0781, |
| "num_tokens": 16969061.0, |
| "reward": 178.30577087402344, |
| "reward_std": 63.06797790527344, |
| "rewards/Rewards/mean": 178.3057403564453, |
| "rewards/Rewards/std": 156.191650390625, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9966996908187866, |
| "sampling/importance_sampling_ratio/min": 0.012999716214835644, |
| "sampling/sampling_logp_difference/max": 4.342827796936035, |
| "sampling/sampling_logp_difference/mean": 0.01800641044974327, |
| "step": 138 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.02083333395421505, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2006.0, |
| "completions/mean_length": 1388.0208740234375, |
| "completions/mean_terminated_length": 1373.9786376953125, |
| "completions/min_length": 620.0, |
| "completions/min_terminated_length": 620.0, |
| "entropy": 0.07906337827444077, |
| "epoch": 0.1496232508073197, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.21875, |
| "learning_rate": 8.514531754574811e-06, |
| "loss": -0.0267, |
| "num_tokens": 17080032.0, |
| "reward": 160.73464965820312, |
| "reward_std": 93.3261947631836, |
| "rewards/Rewards/mean": 160.73463439941406, |
| "rewards/Rewards/std": 167.1549072265625, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9969953894615173, |
| "sampling/importance_sampling_ratio/min": 0.07584778964519501, |
| "sampling/sampling_logp_difference/max": 2.579026699066162, |
| "sampling/sampling_logp_difference/mean": 0.019315559417009354, |
| "step": 139 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0416666679084301, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2046.0, |
| "completions/mean_length": 1438.5, |
| "completions/mean_terminated_length": 1412.0, |
| "completions/min_length": 912.0, |
| "completions/min_terminated_length": 912.0, |
| "entropy": 0.08420948684215546, |
| "epoch": 0.15069967707212056, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.1875, |
| "learning_rate": 8.503767491926803e-06, |
| "loss": -0.056, |
| "num_tokens": 17201976.0, |
| "reward": 130.47802734375, |
| "reward_std": 110.41607666015625, |
| "rewards/Rewards/mean": 130.47802734375, |
| "rewards/Rewards/std": 145.10369873046875, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9967747926712036, |
| "sampling/importance_sampling_ratio/min": 0.03695596382021904, |
| "sampling/sampling_logp_difference/max": 3.2980282306671143, |
| "sampling/sampling_logp_difference/mean": 0.019422899931669235, |
| "step": 140 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2030.0, |
| "completions/mean_length": 1597.625, |
| "completions/mean_terminated_length": 1493.6923828125, |
| "completions/min_length": 874.0, |
| "completions/min_terminated_length": 874.0, |
| "entropy": 0.0827246755361557, |
| "epoch": 0.15177610333692143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.1875, |
| "learning_rate": 8.493003229278796e-06, |
| "loss": 0.0049, |
| "num_tokens": 17332008.0, |
| "reward": 83.94515228271484, |
| "reward_std": 65.63279724121094, |
| "rewards/Rewards/mean": 83.94515228271484, |
| "rewards/Rewards/std": 119.19114685058594, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9962602853775024, |
| "sampling/importance_sampling_ratio/min": 0.046488065272569656, |
| "sampling/sampling_logp_difference/max": 3.0685596466064453, |
| "sampling/sampling_logp_difference/mean": 0.019384002313017845, |
| "step": 141 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0416666679084301, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2022.0, |
| "completions/mean_length": 1426.1458740234375, |
| "completions/mean_terminated_length": 1399.1087646484375, |
| "completions/min_length": 822.0, |
| "completions/min_terminated_length": 822.0, |
| "entropy": 0.07838290929794312, |
| "epoch": 0.15285252960172227, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.921875, |
| "learning_rate": 8.482238966630786e-06, |
| "loss": -0.0712, |
| "num_tokens": 17440903.0, |
| "reward": 219.865234375, |
| "reward_std": 75.80467224121094, |
| "rewards/Rewards/mean": 219.865234375, |
| "rewards/Rewards/std": 159.51031494140625, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9964785575866699, |
| "sampling/importance_sampling_ratio/min": 0.06103203073143959, |
| "sampling/sampling_logp_difference/max": 2.796356439590454, |
| "sampling/sampling_logp_difference/mean": 0.01858234778046608, |
| "step": 142 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0416666679084301, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2004.0, |
| "completions/mean_length": 1544.0, |
| "completions/mean_terminated_length": 1522.0870361328125, |
| "completions/min_length": 959.0, |
| "completions/min_terminated_length": 959.0, |
| "entropy": 0.08301036059856415, |
| "epoch": 0.15392895586652314, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.90625, |
| "learning_rate": 8.471474703982778e-06, |
| "loss": -0.0748, |
| "num_tokens": 17561167.0, |
| "reward": 180.9502410888672, |
| "reward_std": 53.65088653564453, |
| "rewards/Rewards/mean": 180.9502410888672, |
| "rewards/Rewards/std": 162.4229278564453, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9965589046478271, |
| "sampling/importance_sampling_ratio/min": 0.056365251541137695, |
| "sampling/sampling_logp_difference/max": 2.8759024143218994, |
| "sampling/sampling_logp_difference/mean": 0.01916421577334404, |
| "step": 143 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0416666679084301, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2012.0, |
| "completions/mean_length": 1522.1458740234375, |
| "completions/mean_terminated_length": 1499.2825927734375, |
| "completions/min_length": 939.0, |
| "completions/min_terminated_length": 939.0, |
| "entropy": 0.08124718070030212, |
| "epoch": 0.155005382131324, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.9296875, |
| "learning_rate": 8.46071044133477e-06, |
| "loss": -0.0565, |
| "num_tokens": 17683118.0, |
| "reward": 225.0555419921875, |
| "reward_std": 92.28500366210938, |
| "rewards/Rewards/mean": 225.0555419921875, |
| "rewards/Rewards/std": 149.1592254638672, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9963716268539429, |
| "sampling/importance_sampling_ratio/min": 0.05687575414776802, |
| "sampling/sampling_logp_difference/max": 2.8668861389160156, |
| "sampling/sampling_logp_difference/mean": 0.01863805204629898, |
| "step": 144 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0416666679084301, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1927.0, |
| "completions/mean_length": 1432.666748046875, |
| "completions/mean_terminated_length": 1405.9130859375, |
| "completions/min_length": 718.0, |
| "completions/min_terminated_length": 718.0, |
| "entropy": 0.08093176782131195, |
| "epoch": 0.15608180839612487, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.078125, |
| "learning_rate": 8.449946178686761e-06, |
| "loss": -0.0745, |
| "num_tokens": 17797018.0, |
| "reward": 135.8857421875, |
| "reward_std": 84.26956176757812, |
| "rewards/Rewards/mean": 135.8857421875, |
| "rewards/Rewards/std": 152.7830047607422, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9965113401412964, |
| "sampling/importance_sampling_ratio/min": 0.03339711204171181, |
| "sampling/sampling_logp_difference/max": 3.3992857933044434, |
| "sampling/sampling_logp_difference/mean": 0.01943063922226429, |
| "step": 145 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1041666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2035.0, |
| "completions/mean_length": 1558.2083740234375, |
| "completions/mean_terminated_length": 1501.255859375, |
| "completions/min_length": 1090.0, |
| "completions/min_terminated_length": 1090.0, |
| "entropy": 0.08431422710418701, |
| "epoch": 0.15715823466092574, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.015625, |
| "learning_rate": 8.439181916038753e-06, |
| "loss": -0.0368, |
| "num_tokens": 17917976.0, |
| "reward": 141.34796142578125, |
| "reward_std": 111.29246520996094, |
| "rewards/Rewards/mean": 141.34796142578125, |
| "rewards/Rewards/std": 151.082763671875, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9961467981338501, |
| "sampling/importance_sampling_ratio/min": 0.05715278163552284, |
| "sampling/sampling_logp_difference/max": 2.862027168273926, |
| "sampling/sampling_logp_difference/mean": 0.01945657841861248, |
| "step": 146 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1907.0, |
| "completions/mean_length": 1446.4583740234375, |
| "completions/mean_terminated_length": 1406.3555908203125, |
| "completions/min_length": 866.0, |
| "completions/min_terminated_length": 866.0, |
| "entropy": 0.0837797150015831, |
| "epoch": 0.15823466092572658, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.125, |
| "learning_rate": 8.428417653390743e-06, |
| "loss": 0.0078, |
| "num_tokens": 18036030.0, |
| "reward": 248.0830078125, |
| "reward_std": 113.72354125976562, |
| "rewards/Rewards/mean": 248.0830078125, |
| "rewards/Rewards/std": 146.14605712890625, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9962608814239502, |
| "sampling/importance_sampling_ratio/min": 0.05039582774043083, |
| "sampling/sampling_logp_difference/max": 2.987846851348877, |
| "sampling/sampling_logp_difference/mean": 0.01956545002758503, |
| "step": 147 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1041666716337204, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2046.0, |
| "completions/mean_length": 1549.5833740234375, |
| "completions/mean_terminated_length": 1491.6279296875, |
| "completions/min_length": 635.0, |
| "completions/min_terminated_length": 635.0, |
| "entropy": 0.08182680606842041, |
| "epoch": 0.15931108719052745, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.9140625, |
| "learning_rate": 8.417653390742735e-06, |
| "loss": -0.0206, |
| "num_tokens": 18167038.0, |
| "reward": 108.25777435302734, |
| "reward_std": 85.30794525146484, |
| "rewards/Rewards/mean": 108.25775909423828, |
| "rewards/Rewards/std": 126.70792388916016, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9965537786483765, |
| "sampling/importance_sampling_ratio/min": 0.03747351095080376, |
| "sampling/sampling_logp_difference/max": 3.284121036529541, |
| "sampling/sampling_logp_difference/mean": 0.01908835396170616, |
| "step": 148 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.02083333395421505, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1947.0, |
| "completions/mean_length": 1398.854248046875, |
| "completions/mean_terminated_length": 1385.04248046875, |
| "completions/min_length": 802.0, |
| "completions/min_terminated_length": 802.0, |
| "entropy": 0.07840481400489807, |
| "epoch": 0.1603875134553283, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.328125, |
| "learning_rate": 8.406889128094727e-06, |
| "loss": -0.0369, |
| "num_tokens": 18287115.0, |
| "reward": 220.73410034179688, |
| "reward_std": 43.89515686035156, |
| "rewards/Rewards/mean": 220.7340850830078, |
| "rewards/Rewards/std": 141.5032958984375, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9964281320571899, |
| "sampling/importance_sampling_ratio/min": 0.03872040659189224, |
| "sampling/sampling_logp_difference/max": 3.2513885498046875, |
| "sampling/sampling_logp_difference/mean": 0.0185480285435915, |
| "step": 149 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2039.0, |
| "completions/mean_length": 1486.75, |
| "completions/mean_terminated_length": 1449.3333740234375, |
| "completions/min_length": 836.0, |
| "completions/min_terminated_length": 836.0, |
| "entropy": 0.07938205450773239, |
| "epoch": 0.16146393972012918, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.21875, |
| "learning_rate": 8.396124865446718e-06, |
| "loss": -0.0306, |
| "num_tokens": 18403377.0, |
| "reward": 237.360595703125, |
| "reward_std": 82.56875610351562, |
| "rewards/Rewards/mean": 237.360595703125, |
| "rewards/Rewards/std": 157.23016357421875, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 0.9965106844902039, |
| "sampling/importance_sampling_ratio/min": 0.03296330198645592, |
| "sampling/sampling_logp_difference/max": 3.412360429763794, |
| "sampling/sampling_logp_difference/mean": 0.019054463133215904, |
| "step": 150 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 929, |
| "num_input_tokens_seen": 18403377, |
| "num_train_epochs": 1, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 6, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|