Qwen3-4B-baseline / trainer_state.json
Mithilss's picture
Add files using upload-large-folder tool
4bf29e7 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.16146393972012918,
"eval_steps": 500,
"global_step": 150,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1674.0,
"completions/max_terminated_length": 1674.0,
"completions/mean_length": 871.875,
"completions/mean_terminated_length": 871.875,
"completions/min_length": 399.0,
"completions/min_terminated_length": 399.0,
"entropy": 0.18132147192955017,
"epoch": 0.001076426264800861,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6484375,
"learning_rate": 1e-05,
"loss": -0.1149,
"num_tokens": 90882.0,
"reward": 42.04616928100586,
"reward_std": 51.792091369628906,
"rewards/Rewards/mean": 42.046173095703125,
"rewards/Rewards/std": 144.7001190185547,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9934591054916382,
"sampling/importance_sampling_ratio/min": 0.08985913544893265,
"sampling/sampling_logp_difference/max": 2.4095120429992676,
"sampling/sampling_logp_difference/mean": 0.02156674861907959,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1762.0,
"completions/max_terminated_length": 1762.0,
"completions/mean_length": 896.2291870117188,
"completions/mean_terminated_length": 896.2291870117188,
"completions/min_length": 468.0,
"completions/min_terminated_length": 468.0,
"entropy": 0.15218639373779297,
"epoch": 0.002152852529601722,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9296875,
"learning_rate": 9.989235737351993e-06,
"loss": -0.1743,
"num_tokens": 176615.0,
"reward": 168.61300659179688,
"reward_std": 52.48692321777344,
"rewards/Rewards/mean": 168.61297607421875,
"rewards/Rewards/std": 160.77197265625,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9949066638946533,
"sampling/importance_sampling_ratio/min": 0.19706952571868896,
"sampling/sampling_logp_difference/max": 1.6241986751556396,
"sampling/sampling_logp_difference/mean": 0.020902985706925392,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1708.0,
"completions/max_terminated_length": 1708.0,
"completions/mean_length": 674.625,
"completions/mean_terminated_length": 674.625,
"completions/min_length": 297.0,
"completions/min_terminated_length": 297.0,
"entropy": 0.10972679406404495,
"epoch": 0.0032292787944025836,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.546875,
"learning_rate": 9.978471474703984e-06,
"loss": -0.1742,
"num_tokens": 255089.0,
"reward": 114.97002410888672,
"reward_std": 28.174999237060547,
"rewards/Rewards/mean": 114.97003173828125,
"rewards/Rewards/std": 152.3834228515625,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9959638714790344,
"sampling/importance_sampling_ratio/min": 0.11370240896940231,
"sampling/sampling_logp_difference/max": 2.174170732498169,
"sampling/sampling_logp_difference/mean": 0.01859966665506363,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1724.0,
"completions/max_terminated_length": 1724.0,
"completions/mean_length": 789.8541870117188,
"completions/mean_terminated_length": 789.8541870117188,
"completions/min_length": 484.0,
"completions/min_terminated_length": 484.0,
"entropy": 0.10646244883537292,
"epoch": 0.004305705059203444,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.359375,
"learning_rate": 9.967707212055974e-06,
"loss": -0.1154,
"num_tokens": 338194.0,
"reward": 225.43746948242188,
"reward_std": 24.845914840698242,
"rewards/Rewards/mean": 225.4374542236328,
"rewards/Rewards/std": 109.95954132080078,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9960993528366089,
"sampling/importance_sampling_ratio/min": 0.10363264381885529,
"sampling/sampling_logp_difference/max": 2.2669029235839844,
"sampling/sampling_logp_difference/mean": 0.01849844679236412,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02083333395421505,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2045.0,
"completions/mean_length": 1027.5208740234375,
"completions/mean_terminated_length": 1005.8084716796875,
"completions/min_length": 355.0,
"completions/min_terminated_length": 355.0,
"entropy": 0.11067882180213928,
"epoch": 0.005382131324004306,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.953125,
"learning_rate": 9.956942949407966e-06,
"loss": -0.1189,
"num_tokens": 433241.0,
"reward": 120.4049072265625,
"reward_std": 54.20726013183594,
"rewards/Rewards/mean": 120.4049072265625,
"rewards/Rewards/std": 154.53668212890625,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9961210489273071,
"sampling/importance_sampling_ratio/min": 0.1546708047389984,
"sampling/sampling_logp_difference/max": 1.8664562702178955,
"sampling/sampling_logp_difference/mean": 0.01938852109014988,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1719.0,
"completions/max_terminated_length": 1719.0,
"completions/mean_length": 1006.4791870117188,
"completions/mean_terminated_length": 1006.4791870117188,
"completions/min_length": 622.0,
"completions/min_terminated_length": 622.0,
"entropy": 0.11143378913402557,
"epoch": 0.006458557588805167,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.109375,
"learning_rate": 9.946178686759958e-06,
"loss": -0.0849,
"num_tokens": 527308.0,
"reward": 137.47239685058594,
"reward_std": 92.79216003417969,
"rewards/Rewards/mean": 137.47238159179688,
"rewards/Rewards/std": 149.94862365722656,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9963271021842957,
"sampling/importance_sampling_ratio/min": 0.09281564503908157,
"sampling/sampling_logp_difference/max": 2.3771400451660156,
"sampling/sampling_logp_difference/mean": 0.019732967019081116,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0416666679084301,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1742.0,
"completions/mean_length": 1053.125,
"completions/mean_terminated_length": 1009.8695678710938,
"completions/min_length": 555.0,
"completions/min_terminated_length": 555.0,
"entropy": 0.10790788382291794,
"epoch": 0.007534983853606028,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.125,
"learning_rate": 9.93541442411195e-06,
"loss": -0.1188,
"num_tokens": 625750.0,
"reward": 123.31615447998047,
"reward_std": 75.22834014892578,
"rewards/Rewards/mean": 123.3161392211914,
"rewards/Rewards/std": 152.19955444335938,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9965001344680786,
"sampling/importance_sampling_ratio/min": 0.10315173864364624,
"sampling/sampling_logp_difference/max": 2.2715542316436768,
"sampling/sampling_logp_difference/mean": 0.01938672922551632,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1666666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1997.0,
"completions/mean_length": 1300.5,
"completions/mean_terminated_length": 1151.0,
"completions/min_length": 540.0,
"completions/min_terminated_length": 540.0,
"entropy": 0.11116605252027512,
"epoch": 0.008611410118406888,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.515625,
"learning_rate": 9.92465016146394e-06,
"loss": -0.008,
"num_tokens": 735160.0,
"reward": 127.77455139160156,
"reward_std": 100.4245834350586,
"rewards/Rewards/mean": 127.77454376220703,
"rewards/Rewards/std": 161.16799926757812,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.996357798576355,
"sampling/importance_sampling_ratio/min": 0.07584778964519501,
"sampling/sampling_logp_difference/max": 2.579026699066162,
"sampling/sampling_logp_difference/mean": 0.01950543373823166,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1982.0,
"completions/mean_length": 1292.666748046875,
"completions/mean_terminated_length": 1242.3111572265625,
"completions/min_length": 615.0,
"completions/min_terminated_length": 615.0,
"entropy": 0.10612079501152039,
"epoch": 0.00968783638320775,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8203125,
"learning_rate": 9.913885898815931e-06,
"loss": -0.0727,
"num_tokens": 843408.0,
"reward": 78.78534698486328,
"reward_std": 78.33949279785156,
"rewards/Rewards/mean": 78.78535461425781,
"rewards/Rewards/std": 138.2502899169922,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9955548048019409,
"sampling/importance_sampling_ratio/min": 0.007099959533661604,
"sampling/sampling_logp_difference/max": 4.947666168212891,
"sampling/sampling_logp_difference/mean": 0.0185045525431633,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2291666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2004.0,
"completions/mean_length": 1513.8958740234375,
"completions/mean_terminated_length": 1355.108154296875,
"completions/min_length": 587.0,
"completions/min_terminated_length": 587.0,
"entropy": 0.10934104025363922,
"epoch": 0.010764262648008612,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6640625,
"learning_rate": 9.903121636167923e-06,
"loss": 0.0008,
"num_tokens": 964243.0,
"reward": 142.08834838867188,
"reward_std": 111.20391082763672,
"rewards/Rewards/mean": 142.08831787109375,
"rewards/Rewards/std": 154.59608459472656,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9958118200302124,
"sampling/importance_sampling_ratio/min": 0.05715308338403702,
"sampling/sampling_logp_difference/max": 2.8620219230651855,
"sampling/sampling_logp_difference/mean": 0.018251175060868263,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2043.0,
"completions/mean_length": 1362.8333740234375,
"completions/mean_terminated_length": 1264.952392578125,
"completions/min_length": 560.0,
"completions/min_terminated_length": 560.0,
"entropy": 0.1081463098526001,
"epoch": 0.011840688912809472,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.765625,
"learning_rate": 9.892357373519915e-06,
"loss": -0.0131,
"num_tokens": 1071719.0,
"reward": 195.29440307617188,
"reward_std": 109.97010803222656,
"rewards/Rewards/mean": 195.29437255859375,
"rewards/Rewards/std": 153.7947540283203,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9960513114929199,
"sampling/importance_sampling_ratio/min": 0.009758302941918373,
"sampling/sampling_logp_difference/max": 4.629636764526367,
"sampling/sampling_logp_difference/mean": 0.019215960055589676,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2916666865348816,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2043.0,
"completions/mean_length": 1750.354248046875,
"completions/mean_terminated_length": 1627.7940673828125,
"completions/min_length": 831.0,
"completions/min_terminated_length": 831.0,
"entropy": 0.11442270129919052,
"epoch": 0.012917115177610334,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4375,
"learning_rate": 9.881593110871906e-06,
"loss": 0.0184,
"num_tokens": 1211050.0,
"reward": 164.88296508789062,
"reward_std": 126.29165649414062,
"rewards/Rewards/mean": 164.88294982910156,
"rewards/Rewards/std": 163.17112731933594,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9958729147911072,
"sampling/importance_sampling_ratio/min": 0.13337160646915436,
"sampling/sampling_logp_difference/max": 2.014616012573242,
"sampling/sampling_logp_difference/mean": 0.018926220014691353,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3333333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1879.0,
"completions/mean_length": 1591.3958740234375,
"completions/mean_terminated_length": 1363.09375,
"completions/min_length": 665.0,
"completions/min_terminated_length": 665.0,
"entropy": 0.10113458335399628,
"epoch": 0.013993541442411194,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4921875,
"learning_rate": 9.870828848223898e-06,
"loss": 0.0255,
"num_tokens": 1332917.0,
"reward": 112.35260009765625,
"reward_std": 93.94807434082031,
"rewards/Rewards/mean": 112.35260009765625,
"rewards/Rewards/std": 141.7664337158203,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9962171912193298,
"sampling/importance_sampling_ratio/min": 0.04543043673038483,
"sampling/sampling_logp_difference/max": 3.0915729999542236,
"sampling/sampling_logp_difference/mean": 0.017922332510352135,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1997.0,
"completions/mean_length": 1511.2708740234375,
"completions/mean_terminated_length": 1387.4102783203125,
"completions/min_length": 651.0,
"completions/min_terminated_length": 651.0,
"entropy": 0.10830745100975037,
"epoch": 0.015069967707212056,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.53125,
"learning_rate": 9.86006458557589e-06,
"loss": -0.0334,
"num_tokens": 1449234.0,
"reward": 123.8992919921875,
"reward_std": 88.19905853271484,
"rewards/Rewards/mean": 123.8992691040039,
"rewards/Rewards/std": 146.86688232421875,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9959671497344971,
"sampling/importance_sampling_ratio/min": 0.18372225761413574,
"sampling/sampling_logp_difference/max": 1.694330096244812,
"sampling/sampling_logp_difference/mean": 0.01905238628387451,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2916666865348816,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2016.0,
"completions/mean_length": 1455.4583740234375,
"completions/mean_terminated_length": 1211.4705810546875,
"completions/min_length": 693.0,
"completions/min_terminated_length": 693.0,
"entropy": 0.10178375989198685,
"epoch": 0.016146393972012917,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7109375,
"learning_rate": 9.84930032292788e-06,
"loss": -0.0285,
"num_tokens": 1566238.0,
"reward": 104.54838562011719,
"reward_std": 63.17893600463867,
"rewards/Rewards/mean": 104.54837036132812,
"rewards/Rewards/std": 155.53961181640625,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9962810277938843,
"sampling/importance_sampling_ratio/min": 0.1905871480703354,
"sampling/sampling_logp_difference/max": 1.6576457023620605,
"sampling/sampling_logp_difference/mean": 0.018221702426671982,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4583333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2045.0,
"completions/mean_length": 1741.9583740234375,
"completions/mean_terminated_length": 1483.0,
"completions/min_length": 720.0,
"completions/min_terminated_length": 720.0,
"entropy": 0.10251286625862122,
"epoch": 0.017222820236813777,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.2421875,
"learning_rate": 9.838536060279871e-06,
"loss": 0.0534,
"num_tokens": 1704008.0,
"reward": 89.67097473144531,
"reward_std": 84.6678695678711,
"rewards/Rewards/mean": 89.67096710205078,
"rewards/Rewards/std": 131.33070373535156,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9964886903762817,
"sampling/importance_sampling_ratio/min": 0.035292141139507294,
"sampling/sampling_logp_difference/max": 3.34409499168396,
"sampling/sampling_logp_difference/mean": 0.01839987002313137,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4583333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2025.0,
"completions/mean_length": 1682.666748046875,
"completions/mean_terminated_length": 1373.5384521484375,
"completions/min_length": 472.0,
"completions/min_terminated_length": 472.0,
"entropy": 0.10447351634502411,
"epoch": 0.01829924650161464,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3984375,
"learning_rate": 9.827771797631863e-06,
"loss": 0.0515,
"num_tokens": 1833718.0,
"reward": 132.6174774169922,
"reward_std": 95.32293701171875,
"rewards/Rewards/mean": 132.6174774169922,
"rewards/Rewards/std": 155.01036071777344,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9960591197013855,
"sampling/importance_sampling_ratio/min": 0.11370313912630081,
"sampling/sampling_logp_difference/max": 2.174164295196533,
"sampling/sampling_logp_difference/mean": 0.01829143613576889,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2916666865348816,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2029.0,
"completions/mean_length": 1645.479248046875,
"completions/mean_terminated_length": 1479.7353515625,
"completions/min_length": 807.0,
"completions/min_terminated_length": 807.0,
"entropy": 0.10255686193704605,
"epoch": 0.0193756727664155,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5234375,
"learning_rate": 9.817007534983855e-06,
"loss": -0.0021,
"num_tokens": 1959159.0,
"reward": 118.39625549316406,
"reward_std": 70.67788696289062,
"rewards/Rewards/mean": 118.396240234375,
"rewards/Rewards/std": 147.87017822265625,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.995873749256134,
"sampling/importance_sampling_ratio/min": 0.04542801156640053,
"sampling/sampling_logp_difference/max": 3.0916264057159424,
"sampling/sampling_logp_difference/mean": 0.018819186836481094,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2708333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1975.0,
"completions/mean_length": 1593.291748046875,
"completions/mean_terminated_length": 1424.4000244140625,
"completions/min_length": 788.0,
"completions/min_terminated_length": 788.0,
"entropy": 0.10415040701627731,
"epoch": 0.02045209903121636,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6796875,
"learning_rate": 9.806243272335847e-06,
"loss": 0.02,
"num_tokens": 2082473.0,
"reward": 192.97549438476562,
"reward_std": 106.2840805053711,
"rewards/Rewards/mean": 192.9755096435547,
"rewards/Rewards/std": 158.14520263671875,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9962199330329895,
"sampling/importance_sampling_ratio/min": 0.15721529722213745,
"sampling/sampling_logp_difference/max": 1.8501391410827637,
"sampling/sampling_logp_difference/mean": 0.018406182527542114,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1458333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2025.0,
"completions/mean_length": 1522.75,
"completions/mean_terminated_length": 1433.0731201171875,
"completions/min_length": 720.0,
"completions/min_terminated_length": 720.0,
"entropy": 0.09845548868179321,
"epoch": 0.021528525296017224,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.625,
"learning_rate": 9.795479009687837e-06,
"loss": -0.0126,
"num_tokens": 2202803.0,
"reward": 203.66549682617188,
"reward_std": 74.4580078125,
"rewards/Rewards/mean": 203.6654815673828,
"rewards/Rewards/std": 151.41436767578125,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9963464736938477,
"sampling/importance_sampling_ratio/min": 0.025838343426585197,
"sampling/sampling_logp_difference/max": 3.655895709991455,
"sampling/sampling_logp_difference/mean": 0.017935417592525482,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2015.0,
"completions/mean_length": 1649.7708740234375,
"completions/mean_terminated_length": 1468.757568359375,
"completions/min_length": 834.0,
"completions/min_terminated_length": 834.0,
"entropy": 0.10193191468715668,
"epoch": 0.022604951560818085,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.640625,
"learning_rate": 9.784714747039828e-06,
"loss": 0.0792,
"num_tokens": 2327952.0,
"reward": 170.25717163085938,
"reward_std": 125.09175872802734,
"rewards/Rewards/mean": 170.2571563720703,
"rewards/Rewards/std": 162.55516052246094,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9963719844818115,
"sampling/importance_sampling_ratio/min": 0.15727432072162628,
"sampling/sampling_logp_difference/max": 1.8497637510299683,
"sampling/sampling_logp_difference/mean": 0.018050983548164368,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2291666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2012.0,
"completions/mean_length": 1579.729248046875,
"completions/mean_terminated_length": 1440.5135498046875,
"completions/min_length": 643.0,
"completions/min_terminated_length": 643.0,
"entropy": 0.10178973525762558,
"epoch": 0.023681377825618945,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7109375,
"learning_rate": 9.77395048439182e-06,
"loss": -0.0035,
"num_tokens": 2447459.0,
"reward": 188.00665283203125,
"reward_std": 95.58663177490234,
"rewards/Rewards/mean": 188.00665283203125,
"rewards/Rewards/std": 158.27713012695312,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9962644577026367,
"sampling/importance_sampling_ratio/min": 0.1137038916349411,
"sampling/sampling_logp_difference/max": 2.1741576194763184,
"sampling/sampling_logp_difference/mean": 0.018942642956972122,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2291666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2047.0,
"completions/mean_length": 1589.791748046875,
"completions/mean_terminated_length": 1453.567626953125,
"completions/min_length": 630.0,
"completions/min_terminated_length": 630.0,
"entropy": 0.10466985404491425,
"epoch": 0.024757804090419805,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.59375,
"learning_rate": 9.763186221743812e-06,
"loss": 0.0422,
"num_tokens": 2565853.0,
"reward": 163.7479248046875,
"reward_std": 122.3080062866211,
"rewards/Rewards/mean": 163.7479248046875,
"rewards/Rewards/std": 170.48175048828125,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9957739114761353,
"sampling/importance_sampling_ratio/min": 0.057831499725580215,
"sampling/sampling_logp_difference/max": 2.850221633911133,
"sampling/sampling_logp_difference/mean": 0.019438397139310837,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2021.0,
"completions/mean_length": 1572.8125,
"completions/mean_terminated_length": 1414.4166259765625,
"completions/min_length": 722.0,
"completions/min_terminated_length": 722.0,
"entropy": 0.1026575118303299,
"epoch": 0.02583423035522067,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.390625,
"learning_rate": 9.752421959095802e-06,
"loss": -0.0083,
"num_tokens": 2690938.0,
"reward": 131.00326538085938,
"reward_std": 95.41658020019531,
"rewards/Rewards/mean": 131.00328063964844,
"rewards/Rewards/std": 156.06686401367188,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9963216781616211,
"sampling/importance_sampling_ratio/min": 0.20426322519779205,
"sampling/sampling_logp_difference/max": 1.5883457660675049,
"sampling/sampling_logp_difference/mean": 0.018099797889590263,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3958333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2013.0,
"completions/mean_length": 1594.0,
"completions/mean_terminated_length": 1296.5517578125,
"completions/min_length": 548.0,
"completions/min_terminated_length": 548.0,
"entropy": 0.10098929703235626,
"epoch": 0.02691065662002153,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.375,
"learning_rate": 9.741657696447793e-06,
"loss": 0.0231,
"num_tokens": 2830276.0,
"reward": 183.90872192382812,
"reward_std": 96.02549743652344,
"rewards/Rewards/mean": 183.90869140625,
"rewards/Rewards/std": 162.77676391601562,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9963605403900146,
"sampling/importance_sampling_ratio/min": 0.06367813050746918,
"sampling/sampling_logp_difference/max": 2.7539141178131104,
"sampling/sampling_logp_difference/mean": 0.01794135943055153,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1666666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1964.0,
"completions/mean_length": 1354.9375,
"completions/mean_terminated_length": 1216.3250732421875,
"completions/min_length": 513.0,
"completions/min_terminated_length": 513.0,
"entropy": 0.09489695727825165,
"epoch": 0.02798708288482239,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8046875,
"learning_rate": 9.730893433799785e-06,
"loss": -0.0773,
"num_tokens": 2943301.0,
"reward": 75.51364135742188,
"reward_std": 65.03620910644531,
"rewards/Rewards/mean": 75.51363372802734,
"rewards/Rewards/std": 119.27247619628906,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9964057207107544,
"sampling/importance_sampling_ratio/min": 0.03296323120594025,
"sampling/sampling_logp_difference/max": 3.412362575531006,
"sampling/sampling_logp_difference/mean": 0.018709205090999603,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2291666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2031.0,
"completions/mean_length": 1607.979248046875,
"completions/mean_terminated_length": 1477.1622314453125,
"completions/min_length": 870.0,
"completions/min_terminated_length": 870.0,
"entropy": 0.10251633822917938,
"epoch": 0.02906350914962325,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4921875,
"learning_rate": 9.720129171151777e-06,
"loss": -0.0425,
"num_tokens": 3064410.0,
"reward": 191.07308959960938,
"reward_std": 90.68852233886719,
"rewards/Rewards/mean": 191.0730743408203,
"rewards/Rewards/std": 165.14541625976562,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9957290291786194,
"sampling/importance_sampling_ratio/min": 0.002294244710355997,
"sampling/sampling_logp_difference/max": 6.0773515701293945,
"sampling/sampling_logp_difference/mean": 0.018729567527770996,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2708333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2009.0,
"completions/mean_length": 1671.2083740234375,
"completions/mean_terminated_length": 1531.2572021484375,
"completions/min_length": 905.0,
"completions/min_terminated_length": 905.0,
"entropy": 0.1027567908167839,
"epoch": 0.030139935414424113,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5703125,
"learning_rate": 9.709364908503769e-06,
"loss": 0.0418,
"num_tokens": 3188380.0,
"reward": 136.59169006347656,
"reward_std": 62.256492614746094,
"rewards/Rewards/mean": 136.5916748046875,
"rewards/Rewards/std": 151.45384216308594,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9961684942245483,
"sampling/importance_sampling_ratio/min": 0.0541137270629406,
"sampling/sampling_logp_difference/max": 2.9166674613952637,
"sampling/sampling_logp_difference/mean": 0.018425792455673218,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3541666865348816,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2048.0,
"completions/mean_length": 1587.7083740234375,
"completions/mean_terminated_length": 1335.290283203125,
"completions/min_length": 547.0,
"completions/min_terminated_length": 547.0,
"entropy": 0.10235518962144852,
"epoch": 0.031216361679224973,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.5859375,
"learning_rate": 9.69860064585576e-06,
"loss": -0.0208,
"num_tokens": 3326762.0,
"reward": 143.9744873046875,
"reward_std": 42.498687744140625,
"rewards/Rewards/mean": 143.97447204589844,
"rewards/Rewards/std": 149.9228973388672,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9966323971748352,
"sampling/importance_sampling_ratio/min": 0.12920065224170685,
"sampling/sampling_logp_difference/max": 2.046388626098633,
"sampling/sampling_logp_difference/mean": 0.018576383590698242,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2291666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2032.0,
"completions/mean_length": 1538.7083740234375,
"completions/mean_terminated_length": 1387.29736328125,
"completions/min_length": 705.0,
"completions/min_terminated_length": 705.0,
"entropy": 0.10081931948661804,
"epoch": 0.03229278794402583,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6640625,
"learning_rate": 9.687836383207752e-06,
"loss": 0.0021,
"num_tokens": 3450126.0,
"reward": 155.98631286621094,
"reward_std": 135.351806640625,
"rewards/Rewards/mean": 155.98631286621094,
"rewards/Rewards/std": 177.09036254882812,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9961260557174683,
"sampling/importance_sampling_ratio/min": 0.0660436749458313,
"sampling/sampling_logp_difference/max": 2.7174389362335205,
"sampling/sampling_logp_difference/mean": 0.018216893076896667,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2042.0,
"completions/mean_length": 1672.3125,
"completions/mean_terminated_length": 1585.6153564453125,
"completions/min_length": 804.0,
"completions/min_terminated_length": 804.0,
"entropy": 0.0999954417347908,
"epoch": 0.03336921420882669,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.015625,
"learning_rate": 9.677072120559744e-06,
"loss": 0.0166,
"num_tokens": 3577095.0,
"reward": 157.2335205078125,
"reward_std": 114.414306640625,
"rewards/Rewards/mean": 157.23350524902344,
"rewards/Rewards/std": 159.78680419921875,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9964659214019775,
"sampling/importance_sampling_ratio/min": 0.019857462495565414,
"sampling/sampling_logp_difference/max": 3.919175386428833,
"sampling/sampling_logp_difference/mean": 0.018892712891101837,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2023.0,
"completions/mean_length": 1509.9583740234375,
"completions/mean_terminated_length": 1433.09521484375,
"completions/min_length": 661.0,
"completions/min_terminated_length": 661.0,
"entropy": 0.10067085921764374,
"epoch": 0.03444564047362755,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7734375,
"learning_rate": 9.666307857911734e-06,
"loss": 0.0353,
"num_tokens": 3701461.0,
"reward": 224.95938110351562,
"reward_std": 124.48177337646484,
"rewards/Rewards/mean": 224.9593505859375,
"rewards/Rewards/std": 151.01626586914062,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9962152242660522,
"sampling/importance_sampling_ratio/min": 0.14445149898529053,
"sampling/sampling_logp_difference/max": 1.9348114728927612,
"sampling/sampling_logp_difference/mean": 0.019468065351247787,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2083333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2021.0,
"completions/mean_length": 1636.104248046875,
"completions/mean_terminated_length": 1527.7105712890625,
"completions/min_length": 636.0,
"completions/min_terminated_length": 636.0,
"entropy": 0.10358748584985733,
"epoch": 0.03552206673842842,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.578125,
"learning_rate": 9.655543595263725e-06,
"loss": 0.0059,
"num_tokens": 3847320.0,
"reward": 138.1477813720703,
"reward_std": 55.14875411987305,
"rewards/Rewards/mean": 138.14776611328125,
"rewards/Rewards/std": 136.7897186279297,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9959272146224976,
"sampling/importance_sampling_ratio/min": 0.12920619547367096,
"sampling/sampling_logp_difference/max": 2.0463457107543945,
"sampling/sampling_logp_difference/mean": 0.018886592239141464,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2044.0,
"completions/mean_length": 1700.0,
"completions/mean_terminated_length": 1491.2000732421875,
"completions/min_length": 777.0,
"completions/min_terminated_length": 777.0,
"entropy": 0.10209772735834122,
"epoch": 0.03659849300322928,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.65625,
"learning_rate": 9.644779332615717e-06,
"loss": -0.0092,
"num_tokens": 3980100.0,
"reward": 121.0146713256836,
"reward_std": 102.0250244140625,
"rewards/Rewards/mean": 121.01465606689453,
"rewards/Rewards/std": 146.56837463378906,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9963019490242004,
"sampling/importance_sampling_ratio/min": 0.085022933781147,
"sampling/sampling_logp_difference/max": 2.6755123138427734,
"sampling/sampling_logp_difference/mean": 0.018831949681043625,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2045.0,
"completions/mean_length": 1679.541748046875,
"completions/mean_terminated_length": 1512.0606689453125,
"completions/min_length": 943.0,
"completions/min_terminated_length": 943.0,
"entropy": 0.1014847606420517,
"epoch": 0.03767491926803014,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6015625,
"learning_rate": 9.634015069967709e-06,
"loss": -0.0016,
"num_tokens": 4113914.0,
"reward": 79.2403564453125,
"reward_std": 42.394386291503906,
"rewards/Rewards/mean": 79.2403564453125,
"rewards/Rewards/std": 115.3869400024414,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9961296319961548,
"sampling/importance_sampling_ratio/min": 0.02546442113816738,
"sampling/sampling_logp_difference/max": 3.670473098754883,
"sampling/sampling_logp_difference/mean": 0.0189987625926733,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2708333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2014.0,
"completions/mean_length": 1772.6458740234375,
"completions/mean_terminated_length": 1670.3714599609375,
"completions/min_length": 1016.0,
"completions/min_terminated_length": 1016.0,
"entropy": 0.1009664312005043,
"epoch": 0.038751345532831,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8671875,
"learning_rate": 9.623250807319699e-06,
"loss": 0.0061,
"num_tokens": 4249329.0,
"reward": 142.5376434326172,
"reward_std": 83.20094299316406,
"rewards/Rewards/mean": 142.5376434326172,
"rewards/Rewards/std": 157.02310180664062,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.996181845664978,
"sampling/importance_sampling_ratio/min": 0.00419560307636857,
"sampling/sampling_logp_difference/max": 5.473718166351318,
"sampling/sampling_logp_difference/mean": 0.01883939653635025,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1458333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1978.0,
"completions/mean_length": 1526.25,
"completions/mean_terminated_length": 1437.170654296875,
"completions/min_length": 756.0,
"completions/min_terminated_length": 756.0,
"entropy": 0.09632325172424316,
"epoch": 0.03982777179763186,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.84375,
"learning_rate": 9.61248654467169e-06,
"loss": -0.0497,
"num_tokens": 4368357.0,
"reward": 153.72262573242188,
"reward_std": 64.56527709960938,
"rewards/Rewards/mean": 153.72262573242188,
"rewards/Rewards/std": 167.7002716064453,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9960031509399414,
"sampling/importance_sampling_ratio/min": 0.013016091659665108,
"sampling/sampling_logp_difference/max": 4.341568946838379,
"sampling/sampling_logp_difference/mean": 0.01888095960021019,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2039.0,
"completions/mean_length": 1351.5833740234375,
"completions/mean_terminated_length": 1190.871826171875,
"completions/min_length": 622.0,
"completions/min_terminated_length": 622.0,
"entropy": 0.09179762005805969,
"epoch": 0.04090419806243272,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8671875,
"learning_rate": 9.601722282023682e-06,
"loss": -0.024,
"num_tokens": 4475905.0,
"reward": 105.28202819824219,
"reward_std": 40.796363830566406,
"rewards/Rewards/mean": 105.28202056884766,
"rewards/Rewards/std": 128.94845581054688,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9963054656982422,
"sampling/importance_sampling_ratio/min": 0.042128000408411026,
"sampling/sampling_logp_difference/max": 3.1670427322387695,
"sampling/sampling_logp_difference/mean": 0.018371229991316795,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1987.0,
"completions/mean_length": 1484.2083740234375,
"completions/mean_terminated_length": 1227.939453125,
"completions/min_length": 615.0,
"completions/min_terminated_length": 615.0,
"entropy": 0.0977960154414177,
"epoch": 0.04198062432723358,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.671875,
"learning_rate": 9.590958019375674e-06,
"loss": -0.0293,
"num_tokens": 4597715.0,
"reward": 184.84185791015625,
"reward_std": 77.36622619628906,
"rewards/Rewards/mean": 184.8418426513672,
"rewards/Rewards/std": 149.9651336669922,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9964928030967712,
"sampling/importance_sampling_ratio/min": 0.0114639513194561,
"sampling/sampling_logp_difference/max": 4.468547821044922,
"sampling/sampling_logp_difference/mean": 0.01864359900355339,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2291666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2040.0,
"completions/mean_length": 1668.7083740234375,
"completions/mean_terminated_length": 1555.9459228515625,
"completions/min_length": 728.0,
"completions/min_terminated_length": 728.0,
"entropy": 0.09704931080341339,
"epoch": 0.04305705059203445,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7421875,
"learning_rate": 9.580193756727666e-06,
"loss": -0.047,
"num_tokens": 4724073.0,
"reward": 123.34667205810547,
"reward_std": 45.43848419189453,
"rewards/Rewards/mean": 123.34667205810547,
"rewards/Rewards/std": 150.5157012939453,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.996220052242279,
"sampling/importance_sampling_ratio/min": 0.09609924256801605,
"sampling/sampling_logp_difference/max": 2.342373847961426,
"sampling/sampling_logp_difference/mean": 0.01860324665904045,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1991.0,
"completions/mean_length": 1652.25,
"completions/mean_terminated_length": 1520.3333740234375,
"completions/min_length": 1028.0,
"completions/min_terminated_length": 1028.0,
"entropy": 0.09834767878055573,
"epoch": 0.04413347685683531,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.671875,
"learning_rate": 9.569429494079656e-06,
"loss": 0.066,
"num_tokens": 4858017.0,
"reward": 152.06527709960938,
"reward_std": 94.49375915527344,
"rewards/Rewards/mean": 152.0652618408203,
"rewards/Rewards/std": 155.73765563964844,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9955779314041138,
"sampling/importance_sampling_ratio/min": 0.016845213249325752,
"sampling/sampling_logp_difference/max": 4.083688735961914,
"sampling/sampling_logp_difference/mean": 0.018709905445575714,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0833333358168602,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2035.0,
"completions/mean_length": 1488.25,
"completions/mean_terminated_length": 1437.3636474609375,
"completions/min_length": 923.0,
"completions/min_terminated_length": 923.0,
"entropy": 0.09486360847949982,
"epoch": 0.04520990312163617,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.828125,
"learning_rate": 9.558665231431647e-06,
"loss": -0.0715,
"num_tokens": 4982931.0,
"reward": 171.48448181152344,
"reward_std": 81.55918884277344,
"rewards/Rewards/mean": 171.48448181152344,
"rewards/Rewards/std": 155.43109130859375,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9963211417198181,
"sampling/importance_sampling_ratio/min": 0.007348766550421715,
"sampling/sampling_logp_difference/max": 4.913222789764404,
"sampling/sampling_logp_difference/mean": 0.018740836530923843,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2003.0,
"completions/mean_length": 1424.3125,
"completions/mean_terminated_length": 1382.7333984375,
"completions/min_length": 729.0,
"completions/min_terminated_length": 729.0,
"entropy": 0.09741601347923279,
"epoch": 0.04628632938643703,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.859375,
"learning_rate": 9.547900968783639e-06,
"loss": -0.0045,
"num_tokens": 5097222.0,
"reward": 211.81884765625,
"reward_std": 101.86795043945312,
"rewards/Rewards/mean": 211.81884765625,
"rewards/Rewards/std": 150.73831176757812,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9959251284599304,
"sampling/importance_sampling_ratio/min": 0.03727293387055397,
"sampling/sampling_logp_difference/max": 3.289487838745117,
"sampling/sampling_logp_difference/mean": 0.019588638097047806,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1041666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1989.0,
"completions/mean_length": 1534.8333740234375,
"completions/mean_terminated_length": 1475.162841796875,
"completions/min_length": 738.0,
"completions/min_terminated_length": 738.0,
"entropy": 0.09020587801933289,
"epoch": 0.04736275565123789,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.578125,
"learning_rate": 9.53713670613563e-06,
"loss": -0.0717,
"num_tokens": 5224438.0,
"reward": 110.83538818359375,
"reward_std": 79.80099487304688,
"rewards/Rewards/mean": 110.83538818359375,
"rewards/Rewards/std": 144.04507446289062,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9962780475616455,
"sampling/importance_sampling_ratio/min": 0.04941500723361969,
"sampling/sampling_logp_difference/max": 3.0075011253356934,
"sampling/sampling_logp_difference/mean": 0.017612557858228683,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2024.0,
"completions/mean_length": 1616.25,
"completions/mean_terminated_length": 1472.3333740234375,
"completions/min_length": 673.0,
"completions/min_terminated_length": 673.0,
"entropy": 0.09505733847618103,
"epoch": 0.04843918191603875,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.75,
"learning_rate": 9.52637244348762e-06,
"loss": -0.0156,
"num_tokens": 5360650.0,
"reward": 135.80593872070312,
"reward_std": 77.96055603027344,
"rewards/Rewards/mean": 135.80592346191406,
"rewards/Rewards/std": 162.7121124267578,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9965173006057739,
"sampling/importance_sampling_ratio/min": 0.09018614143133163,
"sampling/sampling_logp_difference/max": 2.405879497528076,
"sampling/sampling_logp_difference/mean": 0.018498722463846207,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2708333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2027.0,
"completions/mean_length": 1686.229248046875,
"completions/mean_terminated_length": 1551.857177734375,
"completions/min_length": 947.0,
"completions/min_terminated_length": 947.0,
"entropy": 0.09737753868103027,
"epoch": 0.04951560818083961,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9140625,
"learning_rate": 9.515608180839613e-06,
"loss": -0.0086,
"num_tokens": 5487141.0,
"reward": 54.04819869995117,
"reward_std": 46.59992218017578,
"rewards/Rewards/mean": 54.04819869995117,
"rewards/Rewards/std": 70.76473999023438,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9959502220153809,
"sampling/importance_sampling_ratio/min": 2.4299904907820746e-05,
"sampling/sampling_logp_difference/max": 10.625038146972656,
"sampling/sampling_logp_difference/mean": 0.01930040866136551,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1981.0,
"completions/mean_length": 1558.2708740234375,
"completions/mean_terminated_length": 1488.3095703125,
"completions/min_length": 770.0,
"completions/min_terminated_length": 770.0,
"entropy": 0.10088413953781128,
"epoch": 0.05059203444564048,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.75,
"learning_rate": 9.504843918191604e-06,
"loss": -0.0587,
"num_tokens": 5617066.0,
"reward": 110.54548645019531,
"reward_std": 60.73766326904297,
"rewards/Rewards/mean": 110.54547882080078,
"rewards/Rewards/std": 137.8954620361328,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9965789318084717,
"sampling/importance_sampling_ratio/min": 0.030534274876117706,
"sampling/sampling_logp_difference/max": 3.488905429840088,
"sampling/sampling_logp_difference/mean": 0.018777910619974136,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1458333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2047.0,
"completions/mean_length": 1724.7708740234375,
"completions/mean_terminated_length": 1669.5853271484375,
"completions/min_length": 1103.0,
"completions/min_terminated_length": 1103.0,
"entropy": 0.0975833609700203,
"epoch": 0.05166846071044134,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5859375,
"learning_rate": 9.494079655543596e-06,
"loss": 0.0125,
"num_tokens": 5746925.0,
"reward": 165.01998901367188,
"reward_std": 72.62326049804688,
"rewards/Rewards/mean": 165.01995849609375,
"rewards/Rewards/std": 155.69970703125,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9964895248413086,
"sampling/importance_sampling_ratio/min": 0.0012031460646539927,
"sampling/sampling_logp_difference/max": 6.72281551361084,
"sampling/sampling_logp_difference/mean": 0.018396669998764992,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1952.0,
"completions/mean_length": 1696.4375,
"completions/mean_terminated_length": 1615.3077392578125,
"completions/min_length": 1103.0,
"completions/min_terminated_length": 1103.0,
"entropy": 0.10078628361225128,
"epoch": 0.0527448869752422,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7890625,
"learning_rate": 9.483315392895588e-06,
"loss": -0.0133,
"num_tokens": 5888714.0,
"reward": 121.20083618164062,
"reward_std": 103.41592407226562,
"rewards/Rewards/mean": 121.2008285522461,
"rewards/Rewards/std": 139.4957733154297,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9960814714431763,
"sampling/importance_sampling_ratio/min": 0.01684529334306717,
"sampling/sampling_logp_difference/max": 4.083683967590332,
"sampling/sampling_logp_difference/mean": 0.019092433154582977,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2036.0,
"completions/mean_length": 1389.416748046875,
"completions/mean_terminated_length": 1345.5111083984375,
"completions/min_length": 643.0,
"completions/min_terminated_length": 643.0,
"entropy": 0.09157487750053406,
"epoch": 0.05382131324004306,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.953125,
"learning_rate": 9.47255113024758e-06,
"loss": -0.0557,
"num_tokens": 5997592.0,
"reward": 144.40060424804688,
"reward_std": 57.521636962890625,
"rewards/Rewards/mean": 144.4005889892578,
"rewards/Rewards/std": 166.4522247314453,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9964755773544312,
"sampling/importance_sampling_ratio/min": 0.038425933569669724,
"sampling/sampling_logp_difference/max": 3.2590227127075195,
"sampling/sampling_logp_difference/mean": 0.018797706812620163,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1458333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2037.0,
"completions/mean_length": 1519.3958740234375,
"completions/mean_terminated_length": 1429.146240234375,
"completions/min_length": 624.0,
"completions/min_terminated_length": 624.0,
"entropy": 0.09402786195278168,
"epoch": 0.05489773950484392,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6875,
"learning_rate": 9.461786867599571e-06,
"loss": -0.0112,
"num_tokens": 6117167.0,
"reward": 212.0793914794922,
"reward_std": 130.76829528808594,
"rewards/Rewards/mean": 212.0793914794922,
"rewards/Rewards/std": 167.76551818847656,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9960064888000488,
"sampling/importance_sampling_ratio/min": 0.0003944017516914755,
"sampling/sampling_logp_difference/max": 7.838140487670898,
"sampling/sampling_logp_difference/mean": 0.018857020884752274,
"step": 51
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1041666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2031.0,
"completions/mean_length": 1518.0,
"completions/mean_terminated_length": 1456.3720703125,
"completions/min_length": 671.0,
"completions/min_terminated_length": 671.0,
"entropy": 0.10275686532258987,
"epoch": 0.05597416576964478,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6484375,
"learning_rate": 9.451022604951561e-06,
"loss": -0.0197,
"num_tokens": 6234137.0,
"reward": 163.3297882080078,
"reward_std": 112.94554138183594,
"rewards/Rewards/mean": 163.3297882080078,
"rewards/Rewards/std": 149.05389404296875,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9961435794830322,
"sampling/importance_sampling_ratio/min": 0.020013727247714996,
"sampling/sampling_logp_difference/max": 3.911336898803711,
"sampling/sampling_logp_difference/mean": 0.01944819837808609,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2017.0,
"completions/mean_length": 1657.875,
"completions/mean_terminated_length": 1567.84619140625,
"completions/min_length": 663.0,
"completions/min_terminated_length": 663.0,
"entropy": 0.09955208003520966,
"epoch": 0.05705059203444564,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5859375,
"learning_rate": 9.440258342303553e-06,
"loss": 0.0054,
"num_tokens": 6360593.0,
"reward": 186.9022216796875,
"reward_std": 92.23899841308594,
"rewards/Rewards/mean": 186.90220642089844,
"rewards/Rewards/std": 152.16409301757812,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9962495565414429,
"sampling/importance_sampling_ratio/min": 0.03338921442627907,
"sampling/sampling_logp_difference/max": 3.399522304534912,
"sampling/sampling_logp_difference/mean": 0.018768608570098877,
"step": 53
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0416666679084301,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1974.0,
"completions/mean_length": 1413.104248046875,
"completions/mean_terminated_length": 1385.5,
"completions/min_length": 738.0,
"completions/min_terminated_length": 738.0,
"entropy": 0.10132155567407608,
"epoch": 0.0581270182992465,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9609375,
"learning_rate": 9.429494079655545e-06,
"loss": -0.026,
"num_tokens": 6476044.0,
"reward": 105.79423522949219,
"reward_std": 78.7714614868164,
"rewards/Rewards/mean": 105.7942123413086,
"rewards/Rewards/std": 127.42036437988281,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.995726466178894,
"sampling/importance_sampling_ratio/min": 0.02481662854552269,
"sampling/sampling_logp_difference/max": 3.6962413787841797,
"sampling/sampling_logp_difference/mean": 0.020124241709709167,
"step": 54
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2029.0,
"completions/mean_length": 1417.166748046875,
"completions/mean_terminated_length": 1375.111083984375,
"completions/min_length": 668.0,
"completions/min_terminated_length": 668.0,
"entropy": 0.09215769916772842,
"epoch": 0.059203444564047365,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.96875,
"learning_rate": 9.418729817007536e-06,
"loss": -0.0679,
"num_tokens": 6588270.0,
"reward": 135.9485626220703,
"reward_std": 112.82146453857422,
"rewards/Rewards/mean": 135.9485626220703,
"rewards/Rewards/std": 160.89108276367188,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9962545037269592,
"sampling/importance_sampling_ratio/min": 0.030534042045474052,
"sampling/sampling_logp_difference/max": 3.488913059234619,
"sampling/sampling_logp_difference/mean": 0.0190061517059803,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2916666865348816,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2028.0,
"completions/mean_length": 1753.0208740234375,
"completions/mean_terminated_length": 1631.558837890625,
"completions/min_length": 1058.0,
"completions/min_terminated_length": 1058.0,
"entropy": 0.10360866785049438,
"epoch": 0.060279870828848225,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6875,
"learning_rate": 9.407965554359528e-06,
"loss": 0.0544,
"num_tokens": 6725311.0,
"reward": 150.8729705810547,
"reward_std": 118.97310638427734,
"rewards/Rewards/mean": 150.87294006347656,
"rewards/Rewards/std": 161.175537109375,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9956259727478027,
"sampling/importance_sampling_ratio/min": 0.015444685705006123,
"sampling/sampling_logp_difference/max": 4.170490264892578,
"sampling/sampling_logp_difference/mean": 0.019745338708162308,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1458333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2029.0,
"completions/mean_length": 1575.666748046875,
"completions/mean_terminated_length": 1495.0242919921875,
"completions/min_length": 802.0,
"completions/min_terminated_length": 802.0,
"entropy": 0.09608377516269684,
"epoch": 0.061356297093649086,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6953125,
"learning_rate": 9.397201291711518e-06,
"loss": -0.0215,
"num_tokens": 6850545.0,
"reward": 178.48880004882812,
"reward_std": 125.83811950683594,
"rewards/Rewards/mean": 178.4888153076172,
"rewards/Rewards/std": 155.5189971923828,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9961973428726196,
"sampling/importance_sampling_ratio/min": 0.06855442374944687,
"sampling/sampling_logp_difference/max": 2.6801273822784424,
"sampling/sampling_logp_difference/mean": 0.018560823053121567,
"step": 57
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2026.0,
"completions/mean_length": 1514.7708740234375,
"completions/mean_terminated_length": 1438.59521484375,
"completions/min_length": 665.0,
"completions/min_terminated_length": 665.0,
"entropy": 0.09563355147838593,
"epoch": 0.062432723358449946,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9296875,
"learning_rate": 9.38643702906351e-06,
"loss": -0.0427,
"num_tokens": 6978388.0,
"reward": 117.72795104980469,
"reward_std": 99.36922454833984,
"rewards/Rewards/mean": 117.72793579101562,
"rewards/Rewards/std": 152.76295471191406,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9963783025741577,
"sampling/importance_sampling_ratio/min": 0.023247025907039642,
"sampling/sampling_logp_difference/max": 3.76157808303833,
"sampling/sampling_logp_difference/mean": 0.019177014008164406,
"step": 58
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1458333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1973.0,
"completions/mean_length": 1525.479248046875,
"completions/mean_terminated_length": 1436.2681884765625,
"completions/min_length": 863.0,
"completions/min_terminated_length": 863.0,
"entropy": 0.09957332164049149,
"epoch": 0.06350914962325081,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.59375,
"learning_rate": 9.375672766415501e-06,
"loss": -0.033,
"num_tokens": 7098075.0,
"reward": 209.320068359375,
"reward_std": 75.28262329101562,
"rewards/Rewards/mean": 209.320068359375,
"rewards/Rewards/std": 161.8440399169922,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9965529441833496,
"sampling/importance_sampling_ratio/min": 0.07584026455879211,
"sampling/sampling_logp_difference/max": 2.5791258811950684,
"sampling/sampling_logp_difference/mean": 0.019289657473564148,
"step": 59
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1666666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2004.0,
"completions/mean_length": 1673.8125,
"completions/mean_terminated_length": 1598.9749755859375,
"completions/min_length": 869.0,
"completions/min_terminated_length": 869.0,
"entropy": 0.09496274590492249,
"epoch": 0.06458557588805167,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.65625,
"learning_rate": 9.364908503767493e-06,
"loss": 0.0237,
"num_tokens": 7235586.0,
"reward": 95.792724609375,
"reward_std": 94.22042083740234,
"rewards/Rewards/mean": 95.79271697998047,
"rewards/Rewards/std": 115.19368743896484,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9958664774894714,
"sampling/importance_sampling_ratio/min": 0.026444947347044945,
"sampling/sampling_logp_difference/max": 3.632690191268921,
"sampling/sampling_logp_difference/mean": 0.019015762954950333,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1992.0,
"completions/mean_length": 1550.666748046875,
"completions/mean_terminated_length": 1479.6190185546875,
"completions/min_length": 767.0,
"completions/min_terminated_length": 767.0,
"entropy": 0.097409687936306,
"epoch": 0.06566200215285253,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7109375,
"learning_rate": 9.354144241119483e-06,
"loss": 0.0211,
"num_tokens": 7361786.0,
"reward": 134.22329711914062,
"reward_std": 93.79661560058594,
"rewards/Rewards/mean": 134.22328186035156,
"rewards/Rewards/std": 143.19869995117188,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9962544441223145,
"sampling/importance_sampling_ratio/min": 0.030053241178393364,
"sampling/sampling_logp_difference/max": 3.5047848224639893,
"sampling/sampling_logp_difference/mean": 0.01930471695959568,
"step": 61
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2028.0,
"completions/mean_length": 1623.479248046875,
"completions/mean_terminated_length": 1562.8333740234375,
"completions/min_length": 1080.0,
"completions/min_terminated_length": 1080.0,
"entropy": 0.09310100972652435,
"epoch": 0.06673842841765339,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.296875,
"learning_rate": 9.343379978471475e-06,
"loss": 0.0072,
"num_tokens": 7488655.0,
"reward": 212.15054321289062,
"reward_std": 121.80223846435547,
"rewards/Rewards/mean": 212.15052795410156,
"rewards/Rewards/std": 165.37892150878906,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9959063529968262,
"sampling/importance_sampling_ratio/min": 0.02023465186357498,
"sampling/sampling_logp_difference/max": 3.9003586769104004,
"sampling/sampling_logp_difference/mean": 0.01847892440855503,
"step": 62
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2041.0,
"completions/mean_length": 1498.416748046875,
"completions/mean_terminated_length": 1419.90478515625,
"completions/min_length": 833.0,
"completions/min_terminated_length": 833.0,
"entropy": 0.09334105253219604,
"epoch": 0.06781485468245425,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8046875,
"learning_rate": 9.332615715823467e-06,
"loss": -0.0597,
"num_tokens": 7605093.0,
"reward": 122.33708953857422,
"reward_std": 82.52408599853516,
"rewards/Rewards/mean": 122.33708953857422,
"rewards/Rewards/std": 139.99977111816406,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9959535002708435,
"sampling/importance_sampling_ratio/min": 0.012991238385438919,
"sampling/sampling_logp_difference/max": 4.343480110168457,
"sampling/sampling_logp_difference/mean": 0.01902196742594242,
"step": 63
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2083333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1966.0,
"completions/mean_length": 1604.5833740234375,
"completions/mean_terminated_length": 1487.894775390625,
"completions/min_length": 762.0,
"completions/min_terminated_length": 762.0,
"entropy": 0.09379036724567413,
"epoch": 0.0688912809472551,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7265625,
"learning_rate": 9.321851453175458e-06,
"loss": 0.027,
"num_tokens": 7734385.0,
"reward": 152.49241638183594,
"reward_std": 91.75732421875,
"rewards/Rewards/mean": 152.49241638183594,
"rewards/Rewards/std": 155.2235107421875,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9959923028945923,
"sampling/importance_sampling_ratio/min": 0.0454302616417408,
"sampling/sampling_logp_difference/max": 3.0915768146514893,
"sampling/sampling_logp_difference/mean": 0.019177217036485672,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1458333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1992.0,
"completions/mean_length": 1404.854248046875,
"completions/mean_terminated_length": 1295.0487060546875,
"completions/min_length": 602.0,
"completions/min_terminated_length": 602.0,
"entropy": 0.09235261380672455,
"epoch": 0.06996770721205597,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.015625,
"learning_rate": 9.31108719052745e-06,
"loss": 0.0104,
"num_tokens": 7854288.0,
"reward": 142.0937957763672,
"reward_std": 94.76582336425781,
"rewards/Rewards/mean": 142.09378051757812,
"rewards/Rewards/std": 150.158447265625,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9962871074676514,
"sampling/importance_sampling_ratio/min": 0.016520341858267784,
"sampling/sampling_logp_difference/max": 4.10316276550293,
"sampling/sampling_logp_difference/mean": 0.019058652222156525,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2291666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2039.0,
"completions/mean_length": 1712.75,
"completions/mean_terminated_length": 1613.0811767578125,
"completions/min_length": 964.0,
"completions/min_terminated_length": 964.0,
"entropy": 0.09805743396282196,
"epoch": 0.07104413347685684,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5625,
"learning_rate": 9.30032292787944e-06,
"loss": 0.0247,
"num_tokens": 7991970.0,
"reward": 109.45711517333984,
"reward_std": 96.96968841552734,
"rewards/Rewards/mean": 109.45711517333984,
"rewards/Rewards/std": 129.8672637939453,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9963368773460388,
"sampling/importance_sampling_ratio/min": 0.03675702214241028,
"sampling/sampling_logp_difference/max": 3.3034260272979736,
"sampling/sampling_logp_difference/mean": 0.018985003232955933,
"step": 66
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0833333358168602,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2042.0,
"completions/mean_length": 1568.0833740234375,
"completions/mean_terminated_length": 1524.45458984375,
"completions/min_length": 920.0,
"completions/min_terminated_length": 920.0,
"entropy": 0.09471037983894348,
"epoch": 0.0721205597416577,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.625,
"learning_rate": 9.289558665231433e-06,
"loss": -0.0555,
"num_tokens": 8115184.0,
"reward": 217.14907836914062,
"reward_std": 111.93682861328125,
"rewards/Rewards/mean": 217.1490478515625,
"rewards/Rewards/std": 165.21253967285156,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9962868690490723,
"sampling/importance_sampling_ratio/min": 0.025223759934306145,
"sampling/sampling_logp_difference/max": 3.67996883392334,
"sampling/sampling_logp_difference/mean": 0.018947413191199303,
"step": 67
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3541666865348816,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2040.0,
"completions/mean_length": 1741.041748046875,
"completions/mean_terminated_length": 1572.7095947265625,
"completions/min_length": 599.0,
"completions/min_terminated_length": 599.0,
"entropy": 0.09793736040592194,
"epoch": 0.07319698600645856,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6953125,
"learning_rate": 9.278794402583423e-06,
"loss": -0.0115,
"num_tokens": 8246124.0,
"reward": 127.42999267578125,
"reward_std": 106.5477294921875,
"rewards/Rewards/mean": 127.42999267578125,
"rewards/Rewards/std": 152.0902099609375,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.995684027671814,
"sampling/importance_sampling_ratio/min": 0.07607259601354599,
"sampling/sampling_logp_difference/max": 2.5760672092437744,
"sampling/sampling_logp_difference/mean": 0.019500732421875,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1458333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1925.0,
"completions/mean_length": 1410.291748046875,
"completions/mean_terminated_length": 1301.41455078125,
"completions/min_length": 671.0,
"completions/min_terminated_length": 671.0,
"entropy": 0.09403970837593079,
"epoch": 0.07427341227125941,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.90625,
"learning_rate": 9.268030139935415e-06,
"loss": -0.0468,
"num_tokens": 8355194.0,
"reward": 137.93809509277344,
"reward_std": 79.21134948730469,
"rewards/Rewards/mean": 137.93809509277344,
"rewards/Rewards/std": 153.5146484375,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9967970252037048,
"sampling/importance_sampling_ratio/min": 0.017168184742331505,
"sampling/sampling_logp_difference/max": 4.064697265625,
"sampling/sampling_logp_difference/mean": 0.01961737498641014,
"step": 69
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3333333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2012.0,
"completions/mean_length": 1782.0208740234375,
"completions/mean_terminated_length": 1649.03125,
"completions/min_length": 1140.0,
"completions/min_terminated_length": 1140.0,
"entropy": 0.09640659391880035,
"epoch": 0.07534983853606028,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6640625,
"learning_rate": 9.257265877287407e-06,
"loss": 0.0284,
"num_tokens": 8492217.0,
"reward": 138.16677856445312,
"reward_std": 84.71370697021484,
"rewards/Rewards/mean": 138.16676330566406,
"rewards/Rewards/std": 155.8728790283203,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9962026476860046,
"sampling/importance_sampling_ratio/min": 0.08669225126504898,
"sampling/sampling_logp_difference/max": 2.4453907012939453,
"sampling/sampling_logp_difference/mean": 0.018790341913700104,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1666666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2040.0,
"completions/mean_length": 1621.8333740234375,
"completions/mean_terminated_length": 1536.5999755859375,
"completions/min_length": 657.0,
"completions/min_terminated_length": 657.0,
"entropy": 0.09194771200418472,
"epoch": 0.07642626480086114,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.96875,
"learning_rate": 9.246501614639399e-06,
"loss": -0.01,
"num_tokens": 8614717.0,
"reward": 162.2823944091797,
"reward_std": 151.30052185058594,
"rewards/Rewards/mean": 162.28236389160156,
"rewards/Rewards/std": 162.62136840820312,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9964730143547058,
"sampling/importance_sampling_ratio/min": 0.10786180943250656,
"sampling/sampling_logp_difference/max": 2.2269043922424316,
"sampling/sampling_logp_difference/mean": 0.018355056643486023,
"step": 71
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1990.0,
"completions/mean_length": 1564.0833740234375,
"completions/mean_terminated_length": 1494.952392578125,
"completions/min_length": 832.0,
"completions/min_terminated_length": 832.0,
"entropy": 0.09191286563873291,
"epoch": 0.077502691065662,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7421875,
"learning_rate": 9.23573735199139e-06,
"loss": -0.0089,
"num_tokens": 8739977.0,
"reward": 202.41168212890625,
"reward_std": 113.517822265625,
"rewards/Rewards/mean": 202.41168212890625,
"rewards/Rewards/std": 167.29736328125,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9957892894744873,
"sampling/importance_sampling_ratio/min": 0.08669490367174149,
"sampling/sampling_logp_difference/max": 2.4453601837158203,
"sampling/sampling_logp_difference/mean": 0.018984520807862282,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0833333358168602,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1973.0,
"completions/mean_length": 1552.041748046875,
"completions/mean_terminated_length": 1506.95458984375,
"completions/min_length": 943.0,
"completions/min_terminated_length": 943.0,
"entropy": 0.09602253884077072,
"epoch": 0.07857911733046287,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.828125,
"learning_rate": 9.22497308934338e-06,
"loss": 0.0344,
"num_tokens": 8861137.0,
"reward": 157.66354370117188,
"reward_std": 144.0146942138672,
"rewards/Rewards/mean": 157.66354370117188,
"rewards/Rewards/std": 143.1285858154297,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9959803819656372,
"sampling/importance_sampling_ratio/min": 0.00499701825901866,
"sampling/sampling_logp_difference/max": 5.298913955688477,
"sampling/sampling_logp_difference/mean": 0.01990203559398651,
"step": 73
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2083333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2012.0,
"completions/mean_length": 1609.041748046875,
"completions/mean_terminated_length": 1493.5263671875,
"completions/min_length": 942.0,
"completions/min_terminated_length": 942.0,
"entropy": 0.08647584915161133,
"epoch": 0.07965554359526372,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.65625,
"learning_rate": 9.214208826695372e-06,
"loss": -0.0107,
"num_tokens": 8989995.0,
"reward": 197.0958709716797,
"reward_std": 68.34449768066406,
"rewards/Rewards/mean": 197.09584045410156,
"rewards/Rewards/std": 159.35365295410156,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9968025088310242,
"sampling/importance_sampling_ratio/min": 0.0007289634668268263,
"sampling/sampling_logp_difference/max": 7.223886966705322,
"sampling/sampling_logp_difference/mean": 0.018994908779859543,
"step": 74
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1041666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1995.0,
"completions/mean_length": 1519.0,
"completions/mean_terminated_length": 1457.4884033203125,
"completions/min_length": 799.0,
"completions/min_terminated_length": 799.0,
"entropy": 0.09478063881397247,
"epoch": 0.08073196986006459,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8359375,
"learning_rate": 9.203444564047364e-06,
"loss": 0.0352,
"num_tokens": 9115845.0,
"reward": 179.7698211669922,
"reward_std": 126.33210754394531,
"rewards/Rewards/mean": 179.7698211669922,
"rewards/Rewards/std": 156.64407348632812,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9962257146835327,
"sampling/importance_sampling_ratio/min": 0.06889042258262634,
"sampling/sampling_logp_difference/max": 2.6752381324768066,
"sampling/sampling_logp_difference/mean": 0.01957097090780735,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2708333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2014.0,
"completions/mean_length": 1643.166748046875,
"completions/mean_terminated_length": 1492.800048828125,
"completions/min_length": 802.0,
"completions/min_terminated_length": 802.0,
"entropy": 0.09344817698001862,
"epoch": 0.08180839612486544,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.734375,
"learning_rate": 9.192680301399355e-06,
"loss": 0.0195,
"num_tokens": 9247301.0,
"reward": 147.28268432617188,
"reward_std": 124.65312957763672,
"rewards/Rewards/mean": 147.2826690673828,
"rewards/Rewards/std": 159.5615997314453,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9967602491378784,
"sampling/importance_sampling_ratio/min": 0.030053241178393364,
"sampling/sampling_logp_difference/max": 3.5047848224639893,
"sampling/sampling_logp_difference/mean": 0.018918678164482117,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0833333358168602,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1947.0,
"completions/mean_length": 1344.479248046875,
"completions/mean_terminated_length": 1280.522705078125,
"completions/min_length": 529.0,
"completions/min_terminated_length": 529.0,
"entropy": 0.09060220420360565,
"epoch": 0.08288482238966631,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.96875,
"learning_rate": 9.181916038751345e-06,
"loss": -0.0746,
"num_tokens": 9364396.0,
"reward": 199.82870483398438,
"reward_std": 49.06147003173828,
"rewards/Rewards/mean": 199.8286895751953,
"rewards/Rewards/std": 142.2600555419922,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9962332844734192,
"sampling/importance_sampling_ratio/min": 0.021995970979332924,
"sampling/sampling_logp_difference/max": 3.8168959617614746,
"sampling/sampling_logp_difference/mean": 0.018977638334035873,
"step": 77
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02083333395421505,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2015.0,
"completions/mean_length": 1400.1875,
"completions/mean_terminated_length": 1386.4041748046875,
"completions/min_length": 782.0,
"completions/min_terminated_length": 782.0,
"entropy": 0.09271110594272614,
"epoch": 0.08396124865446716,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.921875,
"learning_rate": 9.171151776103337e-06,
"loss": -0.0685,
"num_tokens": 9483805.0,
"reward": 157.17391967773438,
"reward_std": 84.85456848144531,
"rewards/Rewards/mean": 157.1739044189453,
"rewards/Rewards/std": 143.22023010253906,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9963641166687012,
"sampling/importance_sampling_ratio/min": 0.022093014791607857,
"sampling/sampling_logp_difference/max": 3.8124938011169434,
"sampling/sampling_logp_difference/mean": 0.019313348457217216,
"step": 78
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2020.0,
"completions/mean_length": 1403.2708740234375,
"completions/mean_terminated_length": 1360.2889404296875,
"completions/min_length": 620.0,
"completions/min_terminated_length": 620.0,
"entropy": 0.0901971235871315,
"epoch": 0.08503767491926803,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.875,
"learning_rate": 9.160387513455329e-06,
"loss": -0.0746,
"num_tokens": 9596264.0,
"reward": 170.26272583007812,
"reward_std": 78.15837097167969,
"rewards/Rewards/mean": 170.26271057128906,
"rewards/Rewards/std": 155.24510192871094,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9961694478988647,
"sampling/importance_sampling_ratio/min": 0.1316412091255188,
"sampling/sampling_logp_difference/max": 2.027675151824951,
"sampling/sampling_logp_difference/mean": 0.019312690943479538,
"step": 79
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2024.0,
"completions/mean_length": 1519.3125,
"completions/mean_terminated_length": 1443.7857666015625,
"completions/min_length": 915.0,
"completions/min_terminated_length": 915.0,
"entropy": 0.08893078565597534,
"epoch": 0.0861141011840689,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9140625,
"learning_rate": 9.14962325080732e-06,
"loss": 0.0103,
"num_tokens": 9717743.0,
"reward": 170.59756469726562,
"reward_std": 144.99819946289062,
"rewards/Rewards/mean": 170.59754943847656,
"rewards/Rewards/std": 163.2159423828125,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9962536096572876,
"sampling/importance_sampling_ratio/min": 0.0001781879982445389,
"sampling/sampling_logp_difference/max": 8.632671356201172,
"sampling/sampling_logp_difference/mean": 0.019303126260638237,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1666666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1984.0,
"completions/mean_length": 1579.3125,
"completions/mean_terminated_length": 1485.5750732421875,
"completions/min_length": 716.0,
"completions/min_terminated_length": 716.0,
"entropy": 0.09119614958763123,
"epoch": 0.08719052744886975,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8828125,
"learning_rate": 9.138858988159312e-06,
"loss": 0.0078,
"num_tokens": 9840362.0,
"reward": 166.75074768066406,
"reward_std": 94.51995086669922,
"rewards/Rewards/mean": 166.75071716308594,
"rewards/Rewards/std": 144.9541015625,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9964168071746826,
"sampling/importance_sampling_ratio/min": 0.019641246646642685,
"sampling/sampling_logp_difference/max": 3.9301235675811768,
"sampling/sampling_logp_difference/mean": 0.019442304968833923,
"step": 81
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2291666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2005.0,
"completions/mean_length": 1590.25,
"completions/mean_terminated_length": 1454.1622314453125,
"completions/min_length": 837.0,
"completions/min_terminated_length": 837.0,
"entropy": 0.0953579992055893,
"epoch": 0.08826695371367062,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.796875,
"learning_rate": 9.128094725511302e-06,
"loss": -0.0446,
"num_tokens": 9969788.0,
"reward": 145.84396362304688,
"reward_std": 74.80790710449219,
"rewards/Rewards/mean": 145.8439483642578,
"rewards/Rewards/std": 152.19375610351562,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9961199760437012,
"sampling/importance_sampling_ratio/min": 0.04095093160867691,
"sampling/sampling_logp_difference/max": 3.195380687713623,
"sampling/sampling_logp_difference/mean": 0.0189397893846035,
"step": 82
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0416666679084301,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2033.0,
"completions/mean_length": 1368.604248046875,
"completions/mean_terminated_length": 1339.065185546875,
"completions/min_length": 714.0,
"completions/min_terminated_length": 714.0,
"entropy": 0.09253405779600143,
"epoch": 0.08934337997847147,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9765625,
"learning_rate": 9.117330462863294e-06,
"loss": -0.0991,
"num_tokens": 10077061.0,
"reward": 180.68536376953125,
"reward_std": 85.10310363769531,
"rewards/Rewards/mean": 180.68536376953125,
"rewards/Rewards/std": 155.8038330078125,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9964330792427063,
"sampling/importance_sampling_ratio/min": 0.03296323120594025,
"sampling/sampling_logp_difference/max": 3.412362575531006,
"sampling/sampling_logp_difference/mean": 0.019271500408649445,
"step": 83
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1041666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1923.0,
"completions/mean_length": 1404.5208740234375,
"completions/mean_terminated_length": 1329.6976318359375,
"completions/min_length": 874.0,
"completions/min_terminated_length": 874.0,
"entropy": 0.0913185402750969,
"epoch": 0.09041980624327234,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.96875,
"learning_rate": 9.106566200215286e-06,
"loss": -0.0419,
"num_tokens": 10195082.0,
"reward": 99.5012435913086,
"reward_std": 77.8438720703125,
"rewards/Rewards/mean": 99.50122833251953,
"rewards/Rewards/std": 125.01537322998047,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9956685304641724,
"sampling/importance_sampling_ratio/min": 0.12451466172933578,
"sampling/sampling_logp_difference/max": 2.083331823348999,
"sampling/sampling_logp_difference/mean": 0.019282877445220947,
"step": 84
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0416666679084301,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1941.0,
"completions/mean_length": 1464.5833740234375,
"completions/mean_terminated_length": 1439.2174072265625,
"completions/min_length": 893.0,
"completions/min_terminated_length": 893.0,
"entropy": 0.09081118553876877,
"epoch": 0.09149623250807319,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.890625,
"learning_rate": 9.095801937567277e-06,
"loss": -0.0636,
"num_tokens": 10310478.0,
"reward": 220.60836791992188,
"reward_std": 109.03794860839844,
"rewards/Rewards/mean": 220.60838317871094,
"rewards/Rewards/std": 164.03514099121094,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9961858987808228,
"sampling/importance_sampling_ratio/min": 0.09935268014669418,
"sampling/sampling_logp_difference/max": 2.30907940864563,
"sampling/sampling_logp_difference/mean": 0.018993422389030457,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02083333395421505,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2024.0,
"completions/mean_length": 1472.6458740234375,
"completions/mean_terminated_length": 1460.4041748046875,
"completions/min_length": 949.0,
"completions/min_terminated_length": 949.0,
"entropy": 0.08875827491283417,
"epoch": 0.09257265877287406,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.5625,
"learning_rate": 9.085037674919269e-06,
"loss": -0.0453,
"num_tokens": 10435897.0,
"reward": 150.59307861328125,
"reward_std": 126.29867553710938,
"rewards/Rewards/mean": 150.59304809570312,
"rewards/Rewards/std": 154.72206115722656,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9966999292373657,
"sampling/importance_sampling_ratio/min": 0.01632927730679512,
"sampling/sampling_logp_difference/max": 4.114795684814453,
"sampling/sampling_logp_difference/mean": 0.018780577927827835,
"step": 86
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0833333358168602,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1889.0,
"completions/mean_length": 1367.166748046875,
"completions/mean_terminated_length": 1305.2728271484375,
"completions/min_length": 905.0,
"completions/min_terminated_length": 905.0,
"entropy": 0.08919569849967957,
"epoch": 0.09364908503767493,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.0,
"learning_rate": 9.07427341227126e-06,
"loss": 0.0005,
"num_tokens": 10543251.0,
"reward": 135.57723999023438,
"reward_std": 127.69003295898438,
"rewards/Rewards/mean": 135.5772247314453,
"rewards/Rewards/std": 148.4595184326172,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9962979555130005,
"sampling/importance_sampling_ratio/min": 0.04543057829141617,
"sampling/sampling_logp_difference/max": 3.0915699005126953,
"sampling/sampling_logp_difference/mean": 0.018866896629333496,
"step": 87
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2041.0,
"completions/mean_length": 1396.3958740234375,
"completions/mean_terminated_length": 1352.95556640625,
"completions/min_length": 837.0,
"completions/min_terminated_length": 837.0,
"entropy": 0.08955030888319016,
"epoch": 0.09472551130247578,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9140625,
"learning_rate": 9.063509149623253e-06,
"loss": -0.0019,
"num_tokens": 10660648.0,
"reward": 151.53836059570312,
"reward_std": 118.24325561523438,
"rewards/Rewards/mean": 151.53834533691406,
"rewards/Rewards/std": 150.38818359375,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9959496259689331,
"sampling/importance_sampling_ratio/min": 0.03812497481703758,
"sampling/sampling_logp_difference/max": 3.266885757446289,
"sampling/sampling_logp_difference/mean": 0.019105281680822372,
"step": 88
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1041666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2016.0,
"completions/mean_length": 1637.625,
"completions/mean_terminated_length": 1589.906982421875,
"completions/min_length": 779.0,
"completions/min_terminated_length": 779.0,
"entropy": 0.09204195439815521,
"epoch": 0.09580193756727665,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.71875,
"learning_rate": 9.052744886975243e-06,
"loss": -0.0224,
"num_tokens": 10794700.0,
"reward": 168.03155517578125,
"reward_std": 66.89749908447266,
"rewards/Rewards/mean": 168.0315399169922,
"rewards/Rewards/std": 136.70693969726562,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9961581230163574,
"sampling/importance_sampling_ratio/min": 0.013110673055052757,
"sampling/sampling_logp_difference/max": 4.334328651428223,
"sampling/sampling_logp_difference/mean": 0.01949831284582615,
"step": 89
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0833333358168602,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2044.0,
"completions/mean_length": 1513.25,
"completions/mean_terminated_length": 1464.6363525390625,
"completions/min_length": 1076.0,
"completions/min_terminated_length": 1076.0,
"entropy": 0.09165016561746597,
"epoch": 0.0968783638320775,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.015625,
"learning_rate": 9.041980624327234e-06,
"loss": -0.0242,
"num_tokens": 10915138.0,
"reward": 186.45489501953125,
"reward_std": 97.13003540039062,
"rewards/Rewards/mean": 186.4548797607422,
"rewards/Rewards/std": 155.18177795410156,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9963523149490356,
"sampling/importance_sampling_ratio/min": 0.029388200491666794,
"sampling/sampling_logp_difference/max": 3.5271620750427246,
"sampling/sampling_logp_difference/mean": 0.019088543951511383,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0833333358168602,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2022.0,
"completions/mean_length": 1557.125,
"completions/mean_terminated_length": 1512.5,
"completions/min_length": 840.0,
"completions/min_terminated_length": 840.0,
"entropy": 0.09386443346738815,
"epoch": 0.09795479009687837,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.90625,
"learning_rate": 9.031216361679226e-06,
"loss": -0.0759,
"num_tokens": 11036938.0,
"reward": 171.71578979492188,
"reward_std": 80.66743469238281,
"rewards/Rewards/mean": 171.71575927734375,
"rewards/Rewards/std": 146.78236389160156,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9958236217498779,
"sampling/importance_sampling_ratio/min": 0.07584737241268158,
"sampling/sampling_logp_difference/max": 2.5790321826934814,
"sampling/sampling_logp_difference/mean": 0.019366014748811722,
"step": 91
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1956.0,
"completions/mean_length": 1487.229248046875,
"completions/mean_terminated_length": 1407.1190185546875,
"completions/min_length": 661.0,
"completions/min_terminated_length": 661.0,
"entropy": 0.09276638925075531,
"epoch": 0.09903121636167922,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9140625,
"learning_rate": 9.020452099031218e-06,
"loss": -0.0159,
"num_tokens": 11153955.0,
"reward": 125.07907104492188,
"reward_std": 71.97328186035156,
"rewards/Rewards/mean": 125.07907104492188,
"rewards/Rewards/std": 151.87110900878906,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9967538118362427,
"sampling/importance_sampling_ratio/min": 0.027835814282298088,
"sampling/sampling_logp_difference/max": 3.5814318656921387,
"sampling/sampling_logp_difference/mean": 0.018818777054548264,
"step": 92
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1922.0,
"completions/max_terminated_length": 1922.0,
"completions/mean_length": 1341.75,
"completions/mean_terminated_length": 1341.75,
"completions/min_length": 740.0,
"completions/min_terminated_length": 740.0,
"entropy": 0.09067563712596893,
"epoch": 0.10010764262648009,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8984375,
"learning_rate": 9.00968783638321e-06,
"loss": -0.0657,
"num_tokens": 11265063.0,
"reward": 160.42709350585938,
"reward_std": 79.6560287475586,
"rewards/Rewards/mean": 160.4270782470703,
"rewards/Rewards/std": 163.8791961669922,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9966100454330444,
"sampling/importance_sampling_ratio/min": 0.10168613493442535,
"sampling/sampling_logp_difference/max": 2.2858643531799316,
"sampling/sampling_logp_difference/mean": 0.018616581335663795,
"step": 93
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1458333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1956.0,
"completions/mean_length": 1559.2083740234375,
"completions/mean_terminated_length": 1475.756103515625,
"completions/min_length": 922.0,
"completions/min_terminated_length": 922.0,
"entropy": 0.09171372652053833,
"epoch": 0.10118406889128095,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.6953125,
"learning_rate": 8.9989235737352e-06,
"loss": -0.0215,
"num_tokens": 11388661.0,
"reward": 180.605712890625,
"reward_std": 80.27830505371094,
"rewards/Rewards/mean": 180.60569763183594,
"rewards/Rewards/std": 162.41473388671875,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9959461688995361,
"sampling/importance_sampling_ratio/min": 0.05554138496518135,
"sampling/sampling_logp_difference/max": 2.890626907348633,
"sampling/sampling_logp_difference/mean": 0.019266359508037567,
"step": 94
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1966.0,
"completions/mean_length": 1645.0,
"completions/mean_terminated_length": 1552.0,
"completions/min_length": 981.0,
"completions/min_terminated_length": 981.0,
"entropy": 0.09587572515010834,
"epoch": 0.10226049515608181,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6796875,
"learning_rate": 8.988159311087191e-06,
"loss": -0.0386,
"num_tokens": 11516587.0,
"reward": 217.25149536132812,
"reward_std": 120.60152435302734,
"rewards/Rewards/mean": 217.25148010253906,
"rewards/Rewards/std": 168.1180419921875,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9957050085067749,
"sampling/importance_sampling_ratio/min": 5.652933850797126e-06,
"sampling/sampling_logp_difference/max": 12.083335876464844,
"sampling/sampling_logp_difference/mean": 0.019527170807123184,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2291666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2038.0,
"completions/mean_length": 1637.8125,
"completions/mean_terminated_length": 1515.8648681640625,
"completions/min_length": 920.0,
"completions/min_terminated_length": 920.0,
"entropy": 0.09848017990589142,
"epoch": 0.10333692142088267,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6875,
"learning_rate": 8.977395048439183e-06,
"loss": 0.0114,
"num_tokens": 11647348.0,
"reward": 197.4237060546875,
"reward_std": 117.47871398925781,
"rewards/Rewards/mean": 197.4237060546875,
"rewards/Rewards/std": 161.23277282714844,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9960770606994629,
"sampling/importance_sampling_ratio/min": 0.04941500723361969,
"sampling/sampling_logp_difference/max": 3.0075011253356934,
"sampling/sampling_logp_difference/mean": 0.019542451947927475,
"step": 96
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0416666679084301,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2029.0,
"completions/mean_length": 1499.0625,
"completions/mean_terminated_length": 1475.1956787109375,
"completions/min_length": 908.0,
"completions/min_terminated_length": 908.0,
"entropy": 0.09346417337656021,
"epoch": 0.10441334768568353,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7734375,
"learning_rate": 8.966630785791175e-06,
"loss": -0.0408,
"num_tokens": 11764441.0,
"reward": 278.59368896484375,
"reward_std": 64.89412689208984,
"rewards/Rewards/mean": 278.5936584472656,
"rewards/Rewards/std": 126.29338073730469,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9967272281646729,
"sampling/importance_sampling_ratio/min": 0.004661599174141884,
"sampling/sampling_logp_difference/max": 5.368396759033203,
"sampling/sampling_logp_difference/mean": 0.01916693150997162,
"step": 97
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2006.0,
"completions/mean_length": 1479.6875,
"completions/mean_terminated_length": 1441.800048828125,
"completions/min_length": 986.0,
"completions/min_terminated_length": 986.0,
"entropy": 0.09089180082082748,
"epoch": 0.1054897739504844,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9296875,
"learning_rate": 8.955866523143165e-06,
"loss": -0.0522,
"num_tokens": 11879656.0,
"reward": 149.2887725830078,
"reward_std": 110.22410583496094,
"rewards/Rewards/mean": 149.28875732421875,
"rewards/Rewards/std": 156.80722045898438,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9960883855819702,
"sampling/importance_sampling_ratio/min": 0.06272123008966446,
"sampling/sampling_logp_difference/max": 2.7690553665161133,
"sampling/sampling_logp_difference/mean": 0.019509928300976753,
"step": 98
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1041666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1982.0,
"completions/mean_length": 1593.229248046875,
"completions/mean_terminated_length": 1540.348876953125,
"completions/min_length": 1045.0,
"completions/min_terminated_length": 1045.0,
"entropy": 0.09180265665054321,
"epoch": 0.10656620021528525,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.78125,
"learning_rate": 8.945102260495156e-06,
"loss": -0.0186,
"num_tokens": 12014289.0,
"reward": 265.54229736328125,
"reward_std": 133.96429443359375,
"rewards/Rewards/mean": 265.5422668457031,
"rewards/Rewards/std": 150.7986297607422,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9964120984077454,
"sampling/importance_sampling_ratio/min": 0.0691010132431984,
"sampling/sampling_logp_difference/max": 2.6721858978271484,
"sampling/sampling_logp_difference/mean": 0.019088027998805046,
"step": 99
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2021.0,
"completions/mean_length": 1706.375,
"completions/mean_terminated_length": 1592.5,
"completions/min_length": 991.0,
"completions/min_terminated_length": 991.0,
"entropy": 0.093202605843544,
"epoch": 0.10764262648008611,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6953125,
"learning_rate": 8.934337997847148e-06,
"loss": 0.0266,
"num_tokens": 12145557.0,
"reward": 247.9436492919922,
"reward_std": 136.01905822753906,
"rewards/Rewards/mean": 247.94361877441406,
"rewards/Rewards/std": 162.2523651123047,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9966781139373779,
"sampling/importance_sampling_ratio/min": 0.05028429627418518,
"sampling/sampling_logp_difference/max": 2.9900624752044678,
"sampling/sampling_logp_difference/mean": 0.018772196024656296,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1041666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2029.0,
"completions/mean_length": 1523.0,
"completions/mean_terminated_length": 1461.9534912109375,
"completions/min_length": 1013.0,
"completions/min_terminated_length": 1013.0,
"entropy": 0.08810561150312424,
"epoch": 0.10871905274488698,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7421875,
"learning_rate": 8.92357373519914e-06,
"loss": -0.0335,
"num_tokens": 12264243.0,
"reward": 94.70065307617188,
"reward_std": 89.24530029296875,
"rewards/Rewards/mean": 94.70065307617188,
"rewards/Rewards/std": 126.46951293945312,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9960950016975403,
"sampling/importance_sampling_ratio/min": 0.006951752584427595,
"sampling/sampling_logp_difference/max": 4.968761444091797,
"sampling/sampling_logp_difference/mean": 0.018199002370238304,
"step": 101
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2034.0,
"completions/mean_length": 1659.354248046875,
"completions/mean_terminated_length": 1529.8055419921875,
"completions/min_length": 1055.0,
"completions/min_terminated_length": 1055.0,
"entropy": 0.0948924720287323,
"epoch": 0.10979547900968784,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8671875,
"learning_rate": 8.91280947255113e-06,
"loss": 0.0178,
"num_tokens": 12395852.0,
"reward": 133.68142700195312,
"reward_std": 94.77001190185547,
"rewards/Rewards/mean": 133.68141174316406,
"rewards/Rewards/std": 151.79200744628906,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9962817430496216,
"sampling/importance_sampling_ratio/min": 0.04595468193292618,
"sampling/sampling_logp_difference/max": 3.080099582672119,
"sampling/sampling_logp_difference/mean": 0.01914885640144348,
"step": 102
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1666666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2039.0,
"completions/mean_length": 1663.416748046875,
"completions/mean_terminated_length": 1586.5,
"completions/min_length": 1071.0,
"completions/min_terminated_length": 1071.0,
"entropy": 0.09139697253704071,
"epoch": 0.1108719052744887,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7890625,
"learning_rate": 8.902045209903121e-06,
"loss": 0.0123,
"num_tokens": 12527806.0,
"reward": 232.5415496826172,
"reward_std": 101.91683197021484,
"rewards/Rewards/mean": 232.5415496826172,
"rewards/Rewards/std": 157.9981689453125,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9964860677719116,
"sampling/importance_sampling_ratio/min": 0.10882613807916641,
"sampling/sampling_logp_difference/max": 2.218003749847412,
"sampling/sampling_logp_difference/mean": 0.018667157739400864,
"step": 103
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1930.0,
"completions/mean_length": 1590.5208740234375,
"completions/mean_terminated_length": 1484.94873046875,
"completions/min_length": 849.0,
"completions/min_terminated_length": 849.0,
"entropy": 0.0869491845369339,
"epoch": 0.11194833153928956,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.90625,
"learning_rate": 8.891280947255115e-06,
"loss": -0.0273,
"num_tokens": 12652877.0,
"reward": 154.14407348632812,
"reward_std": 89.59563446044922,
"rewards/Rewards/mean": 154.14405822753906,
"rewards/Rewards/std": 153.5826873779297,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9963630437850952,
"sampling/importance_sampling_ratio/min": 0.02814963459968567,
"sampling/sampling_logp_difference/max": 3.570220947265625,
"sampling/sampling_logp_difference/mean": 0.018812354654073715,
"step": 104
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1041666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1974.0,
"completions/mean_length": 1532.1875,
"completions/mean_terminated_length": 1472.2093505859375,
"completions/min_length": 865.0,
"completions/min_terminated_length": 865.0,
"entropy": 0.0883987694978714,
"epoch": 0.11302475780409042,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9140625,
"learning_rate": 8.880516684607105e-06,
"loss": -0.0401,
"num_tokens": 12775958.0,
"reward": 93.6148681640625,
"reward_std": 73.73970794677734,
"rewards/Rewards/mean": 93.6148681640625,
"rewards/Rewards/std": 122.41616821289062,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9962837100028992,
"sampling/importance_sampling_ratio/min": 0.05158340558409691,
"sampling/sampling_logp_difference/max": 2.964555263519287,
"sampling/sampling_logp_difference/mean": 0.019515471532940865,
"step": 105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0416666679084301,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1987.0,
"completions/mean_length": 1416.25,
"completions/mean_terminated_length": 1388.7825927734375,
"completions/min_length": 630.0,
"completions/min_terminated_length": 630.0,
"entropy": 0.08332017064094543,
"epoch": 0.11410118406889128,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.171875,
"learning_rate": 8.869752421959097e-06,
"loss": -0.0793,
"num_tokens": 12886424.0,
"reward": 168.2129669189453,
"reward_std": 76.80313110351562,
"rewards/Rewards/mean": 168.21295166015625,
"rewards/Rewards/std": 158.4163818359375,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9966524243354797,
"sampling/importance_sampling_ratio/min": 0.007026932667940855,
"sampling/sampling_logp_difference/max": 4.958004951477051,
"sampling/sampling_logp_difference/mean": 0.018798114731907845,
"step": 106
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1666666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1885.0,
"completions/mean_length": 1549.7708740234375,
"completions/mean_terminated_length": 1450.125,
"completions/min_length": 920.0,
"completions/min_terminated_length": 920.0,
"entropy": 0.08735213428735733,
"epoch": 0.11517761033369214,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.09375,
"learning_rate": 8.858988159311088e-06,
"loss": -0.0218,
"num_tokens": 13005801.0,
"reward": 143.79306030273438,
"reward_std": 135.1495361328125,
"rewards/Rewards/mean": 143.7930450439453,
"rewards/Rewards/std": 151.38134765625,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9966720938682556,
"sampling/importance_sampling_ratio/min": 0.06602096557617188,
"sampling/sampling_logp_difference/max": 2.717782974243164,
"sampling/sampling_logp_difference/mean": 0.019426580518484116,
"step": 107
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1666666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1987.0,
"completions/mean_length": 1655.666748046875,
"completions/mean_terminated_length": 1577.2000732421875,
"completions/min_length": 1086.0,
"completions/min_terminated_length": 1086.0,
"entropy": 0.08660842478275299,
"epoch": 0.116254036598493,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7578125,
"learning_rate": 8.84822389666308e-06,
"loss": -0.0023,
"num_tokens": 13134245.0,
"reward": 124.0144271850586,
"reward_std": 115.87239837646484,
"rewards/Rewards/mean": 124.0144271850586,
"rewards/Rewards/std": 147.93455505371094,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9962751865386963,
"sampling/importance_sampling_ratio/min": 0.07169795781373978,
"sampling/sampling_logp_difference/max": 2.6352930068969727,
"sampling/sampling_logp_difference/mean": 0.01898413896560669,
"step": 108
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2042.0,
"completions/mean_length": 1540.375,
"completions/mean_terminated_length": 1506.5333251953125,
"completions/min_length": 834.0,
"completions/min_terminated_length": 834.0,
"entropy": 0.08225899934768677,
"epoch": 0.11733046286329386,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9140625,
"learning_rate": 8.837459634015072e-06,
"loss": -0.0635,
"num_tokens": 13252091.0,
"reward": 172.61041259765625,
"reward_std": 81.08895874023438,
"rewards/Rewards/mean": 172.61041259765625,
"rewards/Rewards/std": 158.3402099609375,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9964280128479004,
"sampling/importance_sampling_ratio/min": 0.024863438680768013,
"sampling/sampling_logp_difference/max": 3.694356918334961,
"sampling/sampling_logp_difference/mean": 0.018688606098294258,
"step": 109
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1041666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2005.0,
"completions/mean_length": 1607.8125,
"completions/mean_terminated_length": 1556.6279296875,
"completions/min_length": 957.0,
"completions/min_terminated_length": 957.0,
"entropy": 0.08497870713472366,
"epoch": 0.11840688912809473,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8515625,
"learning_rate": 8.826695371367062e-06,
"loss": 0.0312,
"num_tokens": 13378280.0,
"reward": 159.2774658203125,
"reward_std": 63.95246505737305,
"rewards/Rewards/mean": 159.2774658203125,
"rewards/Rewards/std": 141.60948181152344,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9966553449630737,
"sampling/importance_sampling_ratio/min": 0.05783185735344887,
"sampling/sampling_logp_difference/max": 2.850215435028076,
"sampling/sampling_logp_difference/mean": 0.01872144639492035,
"step": 110
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2044.0,
"completions/mean_length": 1874.041748046875,
"completions/mean_terminated_length": 1700.0833740234375,
"completions/min_length": 1228.0,
"completions/min_terminated_length": 1228.0,
"entropy": 0.09004805237054825,
"epoch": 0.11948331539289558,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.578125,
"learning_rate": 8.815931108719053e-06,
"loss": 0.0354,
"num_tokens": 13519228.0,
"reward": 84.75015258789062,
"reward_std": 103.92143249511719,
"rewards/Rewards/mean": 84.75015258789062,
"rewards/Rewards/std": 132.39300537109375,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9962224960327148,
"sampling/importance_sampling_ratio/min": 0.06855442374944687,
"sampling/sampling_logp_difference/max": 2.6801273822784424,
"sampling/sampling_logp_difference/mean": 0.01885523647069931,
"step": 111
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1993.0,
"completions/mean_length": 1600.479248046875,
"completions/mean_terminated_length": 1536.547607421875,
"completions/min_length": 807.0,
"completions/min_terminated_length": 807.0,
"entropy": 0.08414789289236069,
"epoch": 0.12055974165769645,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.734375,
"learning_rate": 8.805166846071045e-06,
"loss": -0.0281,
"num_tokens": 13643043.0,
"reward": 231.875732421875,
"reward_std": 110.37709045410156,
"rewards/Rewards/mean": 231.875732421875,
"rewards/Rewards/std": 160.89878845214844,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9967577457427979,
"sampling/importance_sampling_ratio/min": 0.12920556962490082,
"sampling/sampling_logp_difference/max": 2.0463504791259766,
"sampling/sampling_logp_difference/mean": 0.017805946990847588,
"step": 112
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2708333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2035.0,
"completions/mean_length": 1683.8958740234375,
"completions/mean_terminated_length": 1548.6571044921875,
"completions/min_length": 779.0,
"completions/min_terminated_length": 779.0,
"entropy": 0.08192656934261322,
"epoch": 0.1216361679224973,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.703125,
"learning_rate": 8.794402583423037e-06,
"loss": -0.0205,
"num_tokens": 13764814.0,
"reward": 157.37353515625,
"reward_std": 125.2473373413086,
"rewards/Rewards/mean": 157.37355041503906,
"rewards/Rewards/std": 171.18760681152344,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9961519241333008,
"sampling/importance_sampling_ratio/min": 0.004844507202506065,
"sampling/sampling_logp_difference/max": 5.329909801483154,
"sampling/sampling_logp_difference/mean": 0.01832774095237255,
"step": 113
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2038.0,
"completions/mean_length": 1677.666748046875,
"completions/mean_terminated_length": 1624.761962890625,
"completions/min_length": 842.0,
"completions/min_terminated_length": 842.0,
"entropy": 0.08480958640575409,
"epoch": 0.12271259418729817,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7265625,
"learning_rate": 8.783638320775027e-06,
"loss": -0.0138,
"num_tokens": 13891836.0,
"reward": 152.83729553222656,
"reward_std": 89.38969421386719,
"rewards/Rewards/mean": 152.8372802734375,
"rewards/Rewards/std": 149.14773559570312,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9965521097183228,
"sampling/importance_sampling_ratio/min": 0.019641246646642685,
"sampling/sampling_logp_difference/max": 3.9301235675811768,
"sampling/sampling_logp_difference/mean": 0.017855927348136902,
"step": 114
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2708333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2032.0,
"completions/mean_length": 1767.354248046875,
"completions/mean_terminated_length": 1663.1142578125,
"completions/min_length": 1045.0,
"completions/min_terminated_length": 1045.0,
"entropy": 0.08714842796325684,
"epoch": 0.12378902045209902,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7421875,
"learning_rate": 8.772874058127019e-06,
"loss": 0.0045,
"num_tokens": 14024063.0,
"reward": 144.72988891601562,
"reward_std": 145.175048828125,
"rewards/Rewards/mean": 144.72987365722656,
"rewards/Rewards/std": 159.03140258789062,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.996498703956604,
"sampling/importance_sampling_ratio/min": 0.016073117032647133,
"sampling/sampling_logp_difference/max": 4.1306071281433105,
"sampling/sampling_logp_difference/mean": 0.018725711852312088,
"step": 115
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2291666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2044.0,
"completions/mean_length": 1498.25,
"completions/mean_terminated_length": 1334.810791015625,
"completions/min_length": 576.0,
"completions/min_terminated_length": 576.0,
"entropy": 0.08243508636951447,
"epoch": 0.12486544671689989,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.921875,
"learning_rate": 8.76210979547901e-06,
"loss": -0.0135,
"num_tokens": 14142359.0,
"reward": 68.60581970214844,
"reward_std": 65.1133041381836,
"rewards/Rewards/mean": 68.6058120727539,
"rewards/Rewards/std": 118.82546997070312,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9966591596603394,
"sampling/importance_sampling_ratio/min": 0.06388171017169952,
"sampling/sampling_logp_difference/max": 2.7507221698760986,
"sampling/sampling_logp_difference/mean": 0.018525827676057816,
"step": 116
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4791666865348816,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2029.0,
"completions/mean_length": 1731.3958740234375,
"completions/mean_terminated_length": 1440.1199951171875,
"completions/min_length": 1004.0,
"completions/min_terminated_length": 1004.0,
"entropy": 0.08880011737346649,
"epoch": 0.12594187298170076,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.75,
"learning_rate": 8.751345532831002e-06,
"loss": 0.0262,
"num_tokens": 14271606.0,
"reward": 134.94076538085938,
"reward_std": 99.54734802246094,
"rewards/Rewards/mean": 134.9407501220703,
"rewards/Rewards/std": 162.60922241210938,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9966139793395996,
"sampling/importance_sampling_ratio/min": 0.030053241178393364,
"sampling/sampling_logp_difference/max": 3.5047848224639893,
"sampling/sampling_logp_difference/mean": 0.018653348088264465,
"step": 117
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2291666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1988.0,
"completions/mean_length": 1765.791748046875,
"completions/mean_terminated_length": 1681.8919677734375,
"completions/min_length": 1199.0,
"completions/min_terminated_length": 1199.0,
"entropy": 0.08772322535514832,
"epoch": 0.12701829924650163,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.796875,
"learning_rate": 8.740581270182994e-06,
"loss": 0.0118,
"num_tokens": 14417078.0,
"reward": 165.54493713378906,
"reward_std": 86.4625244140625,
"rewards/Rewards/mean": 165.54493713378906,
"rewards/Rewards/std": 162.67767333984375,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9964956045150757,
"sampling/importance_sampling_ratio/min": 0.038425639271736145,
"sampling/sampling_logp_difference/max": 3.259030342102051,
"sampling/sampling_logp_difference/mean": 0.018552402034401894,
"step": 118
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2010.0,
"completions/mean_length": 1704.0833740234375,
"completions/mean_terminated_length": 1547.757568359375,
"completions/min_length": 743.0,
"completions/min_terminated_length": 743.0,
"entropy": 0.0828559622168541,
"epoch": 0.12809472551130247,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.796875,
"learning_rate": 8.729817007534984e-06,
"loss": 0.0237,
"num_tokens": 14543238.0,
"reward": 206.37078857421875,
"reward_std": 107.7475814819336,
"rewards/Rewards/mean": 206.3707733154297,
"rewards/Rewards/std": 168.4974822998047,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9965498447418213,
"sampling/importance_sampling_ratio/min": 0.03842545673251152,
"sampling/sampling_logp_difference/max": 3.259035110473633,
"sampling/sampling_logp_difference/mean": 0.018624860793352127,
"step": 119
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2708333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1990.0,
"completions/mean_length": 1640.2708740234375,
"completions/mean_terminated_length": 1488.82861328125,
"completions/min_length": 929.0,
"completions/min_terminated_length": 929.0,
"entropy": 0.08398817479610443,
"epoch": 0.12917115177610333,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.84375,
"learning_rate": 8.719052744886975e-06,
"loss": 0.0249,
"num_tokens": 14670391.0,
"reward": 110.42204284667969,
"reward_std": 105.4375228881836,
"rewards/Rewards/mean": 110.42205047607422,
"rewards/Rewards/std": 140.33175659179688,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9965760707855225,
"sampling/importance_sampling_ratio/min": 0.00865168683230877,
"sampling/sampling_logp_difference/max": 4.750000953674316,
"sampling/sampling_logp_difference/mean": 0.018549980595707893,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2083333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2018.0,
"completions/mean_length": 1704.5208740234375,
"completions/mean_terminated_length": 1614.131591796875,
"completions/min_length": 1057.0,
"completions/min_terminated_length": 1057.0,
"entropy": 0.0853036642074585,
"epoch": 0.1302475780409042,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8125,
"learning_rate": 8.708288482238967e-06,
"loss": 0.0115,
"num_tokens": 14794730.0,
"reward": 211.4768524169922,
"reward_std": 144.12379455566406,
"rewards/Rewards/mean": 211.4768524169922,
"rewards/Rewards/std": 166.9752197265625,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9965263605117798,
"sampling/importance_sampling_ratio/min": 0.016329152509570122,
"sampling/sampling_logp_difference/max": 4.114803314208984,
"sampling/sampling_logp_difference/mean": 0.018672306090593338,
"step": 121
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2291666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2014.0,
"completions/mean_length": 1783.541748046875,
"completions/mean_terminated_length": 1704.9189453125,
"completions/min_length": 1418.0,
"completions/min_terminated_length": 1418.0,
"entropy": 0.08703920990228653,
"epoch": 0.13132400430570507,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9609375,
"learning_rate": 8.697524219590959e-06,
"loss": 0.0198,
"num_tokens": 14931316.0,
"reward": 102.9444580078125,
"reward_std": 77.51891326904297,
"rewards/Rewards/mean": 102.9444580078125,
"rewards/Rewards/std": 117.40369415283203,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9962136149406433,
"sampling/importance_sampling_ratio/min": 0.03993840888142586,
"sampling/sampling_logp_difference/max": 3.220416784286499,
"sampling/sampling_logp_difference/mean": 0.018898990005254745,
"step": 122
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2024.0,
"completions/mean_length": 1753.291748046875,
"completions/mean_terminated_length": 1655.0555419921875,
"completions/min_length": 1161.0,
"completions/min_terminated_length": 1161.0,
"entropy": 0.08477212488651276,
"epoch": 0.13240043057050593,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8125,
"learning_rate": 8.68675995694295e-06,
"loss": 0.0348,
"num_tokens": 15060072.0,
"reward": 185.78619384765625,
"reward_std": 137.3616943359375,
"rewards/Rewards/mean": 185.7861785888672,
"rewards/Rewards/std": 168.95420837402344,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9964303970336914,
"sampling/importance_sampling_ratio/min": 6.512909749289975e-05,
"sampling/sampling_logp_difference/max": 9.639139175415039,
"sampling/sampling_logp_difference/mean": 0.01922321878373623,
"step": 123
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4791666865348816,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2028.0,
"completions/mean_length": 1849.8333740234375,
"completions/mean_terminated_length": 1667.52001953125,
"completions/min_length": 1176.0,
"completions/min_terminated_length": 1176.0,
"entropy": 0.08716564625501633,
"epoch": 0.13347685683530677,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.71875,
"learning_rate": 8.675995694294942e-06,
"loss": 0.0449,
"num_tokens": 15202438.0,
"reward": 113.38813781738281,
"reward_std": 110.96601867675781,
"rewards/Rewards/mean": 113.38814544677734,
"rewards/Rewards/std": 153.09771728515625,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9958086013793945,
"sampling/importance_sampling_ratio/min": 0.0003843386366497725,
"sampling/sampling_logp_difference/max": 7.863986492156982,
"sampling/sampling_logp_difference/mean": 0.01934371329843998,
"step": 124
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2007.0,
"completions/mean_length": 1751.979248046875,
"completions/mean_terminated_length": 1617.42431640625,
"completions/min_length": 1142.0,
"completions/min_terminated_length": 1142.0,
"entropy": 0.08353154361248016,
"epoch": 0.13455328310010764,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.109375,
"learning_rate": 8.665231431646934e-06,
"loss": 0.0237,
"num_tokens": 15342189.0,
"reward": 117.08355712890625,
"reward_std": 117.69966888427734,
"rewards/Rewards/mean": 117.08355712890625,
"rewards/Rewards/std": 142.62982177734375,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9963948130607605,
"sampling/importance_sampling_ratio/min": 0.012470904737710953,
"sampling/sampling_logp_difference/max": 4.38435697555542,
"sampling/sampling_logp_difference/mean": 0.01883932203054428,
"step": 125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1041666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2043.0,
"completions/mean_length": 1634.6875,
"completions/mean_terminated_length": 1586.6279296875,
"completions/min_length": 1052.0,
"completions/min_terminated_length": 1052.0,
"entropy": 0.08093823492527008,
"epoch": 0.1356297093649085,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.875,
"learning_rate": 8.654467168998924e-06,
"loss": 0.0178,
"num_tokens": 15462714.0,
"reward": 167.71546936035156,
"reward_std": 93.82933044433594,
"rewards/Rewards/mean": 167.71546936035156,
"rewards/Rewards/std": 150.2532501220703,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9966925382614136,
"sampling/importance_sampling_ratio/min": 0.010767911560833454,
"sampling/sampling_logp_difference/max": 4.531184673309326,
"sampling/sampling_logp_difference/mean": 0.018742073327302933,
"step": 126
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1666666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2019.0,
"completions/mean_length": 1666.604248046875,
"completions/mean_terminated_length": 1590.3250732421875,
"completions/min_length": 1007.0,
"completions/min_terminated_length": 1007.0,
"entropy": 0.08408204466104507,
"epoch": 0.13670613562970937,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.84375,
"learning_rate": 8.643702906350916e-06,
"loss": 0.0025,
"num_tokens": 15589055.0,
"reward": 123.41200256347656,
"reward_std": 110.46016693115234,
"rewards/Rewards/mean": 123.4119873046875,
"rewards/Rewards/std": 133.6590576171875,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.996285080909729,
"sampling/importance_sampling_ratio/min": 0.0035241865552961826,
"sampling/sampling_logp_difference/max": 5.648105621337891,
"sampling/sampling_logp_difference/mean": 0.019644845277071,
"step": 127
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2291666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1874.0,
"completions/mean_length": 1431.5,
"completions/mean_terminated_length": 1248.2161865234375,
"completions/min_length": 842.0,
"completions/min_terminated_length": 842.0,
"entropy": 0.07925313711166382,
"epoch": 0.1377825618945102,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.03125,
"learning_rate": 8.632938643702907e-06,
"loss": 0.0074,
"num_tokens": 15705359.0,
"reward": 74.46238708496094,
"reward_std": 54.4752197265625,
"rewards/Rewards/mean": 74.4623794555664,
"rewards/Rewards/std": 118.78749084472656,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9974421262741089,
"sampling/importance_sampling_ratio/min": 0.009600045159459114,
"sampling/sampling_logp_difference/max": 4.645987510681152,
"sampling/sampling_logp_difference/mean": 0.018069487065076828,
"step": 128
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2083333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2023.0,
"completions/mean_length": 1754.041748046875,
"completions/mean_terminated_length": 1676.6842041015625,
"completions/min_length": 1185.0,
"completions/min_terminated_length": 1185.0,
"entropy": 0.08643808960914612,
"epoch": 0.13885898815931108,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8125,
"learning_rate": 8.622174381054899e-06,
"loss": 0.0347,
"num_tokens": 15837865.0,
"reward": 207.1958465576172,
"reward_std": 120.06201171875,
"rewards/Rewards/mean": 207.19581604003906,
"rewards/Rewards/std": 165.83035278320312,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.996021568775177,
"sampling/importance_sampling_ratio/min": 0.016845213249325752,
"sampling/sampling_logp_difference/max": 4.083688735961914,
"sampling/sampling_logp_difference/mean": 0.019307805225253105,
"step": 129
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2083333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2020.0,
"completions/mean_length": 1653.5833740234375,
"completions/mean_terminated_length": 1549.7894287109375,
"completions/min_length": 1166.0,
"completions/min_terminated_length": 1166.0,
"entropy": 0.08368074148893356,
"epoch": 0.13993541442411195,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.84375,
"learning_rate": 8.611410118406889e-06,
"loss": 0.0144,
"num_tokens": 15970955.0,
"reward": 145.67257690429688,
"reward_std": 118.706787109375,
"rewards/Rewards/mean": 145.6725616455078,
"rewards/Rewards/std": 148.27821350097656,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9966948628425598,
"sampling/importance_sampling_ratio/min": 0.028941864147782326,
"sampling/sampling_logp_difference/max": 3.542466163635254,
"sampling/sampling_logp_difference/mean": 0.019158201292157173,
"step": 130
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2011.0,
"completions/mean_length": 1522.604248046875,
"completions/mean_terminated_length": 1447.547607421875,
"completions/min_length": 705.0,
"completions/min_terminated_length": 705.0,
"entropy": 0.08337804675102234,
"epoch": 0.14101184068891282,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.96875,
"learning_rate": 8.60064585575888e-06,
"loss": -0.0715,
"num_tokens": 16102654.0,
"reward": 183.8970947265625,
"reward_std": 76.86428833007812,
"rewards/Rewards/mean": 183.8970947265625,
"rewards/Rewards/std": 149.77781677246094,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9960099458694458,
"sampling/importance_sampling_ratio/min": 0.0454302616417408,
"sampling/sampling_logp_difference/max": 3.0915768146514893,
"sampling/sampling_logp_difference/mean": 0.01908412016928196,
"step": 131
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2000.0,
"completions/mean_length": 1695.1875,
"completions/mean_terminated_length": 1613.769287109375,
"completions/min_length": 1024.0,
"completions/min_terminated_length": 1024.0,
"entropy": 0.08442177623510361,
"epoch": 0.14208826695371368,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.53125,
"learning_rate": 8.589881593110873e-06,
"loss": 0.0203,
"num_tokens": 16228399.0,
"reward": 132.07305908203125,
"reward_std": 133.1316680908203,
"rewards/Rewards/mean": 132.07305908203125,
"rewards/Rewards/std": 145.59878540039062,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9966284036636353,
"sampling/importance_sampling_ratio/min": 0.025022760033607483,
"sampling/sampling_logp_difference/max": 3.687969446182251,
"sampling/sampling_logp_difference/mean": 0.01852625235915184,
"step": 132
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1458333432674408,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2005.0,
"completions/mean_length": 1667.0208740234375,
"completions/mean_terminated_length": 1601.9755859375,
"completions/min_length": 965.0,
"completions/min_terminated_length": 965.0,
"entropy": 0.07948024570941925,
"epoch": 0.14316469321851452,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.0,
"learning_rate": 8.579117330462864e-06,
"loss": -0.0127,
"num_tokens": 16360934.0,
"reward": 179.4247283935547,
"reward_std": 117.30535888671875,
"rewards/Rewards/mean": 179.4247283935547,
"rewards/Rewards/std": 155.9678192138672,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9964487552642822,
"sampling/importance_sampling_ratio/min": 0.0016366546042263508,
"sampling/sampling_logp_difference/max": 6.415101051330566,
"sampling/sampling_logp_difference/mean": 0.01843268796801567,
"step": 133
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1041666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2005.0,
"completions/mean_length": 1544.5833740234375,
"completions/mean_terminated_length": 1486.0465087890625,
"completions/min_length": 883.0,
"completions/min_terminated_length": 883.0,
"entropy": 0.08017030358314514,
"epoch": 0.1442411194833154,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8671875,
"learning_rate": 8.568353067814856e-06,
"loss": -0.0251,
"num_tokens": 16483980.0,
"reward": 170.14761352539062,
"reward_std": 108.76873779296875,
"rewards/Rewards/mean": 170.14759826660156,
"rewards/Rewards/std": 149.59971618652344,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9965035319328308,
"sampling/importance_sampling_ratio/min": 0.06854788213968277,
"sampling/sampling_logp_difference/max": 2.680222749710083,
"sampling/sampling_logp_difference/mean": 0.018077414482831955,
"step": 134
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1880.0,
"completions/mean_length": 1547.979248046875,
"completions/mean_terminated_length": 1432.5897216796875,
"completions/min_length": 712.0,
"completions/min_terminated_length": 712.0,
"entropy": 0.08219125866889954,
"epoch": 0.14531754574811626,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.046875,
"learning_rate": 8.557588805166846e-06,
"loss": 0.0168,
"num_tokens": 16601699.0,
"reward": 226.5089111328125,
"reward_std": 113.33275604248047,
"rewards/Rewards/mean": 226.5089111328125,
"rewards/Rewards/std": 153.14732360839844,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9965484142303467,
"sampling/importance_sampling_ratio/min": 0.0002600400475785136,
"sampling/sampling_logp_difference/max": 8.254674911499023,
"sampling/sampling_logp_difference/mean": 0.019097905606031418,
"step": 135
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0833333358168602,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2021.0,
"completions/mean_length": 1570.8958740234375,
"completions/mean_terminated_length": 1527.5228271484375,
"completions/min_length": 839.0,
"completions/min_terminated_length": 839.0,
"entropy": 0.08367850631475449,
"epoch": 0.14639397201291712,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.0,
"learning_rate": 8.546824542518838e-06,
"loss": -0.0181,
"num_tokens": 16727862.0,
"reward": 169.14260864257812,
"reward_std": 109.80867767333984,
"rewards/Rewards/mean": 169.14259338378906,
"rewards/Rewards/std": 164.3988800048828,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.996330976486206,
"sampling/importance_sampling_ratio/min": 0.02022353932261467,
"sampling/sampling_logp_difference/max": 3.9009079933166504,
"sampling/sampling_logp_difference/mean": 0.01995905488729477,
"step": 136
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0833333358168602,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1986.0,
"completions/mean_length": 1430.875,
"completions/mean_terminated_length": 1374.7728271484375,
"completions/min_length": 825.0,
"completions/min_terminated_length": 825.0,
"entropy": 0.07976692914962769,
"epoch": 0.147470398277718,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.078125,
"learning_rate": 8.53606027987083e-06,
"loss": -0.0286,
"num_tokens": 16840806.0,
"reward": 150.56967163085938,
"reward_std": 123.42110443115234,
"rewards/Rewards/mean": 150.5696563720703,
"rewards/Rewards/std": 152.9371337890625,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9966201782226562,
"sampling/importance_sampling_ratio/min": 0.007347543723881245,
"sampling/sampling_logp_difference/max": 4.913389205932617,
"sampling/sampling_logp_difference/mean": 0.018820632249116898,
"step": 137
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0416666679084301,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2018.0,
"completions/mean_length": 1563.604248046875,
"completions/mean_terminated_length": 1542.54345703125,
"completions/min_length": 681.0,
"completions/min_terminated_length": 681.0,
"entropy": 0.08118607103824615,
"epoch": 0.14854682454251883,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9453125,
"learning_rate": 8.525296017222821e-06,
"loss": -0.0781,
"num_tokens": 16969061.0,
"reward": 178.30577087402344,
"reward_std": 63.06797790527344,
"rewards/Rewards/mean": 178.3057403564453,
"rewards/Rewards/std": 156.191650390625,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9966996908187866,
"sampling/importance_sampling_ratio/min": 0.012999716214835644,
"sampling/sampling_logp_difference/max": 4.342827796936035,
"sampling/sampling_logp_difference/mean": 0.01800641044974327,
"step": 138
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02083333395421505,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2006.0,
"completions/mean_length": 1388.0208740234375,
"completions/mean_terminated_length": 1373.9786376953125,
"completions/min_length": 620.0,
"completions/min_terminated_length": 620.0,
"entropy": 0.07906337827444077,
"epoch": 0.1496232508073197,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.21875,
"learning_rate": 8.514531754574811e-06,
"loss": -0.0267,
"num_tokens": 17080032.0,
"reward": 160.73464965820312,
"reward_std": 93.3261947631836,
"rewards/Rewards/mean": 160.73463439941406,
"rewards/Rewards/std": 167.1549072265625,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9969953894615173,
"sampling/importance_sampling_ratio/min": 0.07584778964519501,
"sampling/sampling_logp_difference/max": 2.579026699066162,
"sampling/sampling_logp_difference/mean": 0.019315559417009354,
"step": 139
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0416666679084301,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2046.0,
"completions/mean_length": 1438.5,
"completions/mean_terminated_length": 1412.0,
"completions/min_length": 912.0,
"completions/min_terminated_length": 912.0,
"entropy": 0.08420948684215546,
"epoch": 0.15069967707212056,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.1875,
"learning_rate": 8.503767491926803e-06,
"loss": -0.056,
"num_tokens": 17201976.0,
"reward": 130.47802734375,
"reward_std": 110.41607666015625,
"rewards/Rewards/mean": 130.47802734375,
"rewards/Rewards/std": 145.10369873046875,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9967747926712036,
"sampling/importance_sampling_ratio/min": 0.03695596382021904,
"sampling/sampling_logp_difference/max": 3.2980282306671143,
"sampling/sampling_logp_difference/mean": 0.019422899931669235,
"step": 140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2030.0,
"completions/mean_length": 1597.625,
"completions/mean_terminated_length": 1493.6923828125,
"completions/min_length": 874.0,
"completions/min_terminated_length": 874.0,
"entropy": 0.0827246755361557,
"epoch": 0.15177610333692143,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.1875,
"learning_rate": 8.493003229278796e-06,
"loss": 0.0049,
"num_tokens": 17332008.0,
"reward": 83.94515228271484,
"reward_std": 65.63279724121094,
"rewards/Rewards/mean": 83.94515228271484,
"rewards/Rewards/std": 119.19114685058594,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9962602853775024,
"sampling/importance_sampling_ratio/min": 0.046488065272569656,
"sampling/sampling_logp_difference/max": 3.0685596466064453,
"sampling/sampling_logp_difference/mean": 0.019384002313017845,
"step": 141
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0416666679084301,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2022.0,
"completions/mean_length": 1426.1458740234375,
"completions/mean_terminated_length": 1399.1087646484375,
"completions/min_length": 822.0,
"completions/min_terminated_length": 822.0,
"entropy": 0.07838290929794312,
"epoch": 0.15285252960172227,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.921875,
"learning_rate": 8.482238966630786e-06,
"loss": -0.0712,
"num_tokens": 17440903.0,
"reward": 219.865234375,
"reward_std": 75.80467224121094,
"rewards/Rewards/mean": 219.865234375,
"rewards/Rewards/std": 159.51031494140625,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9964785575866699,
"sampling/importance_sampling_ratio/min": 0.06103203073143959,
"sampling/sampling_logp_difference/max": 2.796356439590454,
"sampling/sampling_logp_difference/mean": 0.01858234778046608,
"step": 142
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0416666679084301,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2004.0,
"completions/mean_length": 1544.0,
"completions/mean_terminated_length": 1522.0870361328125,
"completions/min_length": 959.0,
"completions/min_terminated_length": 959.0,
"entropy": 0.08301036059856415,
"epoch": 0.15392895586652314,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.90625,
"learning_rate": 8.471474703982778e-06,
"loss": -0.0748,
"num_tokens": 17561167.0,
"reward": 180.9502410888672,
"reward_std": 53.65088653564453,
"rewards/Rewards/mean": 180.9502410888672,
"rewards/Rewards/std": 162.4229278564453,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9965589046478271,
"sampling/importance_sampling_ratio/min": 0.056365251541137695,
"sampling/sampling_logp_difference/max": 2.8759024143218994,
"sampling/sampling_logp_difference/mean": 0.01916421577334404,
"step": 143
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0416666679084301,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2012.0,
"completions/mean_length": 1522.1458740234375,
"completions/mean_terminated_length": 1499.2825927734375,
"completions/min_length": 939.0,
"completions/min_terminated_length": 939.0,
"entropy": 0.08124718070030212,
"epoch": 0.155005382131324,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9296875,
"learning_rate": 8.46071044133477e-06,
"loss": -0.0565,
"num_tokens": 17683118.0,
"reward": 225.0555419921875,
"reward_std": 92.28500366210938,
"rewards/Rewards/mean": 225.0555419921875,
"rewards/Rewards/std": 149.1592254638672,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9963716268539429,
"sampling/importance_sampling_ratio/min": 0.05687575414776802,
"sampling/sampling_logp_difference/max": 2.8668861389160156,
"sampling/sampling_logp_difference/mean": 0.01863805204629898,
"step": 144
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0416666679084301,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1927.0,
"completions/mean_length": 1432.666748046875,
"completions/mean_terminated_length": 1405.9130859375,
"completions/min_length": 718.0,
"completions/min_terminated_length": 718.0,
"entropy": 0.08093176782131195,
"epoch": 0.15608180839612487,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.078125,
"learning_rate": 8.449946178686761e-06,
"loss": -0.0745,
"num_tokens": 17797018.0,
"reward": 135.8857421875,
"reward_std": 84.26956176757812,
"rewards/Rewards/mean": 135.8857421875,
"rewards/Rewards/std": 152.7830047607422,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9965113401412964,
"sampling/importance_sampling_ratio/min": 0.03339711204171181,
"sampling/sampling_logp_difference/max": 3.3992857933044434,
"sampling/sampling_logp_difference/mean": 0.01943063922226429,
"step": 145
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1041666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2035.0,
"completions/mean_length": 1558.2083740234375,
"completions/mean_terminated_length": 1501.255859375,
"completions/min_length": 1090.0,
"completions/min_terminated_length": 1090.0,
"entropy": 0.08431422710418701,
"epoch": 0.15715823466092574,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.015625,
"learning_rate": 8.439181916038753e-06,
"loss": -0.0368,
"num_tokens": 17917976.0,
"reward": 141.34796142578125,
"reward_std": 111.29246520996094,
"rewards/Rewards/mean": 141.34796142578125,
"rewards/Rewards/std": 151.082763671875,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9961467981338501,
"sampling/importance_sampling_ratio/min": 0.05715278163552284,
"sampling/sampling_logp_difference/max": 2.862027168273926,
"sampling/sampling_logp_difference/mean": 0.01945657841861248,
"step": 146
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1907.0,
"completions/mean_length": 1446.4583740234375,
"completions/mean_terminated_length": 1406.3555908203125,
"completions/min_length": 866.0,
"completions/min_terminated_length": 866.0,
"entropy": 0.0837797150015831,
"epoch": 0.15823466092572658,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.125,
"learning_rate": 8.428417653390743e-06,
"loss": 0.0078,
"num_tokens": 18036030.0,
"reward": 248.0830078125,
"reward_std": 113.72354125976562,
"rewards/Rewards/mean": 248.0830078125,
"rewards/Rewards/std": 146.14605712890625,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9962608814239502,
"sampling/importance_sampling_ratio/min": 0.05039582774043083,
"sampling/sampling_logp_difference/max": 2.987846851348877,
"sampling/sampling_logp_difference/mean": 0.01956545002758503,
"step": 147
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1041666716337204,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2046.0,
"completions/mean_length": 1549.5833740234375,
"completions/mean_terminated_length": 1491.6279296875,
"completions/min_length": 635.0,
"completions/min_terminated_length": 635.0,
"entropy": 0.08182680606842041,
"epoch": 0.15931108719052745,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9140625,
"learning_rate": 8.417653390742735e-06,
"loss": -0.0206,
"num_tokens": 18167038.0,
"reward": 108.25777435302734,
"reward_std": 85.30794525146484,
"rewards/Rewards/mean": 108.25775909423828,
"rewards/Rewards/std": 126.70792388916016,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9965537786483765,
"sampling/importance_sampling_ratio/min": 0.03747351095080376,
"sampling/sampling_logp_difference/max": 3.284121036529541,
"sampling/sampling_logp_difference/mean": 0.01908835396170616,
"step": 148
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.02083333395421505,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1947.0,
"completions/mean_length": 1398.854248046875,
"completions/mean_terminated_length": 1385.04248046875,
"completions/min_length": 802.0,
"completions/min_terminated_length": 802.0,
"entropy": 0.07840481400489807,
"epoch": 0.1603875134553283,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.328125,
"learning_rate": 8.406889128094727e-06,
"loss": -0.0369,
"num_tokens": 18287115.0,
"reward": 220.73410034179688,
"reward_std": 43.89515686035156,
"rewards/Rewards/mean": 220.7340850830078,
"rewards/Rewards/std": 141.5032958984375,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9964281320571899,
"sampling/importance_sampling_ratio/min": 0.03872040659189224,
"sampling/sampling_logp_difference/max": 3.2513885498046875,
"sampling/sampling_logp_difference/mean": 0.0185480285435915,
"step": 149
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2039.0,
"completions/mean_length": 1486.75,
"completions/mean_terminated_length": 1449.3333740234375,
"completions/min_length": 836.0,
"completions/min_terminated_length": 836.0,
"entropy": 0.07938205450773239,
"epoch": 0.16146393972012918,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.21875,
"learning_rate": 8.396124865446718e-06,
"loss": -0.0306,
"num_tokens": 18403377.0,
"reward": 237.360595703125,
"reward_std": 82.56875610351562,
"rewards/Rewards/mean": 237.360595703125,
"rewards/Rewards/std": 157.23016357421875,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 0.9965106844902039,
"sampling/importance_sampling_ratio/min": 0.03296330198645592,
"sampling/sampling_logp_difference/max": 3.412360429763794,
"sampling/sampling_logp_difference/mean": 0.019054463133215904,
"step": 150
}
],
"logging_steps": 1,
"max_steps": 929,
"num_input_tokens_seen": 18403377,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}