train_A_gin_rummy / trainer_state.json
Gege24's picture
Upload task output 1
0330bb7 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.00936,
"eval_steps": 500,
"global_step": 468,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1785.0,
"completions/max_terminated_length": 1785.0,
"completions/mean_length": 1667.71875,
"completions/mean_terminated_length": 1667.71875,
"completions/min_length": 1335.0,
"completions/min_terminated_length": 1335.0,
"entropy": 0.47570936381816864,
"epoch": 2e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7817060947418213,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.0175,
"num_tokens": 73979.0,
"reward": -8.643012046813965,
"reward_std": 11.93884563446045,
"rewards/rollout_reward_func/mean": -8.643012046813965,
"rewards/rollout_reward_func/std": 13.176301956176758,
"sampling/importance_sampling_ratio/max": 1.8267614841461182,
"sampling/importance_sampling_ratio/mean": 1.0556937456130981,
"sampling/importance_sampling_ratio/min": 0.6958155035972595,
"sampling/sampling_logp_difference/max": 0.4538118839263916,
"sampling/sampling_logp_difference/mean": 0.018563803285360336,
"step": 1,
"step_time": 36.57323472299959
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.47570936381816864,
"epoch": 4e-05,
"grad_norm": 1.778490424156189,
"kl": 0.0,
"learning_rate": 2.8571428571428575e-07,
"loss": 0.0175,
"step": 2,
"step_time": 5.7524538550001125
},
{
"clip_ratio/high_max": 0.007694128900766373,
"clip_ratio/high_mean": 0.0038470644503831863,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0038470644503831863,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1804.0,
"completions/max_terminated_length": 1804.0,
"completions/mean_length": 1539.0625,
"completions/mean_terminated_length": 1539.0625,
"completions/min_length": 1102.0,
"completions/min_terminated_length": 1102.0,
"entropy": 0.48639967665076256,
"epoch": 6e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.348313331604004,
"kl": 0.0009020493234856986,
"learning_rate": 5.714285714285715e-07,
"loss": -0.0061,
"num_tokens": 143940.0,
"reward": -12.985191345214844,
"reward_std": 9.043848037719727,
"rewards/rollout_reward_func/mean": -12.985191345214844,
"rewards/rollout_reward_func/std": 13.16507339477539,
"sampling/importance_sampling_ratio/max": 1.4940505027770996,
"sampling/importance_sampling_ratio/mean": 0.9786970019340515,
"sampling/importance_sampling_ratio/min": 0.578092634677887,
"sampling/sampling_logp_difference/max": 0.47244715690612793,
"sampling/sampling_logp_difference/mean": 0.020807698369026184,
"step": 3,
"step_time": 33.27629456400018
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"entropy": 0.4873173348605633,
"epoch": 8e-05,
"grad_norm": 2.1872665882110596,
"kl": 0.0008225523779401556,
"learning_rate": 8.571428571428572e-07,
"loss": -0.0047,
"step": 4,
"step_time": 5.767302959000062
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1797.0,
"completions/max_terminated_length": 1797.0,
"completions/mean_length": 1677.5,
"completions/mean_terminated_length": 1677.5,
"completions/min_length": 1286.0,
"completions/min_terminated_length": 1286.0,
"entropy": 0.5088205523788929,
"epoch": 0.0001,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.25364089012146,
"kl": 0.0010678053367882967,
"learning_rate": 1.142857142857143e-06,
"loss": 0.0065,
"num_tokens": 218324.0,
"reward": -10.823358535766602,
"reward_std": 13.736562728881836,
"rewards/rollout_reward_func/mean": -10.823358535766602,
"rewards/rollout_reward_func/std": 15.484944343566895,
"sampling/importance_sampling_ratio/max": 1.4805132150650024,
"sampling/importance_sampling_ratio/mean": 1.0641556978225708,
"sampling/importance_sampling_ratio/min": 0.568811297416687,
"sampling/sampling_logp_difference/max": 0.2526984214782715,
"sampling/sampling_logp_difference/mean": 0.023618247359991074,
"step": 5,
"step_time": 36.10746106100078
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.5093838162720203,
"epoch": 0.00012,
"grad_norm": 2.288219928741455,
"kl": 0.001079607754945755,
"learning_rate": 1.4285714285714286e-06,
"loss": 0.0054,
"step": 6,
"step_time": 5.795254830000886
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.003738839295692742,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003738839295692742,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1815.0,
"completions/max_terminated_length": 1815.0,
"completions/mean_length": 1591.78125,
"completions/mean_terminated_length": 1591.78125,
"completions/min_length": 1105.0,
"completions/min_terminated_length": 1105.0,
"entropy": 0.46493203938007355,
"epoch": 0.00014,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.118450164794922,
"kl": 0.0011459436791483313,
"learning_rate": 1.7142857142857145e-06,
"loss": 0.0396,
"num_tokens": 289395.0,
"reward": -10.49427318572998,
"reward_std": 11.239557266235352,
"rewards/rollout_reward_func/mean": -10.49427318572998,
"rewards/rollout_reward_func/std": 15.952676773071289,
"sampling/importance_sampling_ratio/max": 1.448197603225708,
"sampling/importance_sampling_ratio/mean": 1.0151325464248657,
"sampling/importance_sampling_ratio/min": 7.746867383695566e-12,
"sampling/sampling_logp_difference/max": 24.934412002563477,
"sampling/sampling_logp_difference/mean": 0.07075877487659454,
"step": 7,
"step_time": 33.41762578200087
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.46468354761600494,
"epoch": 0.00016,
"grad_norm": 2.175487995147705,
"kl": 0.0008941343548940495,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0395,
"step": 8,
"step_time": 5.792184412999632
},
{
"clip_ratio/high_max": 0.017968750093132257,
"clip_ratio/high_mean": 0.008984375046566129,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008984375046566129,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1826.0,
"completions/max_terminated_length": 1826.0,
"completions/mean_length": 1506.34375,
"completions/mean_terminated_length": 1506.34375,
"completions/min_length": 281.0,
"completions/min_terminated_length": 281.0,
"entropy": 0.49622904509305954,
"epoch": 0.00018,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7127985954284668,
"kl": 0.0012855095374106895,
"learning_rate": 2.285714285714286e-06,
"loss": 0.0714,
"num_tokens": 358179.0,
"reward": -11.01856803894043,
"reward_std": 10.841419219970703,
"rewards/rollout_reward_func/mean": -11.01856803894043,
"rewards/rollout_reward_func/std": 16.40532112121582,
"sampling/importance_sampling_ratio/max": 1.4218463897705078,
"sampling/importance_sampling_ratio/mean": 0.9850149750709534,
"sampling/importance_sampling_ratio/min": 0.4642792344093323,
"sampling/sampling_logp_difference/max": 0.4877281188964844,
"sampling/sampling_logp_difference/mean": 0.023507963865995407,
"step": 9,
"step_time": 32.45201305700084
},
{
"clip_ratio/high_max": 0.021875000093132257,
"clip_ratio/high_mean": 0.010937500046566129,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012890625046566129,
"entropy": 0.494538277387619,
"epoch": 0.0002,
"grad_norm": 1.6264572143554688,
"kl": 0.0011894339404534549,
"learning_rate": 2.571428571428571e-06,
"loss": 0.0722,
"step": 10,
"step_time": 6.248037073999512
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0018382353009656072,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0018382353009656072,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1761.0,
"completions/max_terminated_length": 1761.0,
"completions/mean_length": 1599.1875,
"completions/mean_terminated_length": 1599.1875,
"completions/min_length": 1161.0,
"completions/min_terminated_length": 1161.0,
"entropy": 0.4714800976216793,
"epoch": 0.00022,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9238476753234863,
"kl": 0.0010464704500918742,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.0011,
"num_tokens": 429182.0,
"reward": -7.59177303314209,
"reward_std": 6.999079704284668,
"rewards/rollout_reward_func/mean": -7.59177303314209,
"rewards/rollout_reward_func/std": 11.693867683410645,
"sampling/importance_sampling_ratio/max": 1.6081898212432861,
"sampling/importance_sampling_ratio/mean": 1.0232391357421875,
"sampling/importance_sampling_ratio/min": 0.7353501915931702,
"sampling/sampling_logp_difference/max": 0.3081374168395996,
"sampling/sampling_logp_difference/mean": 0.018512647598981857,
"step": 11,
"step_time": 36.059418668000035
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"entropy": 0.4721835069358349,
"epoch": 0.00024,
"grad_norm": 1.821286916732788,
"kl": 0.0016941909561865032,
"learning_rate": 3.142857142857143e-06,
"loss": -0.0004,
"step": 12,
"step_time": 5.634688684001048
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1825.0,
"completions/max_terminated_length": 1825.0,
"completions/mean_length": 1668.0625,
"completions/mean_terminated_length": 1668.0625,
"completions/min_length": 1357.0,
"completions/min_terminated_length": 1357.0,
"entropy": 0.44804077222943306,
"epoch": 0.00026,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4859932661056519,
"kl": 0.0011393425811547786,
"learning_rate": 3.428571428571429e-06,
"loss": -0.053,
"num_tokens": 503173.0,
"reward": -3.6963202953338623,
"reward_std": 10.114439964294434,
"rewards/rollout_reward_func/mean": -3.6963202953338623,
"rewards/rollout_reward_func/std": 14.7977876663208,
"sampling/importance_sampling_ratio/max": 1.287428855895996,
"sampling/importance_sampling_ratio/mean": 0.9384276866912842,
"sampling/importance_sampling_ratio/min": 1.816465272468093e-13,
"sampling/sampling_logp_difference/max": 28.412260055541992,
"sampling/sampling_logp_difference/mean": 0.07413282990455627,
"step": 13,
"step_time": 34.34775389900187
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"entropy": 0.4483692906796932,
"epoch": 0.00028,
"grad_norm": 1.5191307067871094,
"kl": 0.0010726663895184174,
"learning_rate": 3.7142857142857146e-06,
"loss": -0.0537,
"step": 14,
"step_time": 5.816520962000141
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1825.0,
"completions/max_terminated_length": 1825.0,
"completions/mean_length": 1648.125,
"completions/mean_terminated_length": 1646.51611328125,
"completions/min_length": 1169.0,
"completions/min_terminated_length": 1169.0,
"entropy": 0.46873993426561356,
"epoch": 0.0003,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.0255329608917236,
"kl": 0.0014358056214405224,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0246,
"num_tokens": 576514.0,
"reward": -10.058404922485352,
"reward_std": 10.943680763244629,
"rewards/rollout_reward_func/mean": -10.058404922485352,
"rewards/rollout_reward_func/std": 13.969420433044434,
"sampling/importance_sampling_ratio/max": 1.829142689704895,
"sampling/importance_sampling_ratio/mean": 0.9690600633621216,
"sampling/importance_sampling_ratio/min": 0.6791275143623352,
"sampling/sampling_logp_difference/max": 0.32487010955810547,
"sampling/sampling_logp_difference/mean": 0.019666891545057297,
"step": 15,
"step_time": 34.85173578099875
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"entropy": 0.4683184288442135,
"epoch": 0.00032,
"grad_norm": 1.9144654273986816,
"kl": 0.0020070531027158722,
"learning_rate": 4.2857142857142855e-06,
"loss": -0.0245,
"step": 16,
"step_time": 6.776438935000442
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1802.0,
"completions/max_terminated_length": 1802.0,
"completions/mean_length": 1555.25,
"completions/mean_terminated_length": 1555.25,
"completions/min_length": 780.0,
"completions/min_terminated_length": 780.0,
"entropy": 0.45006250962615013,
"epoch": 0.00034,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7223249673843384,
"kl": 0.0019661694386741146,
"learning_rate": 4.571428571428572e-06,
"loss": -0.0264,
"num_tokens": 647530.0,
"reward": -6.622595310211182,
"reward_std": 12.143867492675781,
"rewards/rollout_reward_func/mean": -6.622595310211182,
"rewards/rollout_reward_func/std": 26.77586555480957,
"sampling/importance_sampling_ratio/max": 1.2898485660552979,
"sampling/importance_sampling_ratio/mean": 1.0187060832977295,
"sampling/importance_sampling_ratio/min": 0.7567406892776489,
"sampling/sampling_logp_difference/max": 0.2456502914428711,
"sampling/sampling_logp_difference/mean": 0.017298312857747078,
"step": 17,
"step_time": 31.772850325000036
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"entropy": 0.4529193826019764,
"epoch": 0.00036,
"grad_norm": 1.9187003374099731,
"kl": 0.002475597968441434,
"learning_rate": 4.857142857142858e-06,
"loss": -0.0276,
"step": 18,
"step_time": 5.816502887000752
},
{
"clip_ratio/high_max": 0.00728462846018374,
"clip_ratio/high_mean": 0.00364231423009187,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00364231423009187,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1794.0,
"completions/max_terminated_length": 1794.0,
"completions/mean_length": 1580.625,
"completions/mean_terminated_length": 1580.625,
"completions/min_length": 1142.0,
"completions/min_terminated_length": 1142.0,
"entropy": 0.5051322989165783,
"epoch": 0.00038,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7061576843261719,
"kl": 0.0045548786874860525,
"learning_rate": 5.142857142857142e-06,
"loss": -0.0058,
"num_tokens": 718703.0,
"reward": -9.462235450744629,
"reward_std": 12.071723937988281,
"rewards/rollout_reward_func/mean": -9.462235450744629,
"rewards/rollout_reward_func/std": 19.616609573364258,
"sampling/importance_sampling_ratio/max": 1.4264099597930908,
"sampling/importance_sampling_ratio/mean": 0.9698678255081177,
"sampling/importance_sampling_ratio/min": 0.5692899823188782,
"sampling/sampling_logp_difference/max": 0.41555118560791016,
"sampling/sampling_logp_difference/mean": 0.025009114295244217,
"step": 19,
"step_time": 33.3764044440004
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0016891892300918698,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00364231423009187,
"entropy": 0.5052920691668987,
"epoch": 0.0004,
"grad_norm": 1.6149944067001343,
"kl": 0.00593935526558198,
"learning_rate": 5.428571428571429e-06,
"loss": -0.009,
"step": 20,
"step_time": 5.7515413030014315
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.007694128900766373,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011600378900766373,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1770.0,
"completions/max_terminated_length": 1770.0,
"completions/mean_length": 1632.125,
"completions/mean_terminated_length": 1655.54833984375,
"completions/min_length": 906.0,
"completions/min_terminated_length": 1245.0,
"entropy": 0.4712696149945259,
"epoch": 0.00042,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.766438364982605,
"kl": 0.007341491058468819,
"learning_rate": 5.7142857142857145e-06,
"loss": -0.0201,
"num_tokens": 791658.0,
"reward": -9.433378219604492,
"reward_std": 12.105504989624023,
"rewards/rollout_reward_func/mean": -9.433378219604492,
"rewards/rollout_reward_func/std": 15.258283615112305,
"sampling/importance_sampling_ratio/max": 1.574469804763794,
"sampling/importance_sampling_ratio/mean": 0.9757359027862549,
"sampling/importance_sampling_ratio/min": 0.6087821125984192,
"sampling/sampling_logp_difference/max": 0.3755350112915039,
"sampling/sampling_logp_difference/mean": 0.028891967609524727,
"step": 21,
"step_time": 33.09052270600023
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.46676792576909065,
"epoch": 0.00044,
"grad_norm": 1.957079529762268,
"kl": 0.01086019104695879,
"learning_rate": 6e-06,
"loss": -0.0213,
"step": 22,
"step_time": 6.599295061999328
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.005800189450383186,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005800189450383186,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1817.0,
"completions/max_terminated_length": 1817.0,
"completions/mean_length": 1548.6875,
"completions/mean_terminated_length": 1548.6875,
"completions/min_length": 616.0,
"completions/min_terminated_length": 616.0,
"entropy": 0.449709665030241,
"epoch": 0.00046,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5633138418197632,
"kl": 0.018061322276480496,
"learning_rate": 6.285714285714286e-06,
"loss": -0.0678,
"num_tokens": 861907.0,
"reward": -14.31330394744873,
"reward_std": 9.498095512390137,
"rewards/rollout_reward_func/mean": -14.31330394744873,
"rewards/rollout_reward_func/std": 13.741255760192871,
"sampling/importance_sampling_ratio/max": 1.6751961708068848,
"sampling/importance_sampling_ratio/mean": 0.8833787441253662,
"sampling/importance_sampling_ratio/min": 0.37046000361442566,
"sampling/sampling_logp_difference/max": 0.5361905097961426,
"sampling/sampling_logp_difference/mean": 0.039727941155433655,
"step": 23,
"step_time": 33.344133182000405
},
{
"clip_ratio/high_max": 0.019294507801532745,
"clip_ratio/high_mean": 0.009647253900766373,
"clip_ratio/low_mean": 0.013395675574429333,
"clip_ratio/low_min": 0.0037878789007663727,
"clip_ratio/region_mean": 0.023042929475195706,
"entropy": 0.4419500008225441,
"epoch": 0.00048,
"grad_norm": 1.234191656112671,
"kl": 0.03053808701224625,
"learning_rate": 6.571428571428572e-06,
"loss": -0.0702,
"step": 24,
"step_time": 5.787397329001578
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1765.0,
"completions/max_terminated_length": 1765.0,
"completions/mean_length": 1629.6875,
"completions/mean_terminated_length": 1629.6875,
"completions/min_length": 1226.0,
"completions/min_terminated_length": 1226.0,
"entropy": 0.42522644996643066,
"epoch": 0.0005,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.869774103164673,
"kl": 0.04610300064086914,
"learning_rate": 6.857142857142858e-06,
"loss": -0.0692,
"num_tokens": 934819.0,
"reward": -13.978726387023926,
"reward_std": 13.509387969970703,
"rewards/rollout_reward_func/mean": -13.978726387023926,
"rewards/rollout_reward_func/std": 15.972912788391113,
"sampling/importance_sampling_ratio/max": 2.516436815261841,
"sampling/importance_sampling_ratio/mean": 1.0305383205413818,
"sampling/importance_sampling_ratio/min": 0.15315498411655426,
"sampling/sampling_logp_difference/max": 0.7299051284790039,
"sampling/sampling_logp_difference/mean": 0.05616045743227005,
"step": 25,
"step_time": 34.39816641099969
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.019055451033636928,
"clip_ratio/low_min": 0.0035714285913854837,
"clip_ratio/region_mean": 0.021008576033636928,
"entropy": 0.41626644134521484,
"epoch": 0.00052,
"grad_norm": 2.5028293132781982,
"kl": 0.07302278326824307,
"learning_rate": 7.1428571428571436e-06,
"loss": -0.0701,
"step": 26,
"step_time": 5.674365414000931
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1821.0,
"completions/max_terminated_length": 1821.0,
"completions/mean_length": 1672.75,
"completions/mean_terminated_length": 1672.75,
"completions/min_length": 1139.0,
"completions/min_terminated_length": 1139.0,
"entropy": 0.45271916687488556,
"epoch": 0.00054,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.5938966274261475,
"kl": 0.08275566063821316,
"learning_rate": 7.428571428571429e-06,
"loss": -0.1045,
"num_tokens": 1009027.0,
"reward": -8.39571762084961,
"reward_std": 8.244482040405273,
"rewards/rollout_reward_func/mean": -8.39571762084961,
"rewards/rollout_reward_func/std": 10.250362396240234,
"sampling/importance_sampling_ratio/max": 2.743147850036621,
"sampling/importance_sampling_ratio/mean": 1.0750949382781982,
"sampling/importance_sampling_ratio/min": 0.17412874102592468,
"sampling/sampling_logp_difference/max": 1.0401973724365234,
"sampling/sampling_logp_difference/mean": 0.06947841495275497,
"step": 27,
"step_time": 37.13791847399898
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.009706439450383186,
"clip_ratio/low_min": 0.0037878789007663727,
"clip_ratio/region_mean": 0.015565814450383186,
"entropy": 0.4357459023594856,
"epoch": 0.00056,
"grad_norm": 2.490011692047119,
"kl": 0.1187180420383811,
"learning_rate": 7.714285714285716e-06,
"loss": -0.1102,
"step": 28,
"step_time": 5.809195054999691
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1781.0,
"completions/max_terminated_length": 1781.0,
"completions/mean_length": 1632.46875,
"completions/mean_terminated_length": 1632.46875,
"completions/min_length": 1152.0,
"completions/min_terminated_length": 1152.0,
"entropy": 0.4116561934351921,
"epoch": 0.00058,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.761763572692871,
"kl": 0.1548005947843194,
"learning_rate": 8.000000000000001e-06,
"loss": -0.2984,
"num_tokens": 1081858.0,
"reward": -3.9409875869750977,
"reward_std": 13.231146812438965,
"rewards/rollout_reward_func/mean": -3.9409875869750977,
"rewards/rollout_reward_func/std": 15.818596839904785,
"sampling/importance_sampling_ratio/max": 2.6883790493011475,
"sampling/importance_sampling_ratio/mean": 0.9734947681427002,
"sampling/importance_sampling_ratio/min": 0.0414450503885746,
"sampling/sampling_logp_difference/max": 1.37030029296875,
"sampling/sampling_logp_difference/mean": 0.07883325219154358,
"step": 29,
"step_time": 35.647616072001256
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.021484375,
"clip_ratio/low_min": 0.0078125,
"clip_ratio/region_mean": 0.021484375,
"entropy": 0.39363332837820053,
"epoch": 0.0006,
"grad_norm": 1.4616507291793823,
"kl": 0.23487216513603926,
"learning_rate": 8.285714285714287e-06,
"loss": -0.3039,
"step": 30,
"step_time": 5.713043954000568
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1727.0,
"completions/max_terminated_length": 1727.0,
"completions/mean_length": 1557.90625,
"completions/mean_terminated_length": 1557.90625,
"completions/min_length": 876.0,
"completions/min_terminated_length": 876.0,
"entropy": 0.35244373232126236,
"epoch": 0.00062,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.890699863433838,
"kl": 0.21074518747627735,
"learning_rate": 8.571428571428571e-06,
"loss": -0.29,
"num_tokens": 1152094.0,
"reward": -6.871125221252441,
"reward_std": 8.818297386169434,
"rewards/rollout_reward_func/mean": -6.871125221252441,
"rewards/rollout_reward_func/std": 15.35950756072998,
"sampling/importance_sampling_ratio/max": 2.7834908962249756,
"sampling/importance_sampling_ratio/mean": 1.0891664028167725,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.534125804901123,
"sampling/sampling_logp_difference/mean": 0.08870169520378113,
"step": 31,
"step_time": 32.18954384399967
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.023319128900766373,
"clip_ratio/low_min": 0.011482007801532745,
"clip_ratio/region_mean": 0.023319128900766373,
"entropy": 0.332146555185318,
"epoch": 0.00064,
"grad_norm": 1.486317753791809,
"kl": 0.3121166592463851,
"learning_rate": 8.857142857142858e-06,
"loss": -0.2913,
"step": 32,
"step_time": 5.56816236999839
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1797.0,
"completions/max_terminated_length": 1797.0,
"completions/mean_length": 1660.71875,
"completions/mean_terminated_length": 1660.71875,
"completions/min_length": 1569.0,
"completions/min_terminated_length": 1569.0,
"entropy": 0.30892339907586575,
"epoch": 0.00066,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.552164912223816,
"kl": 0.49738539289683104,
"learning_rate": 9.142857142857144e-06,
"loss": 0.0404,
"num_tokens": 1225849.0,
"reward": -2.268610954284668,
"reward_std": 7.639632225036621,
"rewards/rollout_reward_func/mean": -2.268610954284668,
"rewards/rollout_reward_func/std": 9.778660774230957,
"sampling/importance_sampling_ratio/max": 2.200782299041748,
"sampling/importance_sampling_ratio/mean": 1.0492533445358276,
"sampling/importance_sampling_ratio/min": 0.0332188606262207,
"sampling/sampling_logp_difference/max": 1.9084991216659546,
"sampling/sampling_logp_difference/mean": 0.10417380183935165,
"step": 33,
"step_time": 38.71719070200106
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.013671875,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.017578125,
"entropy": 0.2939576134085655,
"epoch": 0.00068,
"grad_norm": 1.684899091720581,
"kl": 0.6880170339718461,
"learning_rate": 9.42857142857143e-06,
"loss": 0.0402,
"step": 34,
"step_time": 5.760281337999004
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1822.0,
"completions/max_terminated_length": 1822.0,
"completions/mean_length": 1693.6875,
"completions/mean_terminated_length": 1693.6875,
"completions/min_length": 863.0,
"completions/min_terminated_length": 863.0,
"entropy": 0.29154328256845474,
"epoch": 0.0007,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4360551834106445,
"kl": 0.4088546773418784,
"learning_rate": 9.714285714285715e-06,
"loss": -0.0455,
"num_tokens": 1301276.0,
"reward": 0.6782079935073853,
"reward_std": 8.422740936279297,
"rewards/rollout_reward_func/mean": 0.6782079935073853,
"rewards/rollout_reward_func/std": 11.004219055175781,
"sampling/importance_sampling_ratio/max": 2.5485658645629883,
"sampling/importance_sampling_ratio/mean": 0.9325703382492065,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.17055344581604,
"sampling/sampling_logp_difference/mean": 0.10468995571136475,
"step": 35,
"step_time": 35.92719602200032
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.2877119109034538,
"epoch": 0.00072,
"grad_norm": 1.655564308166504,
"kl": 0.46621189545840025,
"learning_rate": 1e-05,
"loss": -0.047,
"step": 36,
"step_time": 5.835250215999622
},
{
"clip_ratio/high_max": 0.009588068351149559,
"clip_ratio/high_mean": 0.0047940341755747795,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0067471591755747795,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1799.0,
"completions/max_terminated_length": 1799.0,
"completions/mean_length": 1550.90625,
"completions/mean_terminated_length": 1550.90625,
"completions/min_length": 408.0,
"completions/min_terminated_length": 408.0,
"entropy": 0.3196103312075138,
"epoch": 0.00074,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.215740919113159,
"kl": 0.681450292468071,
"learning_rate": 9.999999998148153e-06,
"loss": -0.1484,
"num_tokens": 1371674.0,
"reward": -7.619454383850098,
"reward_std": 9.219554901123047,
"rewards/rollout_reward_func/mean": -7.619454383850098,
"rewards/rollout_reward_func/std": 13.010725021362305,
"sampling/importance_sampling_ratio/max": 2.8137710094451904,
"sampling/importance_sampling_ratio/mean": 0.994865894317627,
"sampling/importance_sampling_ratio/min": 0.05689575895667076,
"sampling/sampling_logp_difference/max": 2.270341157913208,
"sampling/sampling_logp_difference/mean": 0.10556286573410034,
"step": 37,
"step_time": 31.889447493997977
},
{
"clip_ratio/high_max": 0.0234375,
"clip_ratio/high_mean": 0.01171875,
"clip_ratio/low_mean": 0.01065340917557478,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.02237215917557478,
"entropy": 0.32365087792277336,
"epoch": 0.00076,
"grad_norm": 1.419662594795227,
"kl": 0.6783300125971437,
"learning_rate": 9.999999992592613e-06,
"loss": -0.1485,
"step": 38,
"step_time": 5.828878851998525
},
{
"clip_ratio/high_max": 0.0036764706019312143,
"clip_ratio/high_mean": 0.0018382353009656072,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0018382353009656072,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1706.0,
"completions/max_terminated_length": 1706.0,
"completions/mean_length": 1627.1875,
"completions/mean_terminated_length": 1627.1875,
"completions/min_length": 1383.0,
"completions/min_terminated_length": 1383.0,
"entropy": 0.27882106602191925,
"epoch": 0.00078,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6235533952713013,
"kl": 0.6874659867025912,
"learning_rate": 9.999999983333379e-06,
"loss": -0.185,
"num_tokens": 1444324.0,
"reward": -7.30010461807251,
"reward_std": 5.225525856018066,
"rewards/rollout_reward_func/mean": -7.30010461807251,
"rewards/rollout_reward_func/std": 7.217170715332031,
"sampling/importance_sampling_ratio/max": 1.8792587518692017,
"sampling/importance_sampling_ratio/mean": 0.7841943502426147,
"sampling/importance_sampling_ratio/min": 1.4742858626612398e-12,
"sampling/sampling_logp_difference/max": 26.153600692749023,
"sampling/sampling_logp_difference/mean": 0.14350782334804535,
"step": 39,
"step_time": 37.57111164300022
},
{
"clip_ratio/high_max": 0.015395220601931214,
"clip_ratio/high_mean": 0.007697610300965607,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011603860300965607,
"entropy": 0.28622257336974144,
"epoch": 0.0008,
"grad_norm": 1.2302234172821045,
"kl": 0.5287733990699053,
"learning_rate": 9.999999970370451e-06,
"loss": -0.1871,
"step": 40,
"step_time": 5.553051099000186
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1804.0,
"completions/max_terminated_length": 1804.0,
"completions/mean_length": 1600.71875,
"completions/mean_terminated_length": 1600.71875,
"completions/min_length": 645.0,
"completions/min_terminated_length": 645.0,
"entropy": 0.29325923696160316,
"epoch": 0.00082,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7616039514541626,
"kl": 1.0703641921281815,
"learning_rate": 9.99999995370383e-06,
"loss": -0.0902,
"num_tokens": 1516459.0,
"reward": -2.2486090660095215,
"reward_std": 6.6893110275268555,
"rewards/rollout_reward_func/mean": -2.2486090660095215,
"rewards/rollout_reward_func/std": 8.419123649597168,
"sampling/importance_sampling_ratio/max": 1.7888535261154175,
"sampling/importance_sampling_ratio/mean": 0.7111120223999023,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 24.990407943725586,
"sampling/sampling_logp_difference/mean": 0.1629885733127594,
"step": 41,
"step_time": 36.283448277998104
},
{
"clip_ratio/high_max": 0.02734375,
"clip_ratio/high_mean": 0.013671875,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"entropy": 0.3051527179777622,
"epoch": 0.00084,
"grad_norm": 1.2423319816589355,
"kl": 0.7511667739599943,
"learning_rate": 9.999999933333514e-06,
"loss": -0.0931,
"step": 42,
"step_time": 5.804295024000567
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1778.0,
"completions/max_terminated_length": 1778.0,
"completions/mean_length": 1654.71875,
"completions/mean_terminated_length": 1654.71875,
"completions/min_length": 1497.0,
"completions/min_terminated_length": 1497.0,
"entropy": 0.30402176454663277,
"epoch": 0.00086,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1876243352890015,
"kl": 0.3072218671441078,
"learning_rate": 9.999999909259504e-06,
"loss": -0.2329,
"num_tokens": 1590037.0,
"reward": -7.089628219604492,
"reward_std": 6.80203914642334,
"rewards/rollout_reward_func/mean": -7.089628219604492,
"rewards/rollout_reward_func/std": 8.887528419494629,
"sampling/importance_sampling_ratio/max": 2.1073219776153564,
"sampling/importance_sampling_ratio/mean": 0.829495906829834,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.700775384902954,
"sampling/sampling_logp_difference/mean": 0.09777739644050598,
"step": 43,
"step_time": 36.3438146339995
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.31153809279203415,
"epoch": 0.00088,
"grad_norm": 1.2713712453842163,
"kl": 0.26692129112780094,
"learning_rate": 9.9999998814818e-06,
"loss": -0.2308,
"step": 44,
"step_time": 6.615750631999617
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0018939394503831863,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0018939394503831863,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1815.0,
"completions/max_terminated_length": 1815.0,
"completions/mean_length": 1707.53125,
"completions/mean_terminated_length": 1707.53125,
"completions/min_length": 1563.0,
"completions/min_terminated_length": 1563.0,
"entropy": 0.3288617916405201,
"epoch": 0.0009,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.71139395236969,
"kl": 0.36906831432133913,
"learning_rate": 9.999999850000403e-06,
"loss": -0.3049,
"num_tokens": 1664951.0,
"reward": -7.296619415283203,
"reward_std": 7.16457462310791,
"rewards/rollout_reward_func/mean": -7.296619415283203,
"rewards/rollout_reward_func/std": 10.008492469787598,
"sampling/importance_sampling_ratio/max": 2.4993622303009033,
"sampling/importance_sampling_ratio/mean": 0.9107609987258911,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.4973220825195312,
"sampling/sampling_logp_difference/mean": 0.08788459748029709,
"step": 45,
"step_time": 37.92444931400041
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.0018939394503831863,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007753314450383186,
"entropy": 0.33048854768276215,
"epoch": 0.00092,
"grad_norm": 1.5902395248413086,
"kl": 0.3576007531955838,
"learning_rate": 9.999999814815314e-06,
"loss": -0.3085,
"step": 46,
"step_time": 5.829015459999937
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1783.0,
"completions/max_terminated_length": 1783.0,
"completions/mean_length": 1676.21875,
"completions/mean_terminated_length": 1676.21875,
"completions/min_length": 1445.0,
"completions/min_terminated_length": 1445.0,
"entropy": 0.3326357714831829,
"epoch": 0.00094,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6773115396499634,
"kl": 0.5565784685313702,
"learning_rate": 9.99999977592653e-06,
"loss": -0.007,
"num_tokens": 1739609.0,
"reward": 0.43848538398742676,
"reward_std": 6.706617832183838,
"rewards/rollout_reward_func/mean": 0.43848538398742676,
"rewards/rollout_reward_func/std": 11.718527793884277,
"sampling/importance_sampling_ratio/max": 2.1868457794189453,
"sampling/importance_sampling_ratio/mean": 0.8691270351409912,
"sampling/importance_sampling_ratio/min": 0.01293564960360527,
"sampling/sampling_logp_difference/max": 1.9674878120422363,
"sampling/sampling_logp_difference/mean": 0.11179050803184509,
"step": 47,
"step_time": 38.77335685399976
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.32995380833745,
"epoch": 0.00096,
"grad_norm": 1.376336932182312,
"kl": 0.6505217012017965,
"learning_rate": 9.999999733334051e-06,
"loss": -0.0105,
"step": 48,
"step_time": 5.757157200998336
},
{
"clip_ratio/high_max": 0.006623641354963183,
"clip_ratio/high_mean": 0.0033118206774815917,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005264945677481592,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1766.0,
"completions/max_terminated_length": 1766.0,
"completions/mean_length": 1583.28125,
"completions/mean_terminated_length": 1583.28125,
"completions/min_length": 699.0,
"completions/min_terminated_length": 699.0,
"entropy": 0.32827378809452057,
"epoch": 0.00098,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5635427236557007,
"kl": 0.2907373011112213,
"learning_rate": 9.99999968703788e-06,
"loss": -0.4158,
"num_tokens": 1810562.0,
"reward": -6.829834938049316,
"reward_std": 7.807450294494629,
"rewards/rollout_reward_func/mean": -6.829834938049316,
"rewards/rollout_reward_func/std": 9.211454391479492,
"sampling/importance_sampling_ratio/max": 2.658860683441162,
"sampling/importance_sampling_ratio/mean": 1.1884284019470215,
"sampling/importance_sampling_ratio/min": 8.98076324150387e-20,
"sampling/sampling_logp_difference/max": 23.28862953186035,
"sampling/sampling_logp_difference/mean": 0.18271556496620178,
"step": 49,
"step_time": 34.62787952399958
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.009765625,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.3216196559369564,
"epoch": 0.001,
"grad_norm": 1.6985197067260742,
"kl": 0.3250427544116974,
"learning_rate": 9.999999637038016e-06,
"loss": -0.4232,
"step": 50,
"step_time": 6.581373721998716
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1817.0,
"completions/max_terminated_length": 1817.0,
"completions/mean_length": 1713.4375,
"completions/mean_terminated_length": 1713.4375,
"completions/min_length": 1563.0,
"completions/min_terminated_length": 1563.0,
"entropy": 0.3401281237602234,
"epoch": 0.00102,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.996994972229004,
"kl": 0.4171795817092061,
"learning_rate": 9.999999583334458e-06,
"loss": -0.1942,
"num_tokens": 1885989.0,
"reward": -5.845695495605469,
"reward_std": 7.324997901916504,
"rewards/rollout_reward_func/mean": -5.845695495605469,
"rewards/rollout_reward_func/std": 8.569113731384277,
"sampling/importance_sampling_ratio/max": 2.724292039871216,
"sampling/importance_sampling_ratio/mean": 0.9547368884086609,
"sampling/importance_sampling_ratio/min": 0.08236531913280487,
"sampling/sampling_logp_difference/max": 1.727935791015625,
"sampling/sampling_logp_difference/mean": 0.10494339466094971,
"step": 51,
"step_time": 35.600800350999634
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.009765625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.334337517619133,
"epoch": 0.00104,
"grad_norm": 1.823840618133545,
"kl": 0.4406869113445282,
"learning_rate": 9.999999525927207e-06,
"loss": -0.199,
"step": 52,
"step_time": 5.828666499000974
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1768.0,
"completions/max_terminated_length": 1768.0,
"completions/mean_length": 1652.09375,
"completions/mean_terminated_length": 1652.09375,
"completions/min_length": 1469.0,
"completions/min_terminated_length": 1469.0,
"entropy": 0.30093443021178246,
"epoch": 0.00106,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3367406129837036,
"kl": 0.2392230462282896,
"learning_rate": 9.999999464816262e-06,
"loss": 0.0859,
"num_tokens": 1959377.0,
"reward": -2.500540256500244,
"reward_std": 6.36287260055542,
"rewards/rollout_reward_func/mean": -2.500540256500244,
"rewards/rollout_reward_func/std": 8.245169639587402,
"sampling/importance_sampling_ratio/max": 2.5041768550872803,
"sampling/importance_sampling_ratio/mean": 0.9154846668243408,
"sampling/importance_sampling_ratio/min": 3.6384679991119384e-12,
"sampling/sampling_logp_difference/max": 26.868139266967773,
"sampling/sampling_logp_difference/mean": 0.12778040766716003,
"step": 53,
"step_time": 37.75833414000044
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.30127133056521416,
"epoch": 0.00108,
"grad_norm": 1.2612088918685913,
"kl": 0.2430503461509943,
"learning_rate": 9.999999400001624e-06,
"loss": 0.0846,
"step": 54,
"step_time": 5.662651007998647
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1798.0,
"completions/max_terminated_length": 1798.0,
"completions/mean_length": 1671.09375,
"completions/mean_terminated_length": 1671.09375,
"completions/min_length": 1438.0,
"completions/min_terminated_length": 1438.0,
"entropy": 0.30898030288517475,
"epoch": 0.0011,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7188196182250977,
"kl": 0.3995477119460702,
"learning_rate": 9.999999331483293e-06,
"loss": -0.1787,
"num_tokens": 2033283.0,
"reward": -1.8547379970550537,
"reward_std": 12.750988960266113,
"rewards/rollout_reward_func/mean": -1.8547379970550537,
"rewards/rollout_reward_func/std": 15.986498832702637,
"sampling/importance_sampling_ratio/max": 2.544159173965454,
"sampling/importance_sampling_ratio/mean": 0.9402295351028442,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.6443579196929932,
"sampling/sampling_logp_difference/mean": 0.084171824157238,
"step": 55,
"step_time": 36.70284881099997
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.31088376231491566,
"epoch": 0.00112,
"grad_norm": 1.6016024351119995,
"kl": 0.4059265488758683,
"learning_rate": 9.999999259261269e-06,
"loss": -0.1811,
"step": 56,
"step_time": 6.270625210000617
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1824.0,
"completions/max_terminated_length": 1824.0,
"completions/mean_length": 1709.875,
"completions/mean_terminated_length": 1709.875,
"completions/min_length": 1545.0,
"completions/min_terminated_length": 1545.0,
"entropy": 0.29036473482847214,
"epoch": 0.00114,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5101275444030762,
"kl": 0.5090412199497223,
"learning_rate": 9.999999183335551e-06,
"loss": 0.0419,
"num_tokens": 2108465.0,
"reward": -3.56026291847229,
"reward_std": 6.288623809814453,
"rewards/rollout_reward_func/mean": -3.56026291847229,
"rewards/rollout_reward_func/std": 8.626129150390625,
"sampling/importance_sampling_ratio/max": 2.409752607345581,
"sampling/importance_sampling_ratio/mean": 1.0026273727416992,
"sampling/importance_sampling_ratio/min": 0.07137506455183029,
"sampling/sampling_logp_difference/max": 1.469163417816162,
"sampling/sampling_logp_difference/mean": 0.07790054380893707,
"step": 57,
"step_time": 37.90581317999931
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.2957710847258568,
"epoch": 0.00116,
"grad_norm": 1.3159449100494385,
"kl": 0.4575840122997761,
"learning_rate": 9.999999103706142e-06,
"loss": 0.0392,
"step": 58,
"step_time": 5.813098757998887
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1777.0,
"completions/max_terminated_length": 1777.0,
"completions/mean_length": 1694.0,
"completions/mean_terminated_length": 1694.0,
"completions/min_length": 1574.0,
"completions/min_terminated_length": 1574.0,
"entropy": 0.31563786044716835,
"epoch": 0.00118,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3663612604141235,
"kl": 0.29621826112270355,
"learning_rate": 9.999999020373038e-06,
"loss": -0.1621,
"num_tokens": 2183167.0,
"reward": -2.3904330730438232,
"reward_std": 9.909059524536133,
"rewards/rollout_reward_func/mean": -2.3904330730438232,
"rewards/rollout_reward_func/std": 13.05620002746582,
"sampling/importance_sampling_ratio/max": 2.209512233734131,
"sampling/importance_sampling_ratio/mean": 1.010425329208374,
"sampling/importance_sampling_ratio/min": 0.09116992354393005,
"sampling/sampling_logp_difference/max": 1.1912827491760254,
"sampling/sampling_logp_difference/mean": 0.07018784433603287,
"step": 59,
"step_time": 35.51438805900034
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.3229809142649174,
"epoch": 0.0012,
"grad_norm": 1.4618602991104126,
"kl": 0.2840815596282482,
"learning_rate": 9.999998933336242e-06,
"loss": -0.1622,
"step": 60,
"step_time": 5.714706689000195
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.003791360300965607,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005744485300965607,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1790.0,
"completions/max_terminated_length": 1790.0,
"completions/mean_length": 1640.28125,
"completions/mean_terminated_length": 1640.28125,
"completions/min_length": 1306.0,
"completions/min_terminated_length": 1306.0,
"entropy": 0.3373373970389366,
"epoch": 0.00122,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2567992210388184,
"kl": 0.4037298448383808,
"learning_rate": 9.999998842595754e-06,
"loss": -0.2237,
"num_tokens": 2256023.0,
"reward": -5.301498889923096,
"reward_std": 9.393596649169922,
"rewards/rollout_reward_func/mean": -5.301498889923096,
"rewards/rollout_reward_func/std": 11.857550621032715,
"sampling/importance_sampling_ratio/max": 1.5339993238449097,
"sampling/importance_sampling_ratio/mean": 0.7339890599250793,
"sampling/importance_sampling_ratio/min": 4.623656180147151e-10,
"sampling/sampling_logp_difference/max": 21.543071746826172,
"sampling/sampling_logp_difference/mean": 0.1303299069404602,
"step": 61,
"step_time": 37.366424696999275
},
{
"clip_ratio/high_max": 0.02748579578474164,
"clip_ratio/high_mean": 0.01374289789237082,
"clip_ratio/low_mean": 0.003791360300965607,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.017534258076921105,
"entropy": 0.3488129526376724,
"epoch": 0.00124,
"grad_norm": 1.0096025466918945,
"kl": 0.36894314270466566,
"learning_rate": 9.999998748151573e-06,
"loss": -0.2269,
"step": 62,
"step_time": 5.755052162999164
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1794.0,
"completions/max_terminated_length": 1794.0,
"completions/mean_length": 1623.4375,
"completions/mean_terminated_length": 1623.4375,
"completions/min_length": 1016.0,
"completions/min_terminated_length": 1016.0,
"entropy": 0.33689238503575325,
"epoch": 0.00126,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6014989614486694,
"kl": 0.25019469298422337,
"learning_rate": 9.999998650003697e-06,
"loss": -0.2651,
"num_tokens": 2328798.0,
"reward": -1.0410969257354736,
"reward_std": 7.0697126388549805,
"rewards/rollout_reward_func/mean": -1.0410969257354736,
"rewards/rollout_reward_func/std": 8.429243087768555,
"sampling/importance_sampling_ratio/max": 2.9675095081329346,
"sampling/importance_sampling_ratio/mean": 1.0478936433792114,
"sampling/importance_sampling_ratio/min": 0.13199584186077118,
"sampling/sampling_logp_difference/max": 1.1867618560791016,
"sampling/sampling_logp_difference/mean": 0.0737098827958107,
"step": 63,
"step_time": 34.2187673239996
},
{
"clip_ratio/high_max": 0.0037878789007663727,
"clip_ratio/high_mean": 0.0018939394503831863,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0038470644503831863,
"entropy": 0.33753813430666924,
"epoch": 0.00128,
"grad_norm": 1.4934430122375488,
"kl": 0.25292993150651455,
"learning_rate": 9.999998548152132e-06,
"loss": -0.269,
"step": 64,
"step_time": 5.7678524439998
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1667.0,
"completions/max_terminated_length": 1667.0,
"completions/mean_length": 1601.15625,
"completions/mean_terminated_length": 1601.15625,
"completions/min_length": 1455.0,
"completions/min_terminated_length": 1455.0,
"entropy": 0.32645104452967644,
"epoch": 0.0013,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3427740335464478,
"kl": 0.5479664504528046,
"learning_rate": 9.999998442596872e-06,
"loss": -0.0545,
"num_tokens": 2400705.0,
"reward": -2.5378642082214355,
"reward_std": 5.739175319671631,
"rewards/rollout_reward_func/mean": -2.5378642082214355,
"rewards/rollout_reward_func/std": 7.828306198120117,
"sampling/importance_sampling_ratio/max": 2.44716215133667,
"sampling/importance_sampling_ratio/mean": 0.7654808163642883,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.2951812744140625,
"sampling/sampling_logp_difference/mean": 0.11022517830133438,
"step": 65,
"step_time": 37.02498609800023
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.009765625,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.015625,
"entropy": 0.3282633610069752,
"epoch": 0.00132,
"grad_norm": 1.3897420167922974,
"kl": 0.5626763068139553,
"learning_rate": 9.999998333337923e-06,
"loss": -0.0562,
"step": 66,
"step_time": 5.503358288000527
},
{
"clip_ratio/high_max": 0.027107007801532745,
"clip_ratio/high_mean": 0.013553503900766373,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.017459753900766373,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1817.0,
"completions/max_terminated_length": 1817.0,
"completions/mean_length": 1672.75,
"completions/mean_terminated_length": 1672.75,
"completions/min_length": 1138.0,
"completions/min_terminated_length": 1138.0,
"entropy": 0.37730714678764343,
"epoch": 0.00134,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.0150251388549805,
"kl": 0.2086246032267809,
"learning_rate": 9.99999822037528e-06,
"loss": -0.2037,
"num_tokens": 2474830.0,
"reward": -2.223475933074951,
"reward_std": 10.239798545837402,
"rewards/rollout_reward_func/mean": -2.223475933074951,
"rewards/rollout_reward_func/std": 10.8533353805542,
"sampling/importance_sampling_ratio/max": 2.6771914958953857,
"sampling/importance_sampling_ratio/mean": 0.894666850566864,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.1186803579330444,
"sampling/sampling_logp_difference/mean": 0.0862615779042244,
"step": 67,
"step_time": 36.272244204000344
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.007753314450383186,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009706439450383186,
"entropy": 0.3758174180984497,
"epoch": 0.00136,
"grad_norm": 1.9776581525802612,
"kl": 0.22687916830182076,
"learning_rate": 9.999998103708944e-06,
"loss": -0.2033,
"step": 68,
"step_time": 5.841565231000459
},
{
"clip_ratio/high_max": 0.007694128900766373,
"clip_ratio/high_mean": 0.0038470644503831863,
"clip_ratio/low_mean": 0.006138392956927419,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009985457174479961,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1798.0,
"completions/max_terminated_length": 1798.0,
"completions/mean_length": 1643.09375,
"completions/mean_terminated_length": 1643.09375,
"completions/min_length": 983.0,
"completions/min_terminated_length": 983.0,
"entropy": 0.36833325773477554,
"epoch": 0.00138,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.960590124130249,
"kl": 0.34138039220124483,
"learning_rate": 9.999997983338918e-06,
"loss": -0.0379,
"num_tokens": 2548037.0,
"reward": -7.881344318389893,
"reward_std": 6.760621547698975,
"rewards/rollout_reward_func/mean": -7.881344318389893,
"rewards/rollout_reward_func/std": 10.78054428100586,
"sampling/importance_sampling_ratio/max": 2.7962646484375,
"sampling/importance_sampling_ratio/mean": 0.860181450843811,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.209362506866455,
"sampling/sampling_logp_difference/mean": 0.09240779280662537,
"step": 69,
"step_time": 36.16751210700022
},
{
"clip_ratio/high_max": 0.007694128900766373,
"clip_ratio/high_mean": 0.0038470644503831863,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007753314450383186,
"entropy": 0.36754436418414116,
"epoch": 0.0014,
"grad_norm": 2.076843500137329,
"kl": 0.3671616306528449,
"learning_rate": 9.999997859265198e-06,
"loss": -0.0415,
"step": 70,
"step_time": 5.7710249699994165
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1788.0,
"completions/max_terminated_length": 1788.0,
"completions/mean_length": 1649.71875,
"completions/mean_terminated_length": 1649.71875,
"completions/min_length": 1263.0,
"completions/min_terminated_length": 1263.0,
"entropy": 0.3730818182229996,
"epoch": 0.00142,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2103312015533447,
"kl": 0.3976009897887707,
"learning_rate": 9.999997731487788e-06,
"loss": -0.1988,
"num_tokens": 2621492.0,
"reward": 1.924525260925293,
"reward_std": 9.762751579284668,
"rewards/rollout_reward_func/mean": 1.924525260925293,
"rewards/rollout_reward_func/std": 11.85600757598877,
"sampling/importance_sampling_ratio/max": 2.882676362991333,
"sampling/importance_sampling_ratio/mean": 0.8822349905967712,
"sampling/importance_sampling_ratio/min": 0.08272430300712585,
"sampling/sampling_logp_difference/max": 1.3770942687988281,
"sampling/sampling_logp_difference/mean": 0.09666060656309128,
"step": 71,
"step_time": 35.46656954899936
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"entropy": 0.3668806441128254,
"epoch": 0.00144,
"grad_norm": 1.325650930404663,
"kl": 0.4230933412909508,
"learning_rate": 9.999997600006685e-06,
"loss": -0.2015,
"step": 72,
"step_time": 6.277837039999213
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1798.0,
"completions/max_terminated_length": 1798.0,
"completions/mean_length": 1667.5625,
"completions/mean_terminated_length": 1667.5625,
"completions/min_length": 1473.0,
"completions/min_terminated_length": 1473.0,
"entropy": 0.3248750977218151,
"epoch": 0.00146,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4092869758605957,
"kl": 0.33990394696593285,
"learning_rate": 9.999997464821892e-06,
"loss": -0.0273,
"num_tokens": 2695474.0,
"reward": -1.5238330364227295,
"reward_std": 6.099469184875488,
"rewards/rollout_reward_func/mean": -1.5238330364227295,
"rewards/rollout_reward_func/std": 7.693445682525635,
"sampling/importance_sampling_ratio/max": 2.1886417865753174,
"sampling/importance_sampling_ratio/mean": 0.9180707931518555,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.373499870300293,
"sampling/sampling_logp_difference/mean": 0.07824774086475372,
"step": 73,
"step_time": 35.77179241700014
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.3200640454888344,
"epoch": 0.00148,
"grad_norm": 1.2980810403823853,
"kl": 0.37775165028870106,
"learning_rate": 9.999997325933409e-06,
"loss": -0.0305,
"step": 74,
"step_time": 5.783254465000027
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1758.0,
"completions/max_terminated_length": 1758.0,
"completions/mean_length": 1648.6875,
"completions/mean_terminated_length": 1648.6875,
"completions/min_length": 1500.0,
"completions/min_terminated_length": 1500.0,
"entropy": 0.3307141959667206,
"epoch": 0.0015,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.529542088508606,
"kl": 0.3330858051776886,
"learning_rate": 9.999997183341233e-06,
"loss": -0.1579,
"num_tokens": 2768769.0,
"reward": -1.9445281028747559,
"reward_std": 6.960838317871094,
"rewards/rollout_reward_func/mean": -1.9445281028747559,
"rewards/rollout_reward_func/std": 8.304977416992188,
"sampling/importance_sampling_ratio/max": 2.4997639656066895,
"sampling/importance_sampling_ratio/mean": 0.8088055849075317,
"sampling/importance_sampling_ratio/min": 0.09736470133066177,
"sampling/sampling_logp_difference/max": 1.7568840980529785,
"sampling/sampling_logp_difference/mean": 0.09207496047019958,
"step": 75,
"step_time": 34.512230323999574
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.005800189450383186,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007753314450383186,
"entropy": 0.322955708950758,
"epoch": 0.00152,
"grad_norm": 1.5048940181732178,
"kl": 0.37345754727721214,
"learning_rate": 9.999997037045365e-06,
"loss": -0.1606,
"step": 76,
"step_time": 5.6651066240001455
},
{
"clip_ratio/high_max": 0.011600378900766373,
"clip_ratio/high_mean": 0.005800189450383186,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007753314450383186,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1744.0,
"completions/max_terminated_length": 1744.0,
"completions/mean_length": 1672.03125,
"completions/mean_terminated_length": 1672.03125,
"completions/min_length": 1536.0,
"completions/min_terminated_length": 1536.0,
"entropy": 0.3117912784218788,
"epoch": 0.00154,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2413570880889893,
"kl": 0.7537785340100527,
"learning_rate": 9.999996887045808e-06,
"loss": -0.0368,
"num_tokens": 2842874.0,
"reward": -1.6352441310882568,
"reward_std": 6.7997283935546875,
"rewards/rollout_reward_func/mean": -1.6352441310882568,
"rewards/rollout_reward_func/std": 9.460980415344238,
"sampling/importance_sampling_ratio/max": 2.215763568878174,
"sampling/importance_sampling_ratio/mean": 0.7862486839294434,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.4788150787353516,
"sampling/sampling_logp_difference/mean": 0.10331455618143082,
"step": 77,
"step_time": 37.13179366900022
},
{
"clip_ratio/high_max": 0.011600378900766373,
"clip_ratio/high_mean": 0.005800189450383186,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.013612689450383186,
"entropy": 0.30787648260593414,
"epoch": 0.00156,
"grad_norm": 1.3049235343933105,
"kl": 0.7575539667159319,
"learning_rate": 9.99999673334256e-06,
"loss": -0.0388,
"step": 78,
"step_time": 6.551664077999703
},
{
"clip_ratio/high_max": 0.014436141354963183,
"clip_ratio/high_mean": 0.007218070677481592,
"clip_ratio/low_mean": 0.0034564394736662507,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010674510151147842,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1718.0,
"completions/max_terminated_length": 1718.0,
"completions/mean_length": 1622.15625,
"completions/mean_terminated_length": 1619.806396484375,
"completions/min_length": 1097.0,
"completions/min_terminated_length": 1097.0,
"entropy": 0.28395895659923553,
"epoch": 0.00158,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.774448275566101,
"kl": 0.48293524608016014,
"learning_rate": 9.99999657593562e-06,
"loss": -0.108,
"num_tokens": 2915438.0,
"reward": -4.349352836608887,
"reward_std": 5.503687381744385,
"rewards/rollout_reward_func/mean": -4.349352836608887,
"rewards/rollout_reward_func/std": 6.947903156280518,
"sampling/importance_sampling_ratio/max": 2.6842427253723145,
"sampling/importance_sampling_ratio/mean": 0.6991802453994751,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.6641254425048828,
"sampling/sampling_logp_difference/mean": 0.10404618084430695,
"step": 79,
"step_time": 37.2892129060001
},
{
"clip_ratio/high_max": 0.021467391401529312,
"clip_ratio/high_mean": 0.010733695700764656,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012686820700764656,
"entropy": 0.28391823545098305,
"epoch": 0.0016,
"grad_norm": 1.2243019342422485,
"kl": 0.5016566403210163,
"learning_rate": 9.99999641482499e-06,
"loss": -0.1094,
"step": 80,
"step_time": 5.5879295219992855
},
{
"clip_ratio/high_max": 0.008370535913854837,
"clip_ratio/high_mean": 0.006138392724096775,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008091517724096775,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1776.0,
"completions/max_terminated_length": 1776.0,
"completions/mean_length": 1655.34375,
"completions/mean_terminated_length": 1655.34375,
"completions/min_length": 1172.0,
"completions/min_terminated_length": 1172.0,
"entropy": 0.26277439296245575,
"epoch": 0.00162,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.137077808380127,
"kl": 0.6223671287298203,
"learning_rate": 9.999996250010671e-06,
"loss": -0.1437,
"num_tokens": 2989329.0,
"reward": -2.7409677505493164,
"reward_std": 7.541594505310059,
"rewards/rollout_reward_func/mean": -2.7409677505493164,
"rewards/rollout_reward_func/std": 10.378182411193848,
"sampling/importance_sampling_ratio/max": 2.4534168243408203,
"sampling/importance_sampling_ratio/mean": 0.8305980563163757,
"sampling/importance_sampling_ratio/min": 0.05540309473872185,
"sampling/sampling_logp_difference/max": 1.5272252559661865,
"sampling/sampling_logp_difference/mean": 0.08948823064565659,
"step": 81,
"step_time": 34.51007757499883
},
{
"clip_ratio/high_max": 0.012276785913854837,
"clip_ratio/high_mean": 0.006138392956927419,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010044642956927419,
"entropy": 0.2634422518312931,
"epoch": 0.00164,
"grad_norm": 1.2092783451080322,
"kl": 0.5849091820418835,
"learning_rate": 9.999996081492662e-06,
"loss": -0.1446,
"step": 82,
"step_time": 5.717573774000812
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1830.0,
"completions/max_terminated_length": 1830.0,
"completions/mean_length": 1691.46875,
"completions/mean_terminated_length": 1691.46875,
"completions/min_length": 1465.0,
"completions/min_terminated_length": 1465.0,
"entropy": 0.274138493463397,
"epoch": 0.00166,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.1113321781158447,
"kl": 0.42387660406529903,
"learning_rate": 9.999995909270962e-06,
"loss": -0.2307,
"num_tokens": 3063737.0,
"reward": -5.677342414855957,
"reward_std": 5.941880226135254,
"rewards/rollout_reward_func/mean": -5.677342414855957,
"rewards/rollout_reward_func/std": 10.720724105834961,
"sampling/importance_sampling_ratio/max": 2.54227614402771,
"sampling/importance_sampling_ratio/mean": 1.0267385244369507,
"sampling/importance_sampling_ratio/min": 0.09123067557811737,
"sampling/sampling_logp_difference/max": 1.443819522857666,
"sampling/sampling_logp_difference/mean": 0.08112908899784088,
"step": 83,
"step_time": 35.11318050199952
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.009765625,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.013671875,
"entropy": 0.2726801373064518,
"epoch": 0.00168,
"grad_norm": 1.3651243448257446,
"kl": 0.3971481677144766,
"learning_rate": 9.999995733345573e-06,
"loss": -0.2362,
"step": 84,
"step_time": 6.308203391001371
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1837.0,
"completions/max_terminated_length": 1837.0,
"completions/mean_length": 1705.21875,
"completions/mean_terminated_length": 1705.21875,
"completions/min_length": 1491.0,
"completions/min_terminated_length": 1491.0,
"entropy": 0.297397093847394,
"epoch": 0.0017,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3406461477279663,
"kl": 0.37086668983101845,
"learning_rate": 9.999995553716494e-06,
"loss": -0.1103,
"num_tokens": 3139446.0,
"reward": -6.638528823852539,
"reward_std": 7.859951972961426,
"rewards/rollout_reward_func/mean": -6.638528823852539,
"rewards/rollout_reward_func/std": 12.807344436645508,
"sampling/importance_sampling_ratio/max": 2.7764086723327637,
"sampling/importance_sampling_ratio/mean": 0.7711232900619507,
"sampling/importance_sampling_ratio/min": 0.055727217346429825,
"sampling/sampling_logp_difference/max": 1.3549007177352905,
"sampling/sampling_logp_difference/mean": 0.09796138107776642,
"step": 85,
"step_time": 35.18805706900184
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.29359815642237663,
"epoch": 0.00172,
"grad_norm": 1.2749756574630737,
"kl": 0.37067493610084057,
"learning_rate": 9.999995370383725e-06,
"loss": -0.1135,
"step": 86,
"step_time": 5.857377255998472
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1799.0,
"completions/max_terminated_length": 1799.0,
"completions/mean_length": 1642.34375,
"completions/mean_terminated_length": 1642.34375,
"completions/min_length": 1546.0,
"completions/min_terminated_length": 1546.0,
"entropy": 0.25620525516569614,
"epoch": 0.00174,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2850360870361328,
"kl": 0.5967638976871967,
"learning_rate": 9.999995183347268e-06,
"loss": -0.2511,
"num_tokens": 3212569.0,
"reward": -4.25941276550293,
"reward_std": 4.8053388595581055,
"rewards/rollout_reward_func/mean": -4.25941276550293,
"rewards/rollout_reward_func/std": 7.727044105529785,
"sampling/importance_sampling_ratio/max": 2.633681058883667,
"sampling/importance_sampling_ratio/mean": 0.743991494178772,
"sampling/importance_sampling_ratio/min": 0.027961313724517822,
"sampling/sampling_logp_difference/max": 2.183626651763916,
"sampling/sampling_logp_difference/mean": 0.09684586524963379,
"step": 87,
"step_time": 35.86174799499986
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.2555703092366457,
"epoch": 0.00176,
"grad_norm": 1.0292936563491821,
"kl": 0.5942294523119926,
"learning_rate": 9.999994992607122e-06,
"loss": -0.2543,
"step": 88,
"step_time": 5.7727871480010435
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1785.0,
"completions/max_terminated_length": 1785.0,
"completions/mean_length": 1587.25,
"completions/mean_terminated_length": 1587.25,
"completions/min_length": 714.0,
"completions/min_terminated_length": 714.0,
"entropy": 0.2820359170436859,
"epoch": 0.00178,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.688633680343628,
"kl": 0.7878309283405542,
"learning_rate": 9.999994798163286e-06,
"loss": -0.088,
"num_tokens": 3283670.0,
"reward": -1.0320963859558105,
"reward_std": 7.258734703063965,
"rewards/rollout_reward_func/mean": -1.0320963859558105,
"rewards/rollout_reward_func/std": 11.793685913085938,
"sampling/importance_sampling_ratio/max": 2.2084357738494873,
"sampling/importance_sampling_ratio/mean": 0.8563207387924194,
"sampling/importance_sampling_ratio/min": 0.050470076501369476,
"sampling/sampling_logp_difference/max": 1.6832528114318848,
"sampling/sampling_logp_difference/mean": 0.09054332971572876,
"step": 89,
"step_time": 35.323508203998244
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.28230165503919125,
"epoch": 0.0018,
"grad_norm": 1.442618727684021,
"kl": 0.6910470742732286,
"learning_rate": 9.999994600015764e-06,
"loss": -0.0916,
"step": 90,
"step_time": 5.74529949799944
},
{
"clip_ratio/high_max": 0.0032051282469183207,
"clip_ratio/high_mean": 0.0016025641234591603,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0035556891234591603,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1806.0,
"completions/max_terminated_length": 1806.0,
"completions/mean_length": 1682.875,
"completions/mean_terminated_length": 1682.875,
"completions/min_length": 1197.0,
"completions/min_terminated_length": 1197.0,
"entropy": 0.31537965685129166,
"epoch": 0.00182,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3677870035171509,
"kl": 0.8561782389879227,
"learning_rate": 9.99999439816455e-06,
"loss": -0.0483,
"num_tokens": 3358360.0,
"reward": -6.243229866027832,
"reward_std": 5.828756809234619,
"rewards/rollout_reward_func/mean": -6.243229866027832,
"rewards/rollout_reward_func/std": 7.688446998596191,
"sampling/importance_sampling_ratio/max": 1.9665902853012085,
"sampling/importance_sampling_ratio/mean": 0.5829079151153564,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.1947691440582275,
"sampling/sampling_logp_difference/mean": 0.10844805836677551,
"step": 91,
"step_time": 36.43737021200013
},
{
"clip_ratio/high_max": 0.01101762824691832,
"clip_ratio/high_mean": 0.00550881412345916,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00746193912345916,
"entropy": 0.31900631822645664,
"epoch": 0.00184,
"grad_norm": 0.9948733448982239,
"kl": 0.6338084079325199,
"learning_rate": 9.999994192609649e-06,
"loss": -0.0534,
"step": 92,
"step_time": 5.8420921550005005
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1778.0,
"completions/max_terminated_length": 1778.0,
"completions/mean_length": 1658.375,
"completions/mean_terminated_length": 1658.375,
"completions/min_length": 1488.0,
"completions/min_terminated_length": 1488.0,
"entropy": 0.24276937916874886,
"epoch": 0.00186,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.994575023651123,
"kl": 0.6424030996859074,
"learning_rate": 9.99999398335106e-06,
"loss": -0.1681,
"num_tokens": 3432383.0,
"reward": -2.1294469833374023,
"reward_std": 8.107059478759766,
"rewards/rollout_reward_func/mean": -2.1294469833374023,
"rewards/rollout_reward_func/std": 9.736873626708984,
"sampling/importance_sampling_ratio/max": 1.9103875160217285,
"sampling/importance_sampling_ratio/mean": 0.5858784914016724,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.698678970336914,
"sampling/sampling_logp_difference/mean": 0.10006298124790192,
"step": 93,
"step_time": 35.31451669100079
},
{
"clip_ratio/high_max": 0.02734375,
"clip_ratio/high_mean": 0.015625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.017578125,
"entropy": 0.24839669093489647,
"epoch": 0.00188,
"grad_norm": 0.7679377198219299,
"kl": 0.5870474837720394,
"learning_rate": 9.999993770388785e-06,
"loss": -0.1693,
"step": 94,
"step_time": 5.76490683999873
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1810.0,
"completions/max_terminated_length": 1810.0,
"completions/mean_length": 1640.625,
"completions/mean_terminated_length": 1640.625,
"completions/min_length": 1128.0,
"completions/min_terminated_length": 1128.0,
"entropy": 0.2957016546279192,
"epoch": 0.0019,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.351029634475708,
"kl": 0.21496129594743252,
"learning_rate": 9.99999355372282e-06,
"loss": -0.1061,
"num_tokens": 3505361.0,
"reward": -2.3848154544830322,
"reward_std": 4.5673828125,
"rewards/rollout_reward_func/mean": -2.3848154544830322,
"rewards/rollout_reward_func/std": 7.392127513885498,
"sampling/importance_sampling_ratio/max": 2.9048802852630615,
"sampling/importance_sampling_ratio/mean": 1.1882892847061157,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.2800261974334717,
"sampling/sampling_logp_difference/mean": 0.07288186252117157,
"step": 95,
"step_time": 35.78977110999949
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.30211447179317474,
"epoch": 0.00192,
"grad_norm": 1.8887192010879517,
"kl": 0.20806447509676218,
"learning_rate": 9.999993333353169e-06,
"loss": -0.1059,
"step": 96,
"step_time": 5.819023845999254
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1788.0,
"completions/max_terminated_length": 1788.0,
"completions/mean_length": 1695.625,
"completions/mean_terminated_length": 1695.625,
"completions/min_length": 1572.0,
"completions/min_terminated_length": 1572.0,
"entropy": 0.29871419444680214,
"epoch": 0.00194,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2661700248718262,
"kl": 0.31952402740716934,
"learning_rate": 9.999993109279829e-06,
"loss": -0.2492,
"num_tokens": 3580589.0,
"reward": -1.2452682256698608,
"reward_std": 4.780298233032227,
"rewards/rollout_reward_func/mean": -1.2452682256698608,
"rewards/rollout_reward_func/std": 7.40675163269043,
"sampling/importance_sampling_ratio/max": 2.997992515563965,
"sampling/importance_sampling_ratio/mean": 0.8266191482543945,
"sampling/importance_sampling_ratio/min": 0.1390458345413208,
"sampling/sampling_logp_difference/max": 1.113917589187622,
"sampling/sampling_logp_difference/mean": 0.07985492050647736,
"step": 97,
"step_time": 35.633980886000245
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.30503564700484276,
"epoch": 0.00196,
"grad_norm": 1.3084163665771484,
"kl": 0.3136520925909281,
"learning_rate": 9.999992881502803e-06,
"loss": -0.2512,
"step": 98,
"step_time": 5.782925431000876
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1825.0,
"completions/max_terminated_length": 1825.0,
"completions/mean_length": 1708.65625,
"completions/mean_terminated_length": 1708.65625,
"completions/min_length": 1498.0,
"completions/min_terminated_length": 1498.0,
"entropy": 0.28120192512869835,
"epoch": 0.00198,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.1371867656707764,
"kl": 0.30992733500897884,
"learning_rate": 9.999992650022092e-06,
"loss": -0.0307,
"num_tokens": 3655808.0,
"reward": -2.348900556564331,
"reward_std": 5.844359397888184,
"rewards/rollout_reward_func/mean": -2.348900556564331,
"rewards/rollout_reward_func/std": 7.3201165199279785,
"sampling/importance_sampling_ratio/max": 2.0892417430877686,
"sampling/importance_sampling_ratio/mean": 0.8098887205123901,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.9662381410598755,
"sampling/sampling_logp_difference/mean": 0.0835987776517868,
"step": 99,
"step_time": 38.374850201998925
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.28472086787223816,
"epoch": 0.002,
"grad_norm": 1.0553685426712036,
"kl": 0.32899605855345726,
"learning_rate": 9.999992414837692e-06,
"loss": -0.0328,
"step": 100,
"step_time": 6.762840421000874
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1794.0,
"completions/max_terminated_length": 1794.0,
"completions/mean_length": 1659.1875,
"completions/mean_terminated_length": 1659.1875,
"completions/min_length": 1242.0,
"completions/min_terminated_length": 1242.0,
"entropy": 0.3777337931096554,
"epoch": 0.00202,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.684167504310608,
"kl": 0.24033249728381634,
"learning_rate": 9.999992175949606e-06,
"loss": -0.1423,
"num_tokens": 3729226.0,
"reward": -0.08680570125579834,
"reward_std": 6.447270393371582,
"rewards/rollout_reward_func/mean": -0.08680570125579834,
"rewards/rollout_reward_func/std": 11.136384963989258,
"sampling/importance_sampling_ratio/max": 2.5989115238189697,
"sampling/importance_sampling_ratio/mean": 0.9449913501739502,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.2526357173919678,
"sampling/sampling_logp_difference/mean": 0.08848999440670013,
"step": 101,
"step_time": 35.97663474399906
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.0038470644503831863,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011659564450383186,
"entropy": 0.3802182339131832,
"epoch": 0.00204,
"grad_norm": 1.5747473239898682,
"kl": 0.22995636146515608,
"learning_rate": 9.999991933357835e-06,
"loss": -0.1454,
"step": 102,
"step_time": 5.79218666900033
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1761.0,
"completions/max_terminated_length": 1761.0,
"completions/mean_length": 1652.96875,
"completions/mean_terminated_length": 1652.96875,
"completions/min_length": 1516.0,
"completions/min_terminated_length": 1516.0,
"entropy": 0.33094101771712303,
"epoch": 0.00206,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6915507316589355,
"kl": 0.277794674038887,
"learning_rate": 9.999991687062379e-06,
"loss": -0.2347,
"num_tokens": 3802729.0,
"reward": 1.5054678916931152,
"reward_std": 6.697179317474365,
"rewards/rollout_reward_func/mean": 1.5054678916931152,
"rewards/rollout_reward_func/std": 12.069811820983887,
"sampling/importance_sampling_ratio/max": 2.974453926086426,
"sampling/importance_sampling_ratio/mean": 0.9312765598297119,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.4296720027923584,
"sampling/sampling_logp_difference/mean": 0.08249014616012573,
"step": 103,
"step_time": 36.59170679699946
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013671875,
"entropy": 0.33079541847109795,
"epoch": 0.00208,
"grad_norm": 1.049177885055542,
"kl": 0.2827172800898552,
"learning_rate": 9.999991437063234e-06,
"loss": -0.2385,
"step": 104,
"step_time": 5.693732602999262
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.009765625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1820.0,
"completions/max_terminated_length": 1820.0,
"completions/mean_length": 1666.9375,
"completions/mean_terminated_length": 1666.9375,
"completions/min_length": 1540.0,
"completions/min_terminated_length": 1540.0,
"entropy": 0.3682529255747795,
"epoch": 0.0021,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7118496894836426,
"kl": 0.2884139958769083,
"learning_rate": 9.999991183360406e-06,
"loss": -0.1423,
"num_tokens": 3876320.0,
"reward": -0.8790185451507568,
"reward_std": 5.496585845947266,
"rewards/rollout_reward_func/mean": -0.8790185451507568,
"rewards/rollout_reward_func/std": 7.660371780395508,
"sampling/importance_sampling_ratio/max": 2.7382900714874268,
"sampling/importance_sampling_ratio/mean": 1.0317010879516602,
"sampling/importance_sampling_ratio/min": 0.19462421536445618,
"sampling/sampling_logp_difference/max": 1.0869190692901611,
"sampling/sampling_logp_difference/mean": 0.07802367210388184,
"step": 105,
"step_time": 36.844014158999016
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.36161230877041817,
"epoch": 0.00212,
"grad_norm": 1.3527196645736694,
"kl": 0.32736348174512386,
"learning_rate": 9.999990925953894e-06,
"loss": -0.1463,
"step": 106,
"step_time": 6.756272589999753
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.005800189450383186,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.011659564450383186,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1805.0,
"completions/max_terminated_length": 1805.0,
"completions/mean_length": 1727.4375,
"completions/mean_terminated_length": 1727.4375,
"completions/min_length": 1536.0,
"completions/min_terminated_length": 1536.0,
"entropy": 0.3097492754459381,
"epoch": 0.00214,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5999138355255127,
"kl": 0.5242087468504906,
"learning_rate": 9.999990664843696e-06,
"loss": -0.3461,
"num_tokens": 3952089.0,
"reward": -2.7279787063598633,
"reward_std": 7.685024738311768,
"rewards/rollout_reward_func/mean": -2.7279787063598633,
"rewards/rollout_reward_func/std": 8.918709754943848,
"sampling/importance_sampling_ratio/max": 2.823850631713867,
"sampling/importance_sampling_ratio/mean": 1.0143675804138184,
"sampling/importance_sampling_ratio/min": 0.10803008079528809,
"sampling/sampling_logp_difference/max": 1.6307916641235352,
"sampling/sampling_logp_difference/mean": 0.08010027557611465,
"step": 107,
"step_time": 34.80215419899923
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.005800189450383186,
"clip_ratio/low_mean": 0.017518939450383186,
"clip_ratio/low_min": 0.0078125,
"clip_ratio/region_mean": 0.023319128900766373,
"entropy": 0.2990362048149109,
"epoch": 0.00216,
"grad_norm": 1.7662874460220337,
"kl": 0.5438167825341225,
"learning_rate": 9.999990400029814e-06,
"loss": -0.3506,
"step": 108,
"step_time": 5.828784976000861
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1835.0,
"completions/max_terminated_length": 1835.0,
"completions/mean_length": 1701.5,
"completions/mean_terminated_length": 1701.5,
"completions/min_length": 1555.0,
"completions/min_terminated_length": 1555.0,
"entropy": 0.2941362299025059,
"epoch": 0.00218,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.4735493659973145,
"kl": 0.46664091385900974,
"learning_rate": 9.999990131512245e-06,
"loss": -0.1545,
"num_tokens": 4027063.0,
"reward": -1.1875361204147339,
"reward_std": 6.082664489746094,
"rewards/rollout_reward_func/mean": -1.1875361204147339,
"rewards/rollout_reward_func/std": 7.877804279327393,
"sampling/importance_sampling_ratio/max": 2.3916215896606445,
"sampling/importance_sampling_ratio/mean": 0.7803431153297424,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.4629955291748047,
"sampling/sampling_logp_difference/mean": 0.08069309592247009,
"step": 109,
"step_time": 35.122385199001656
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.28531504422426224,
"epoch": 0.0022,
"grad_norm": 1.2467232942581177,
"kl": 0.5820192918181419,
"learning_rate": 9.999989859290995e-06,
"loss": -0.1614,
"step": 110,
"step_time": 5.834867768998265
},
{
"clip_ratio/high_max": 0.011600378900766373,
"clip_ratio/high_mean": 0.005800189450383186,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007753314450383186,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1805.0,
"completions/max_terminated_length": 1805.0,
"completions/mean_length": 1649.5625,
"completions/mean_terminated_length": 1649.5625,
"completions/min_length": 687.0,
"completions/min_terminated_length": 687.0,
"entropy": 0.29295698180794716,
"epoch": 0.00222,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.934397578239441,
"kl": 0.9904868267476559,
"learning_rate": 9.99998958336606e-06,
"loss": -0.2592,
"num_tokens": 4100465.0,
"reward": -1.0518858432769775,
"reward_std": 7.8057475090026855,
"rewards/rollout_reward_func/mean": -1.0518858432769775,
"rewards/rollout_reward_func/std": 12.11864185333252,
"sampling/importance_sampling_ratio/max": 2.4532089233398438,
"sampling/importance_sampling_ratio/mean": 0.7939997315406799,
"sampling/importance_sampling_ratio/min": 0.03784231096506119,
"sampling/sampling_logp_difference/max": 2.585331439971924,
"sampling/sampling_logp_difference/mean": 0.11609256267547607,
"step": 111,
"step_time": 35.7164067620015
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.017578125,
"clip_ratio/low_min": 0.0078125,
"clip_ratio/region_mean": 0.021484375,
"entropy": 0.28337718918919563,
"epoch": 0.00224,
"grad_norm": 1.8777400255203247,
"kl": 1.192313952371478,
"learning_rate": 9.999989303737442e-06,
"loss": -0.2583,
"step": 112,
"step_time": 6.234686438999233
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1700.0,
"completions/max_terminated_length": 1700.0,
"completions/mean_length": 1627.25,
"completions/mean_terminated_length": 1627.25,
"completions/min_length": 1213.0,
"completions/min_terminated_length": 1213.0,
"entropy": 0.23321667686104774,
"epoch": 0.00226,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.0336928367614746,
"kl": 1.1097694747149944,
"learning_rate": 9.999989020405141e-06,
"loss": -0.136,
"num_tokens": 4173152.0,
"reward": -6.243222713470459,
"reward_std": 10.733819961547852,
"rewards/rollout_reward_func/mean": -6.243222713470459,
"rewards/rollout_reward_func/std": 13.576552391052246,
"sampling/importance_sampling_ratio/max": 2.0692925453186035,
"sampling/importance_sampling_ratio/mean": 0.7719696760177612,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 3.4085850715637207,
"sampling/sampling_logp_difference/mean": 0.11740700155496597,
"step": 113,
"step_time": 37.108936664000794
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.013055098708719015,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013055098708719015,
"entropy": 0.2277730144560337,
"epoch": 0.00228,
"grad_norm": 1.9860761165618896,
"kl": 1.227958582341671,
"learning_rate": 9.999988733369157e-06,
"loss": -0.1385,
"step": 114,
"step_time": 5.572819186999368
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.0018939394503831863,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007753314450383186,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1798.0,
"completions/max_terminated_length": 1798.0,
"completions/mean_length": 1693.75,
"completions/mean_terminated_length": 1693.75,
"completions/min_length": 1321.0,
"completions/min_terminated_length": 1321.0,
"entropy": 0.23424900509417057,
"epoch": 0.0023,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6176177263259888,
"kl": 1.1609091851860285,
"learning_rate": 9.999988442629489e-06,
"loss": -0.2144,
"num_tokens": 4248083.0,
"reward": -5.069212913513184,
"reward_std": 5.557045936584473,
"rewards/rollout_reward_func/mean": -5.069212913513184,
"rewards/rollout_reward_func/std": 12.084410667419434,
"sampling/importance_sampling_ratio/max": 2.459650754928589,
"sampling/importance_sampling_ratio/mean": 0.6402795910835266,
"sampling/importance_sampling_ratio/min": 5.712434665512732e-15,
"sampling/sampling_logp_difference/max": 30.60143280029297,
"sampling/sampling_logp_difference/mean": 0.17031052708625793,
"step": 115,
"step_time": 35.503748573997655
},
{
"clip_ratio/high_max": 0.0234375,
"clip_ratio/high_mean": 0.01171875,
"clip_ratio/low_mean": 0.005800189450383186,
"clip_ratio/low_min": 0.0037878789007663727,
"clip_ratio/region_mean": 0.017518939450383186,
"entropy": 0.23416488245129585,
"epoch": 0.00232,
"grad_norm": 1.2904891967773438,
"kl": 1.3006360940635204,
"learning_rate": 9.99998814818614e-06,
"loss": -0.217,
"step": 116,
"step_time": 5.804856802999893
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1801.0,
"completions/max_terminated_length": 1801.0,
"completions/mean_length": 1710.40625,
"completions/mean_terminated_length": 1710.40625,
"completions/min_length": 1604.0,
"completions/min_terminated_length": 1604.0,
"entropy": 0.23163120076060295,
"epoch": 0.00234,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6878459453582764,
"kl": 0.438737278804183,
"learning_rate": 9.999987850039108e-06,
"loss": -0.0609,
"num_tokens": 4323796.0,
"reward": -2.6192214488983154,
"reward_std": 8.0145263671875,
"rewards/rollout_reward_func/mean": -2.6192214488983154,
"rewards/rollout_reward_func/std": 10.107186317443848,
"sampling/importance_sampling_ratio/max": 1.9165576696395874,
"sampling/importance_sampling_ratio/mean": 0.9111311435699463,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.697357416152954,
"sampling/sampling_logp_difference/mean": 0.08581140637397766,
"step": 117,
"step_time": 38.28407665499981
},
{
"clip_ratio/high_max": 0.019412878900766373,
"clip_ratio/high_mean": 0.009706439450383186,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009706439450383186,
"entropy": 0.23309355787932873,
"epoch": 0.00236,
"grad_norm": 1.4197455644607544,
"kl": 0.39980714581906796,
"learning_rate": 9.999987548188395e-06,
"loss": -0.0665,
"step": 118,
"step_time": 5.822731712999484
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.008984374813735485,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008984374813735485,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1796.0,
"completions/max_terminated_length": 1796.0,
"completions/mean_length": 1468.46875,
"completions/mean_terminated_length": 1468.46875,
"completions/min_length": 191.0,
"completions/min_terminated_length": 191.0,
"entropy": 0.24193942546844482,
"epoch": 0.00238,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7685405015945435,
"kl": 0.39049934409558773,
"learning_rate": 9.999987242634e-06,
"loss": -0.0657,
"num_tokens": 4391500.0,
"reward": 1.9110275506973267,
"reward_std": 6.384462356567383,
"rewards/rollout_reward_func/mean": 1.9110275506973267,
"rewards/rollout_reward_func/std": 14.606425285339355,
"sampling/importance_sampling_ratio/max": 2.8693697452545166,
"sampling/importance_sampling_ratio/mean": 1.2269116640090942,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.3209800720214844,
"sampling/sampling_logp_difference/mean": 0.08356651663780212,
"step": 119,
"step_time": 31.813835332000053
},
{
"clip_ratio/high_max": 0.03158482210710645,
"clip_ratio/high_mean": 0.01774553582072258,
"clip_ratio/low_mean": 0.009706439450383186,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.027451975271105766,
"entropy": 0.2410776149481535,
"epoch": 0.0024,
"grad_norm": 1.5952562093734741,
"kl": 0.40999631211161613,
"learning_rate": 9.999986933375924e-06,
"loss": -0.0703,
"step": 120,
"step_time": 5.7420445510006175
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1751.0,
"completions/max_terminated_length": 1751.0,
"completions/mean_length": 1555.34375,
"completions/mean_terminated_length": 1555.34375,
"completions/min_length": 645.0,
"completions/min_terminated_length": 645.0,
"entropy": 0.2007010132074356,
"epoch": 0.00242,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9870114922523499,
"kl": 0.5888753104954958,
"learning_rate": 9.999986620414169e-06,
"loss": -0.0606,
"num_tokens": 4461583.0,
"reward": -3.0732581615448,
"reward_std": 2.935732364654541,
"rewards/rollout_reward_func/mean": -3.0732581615448,
"rewards/rollout_reward_func/std": 5.027767658233643,
"sampling/importance_sampling_ratio/max": 2.2580418586730957,
"sampling/importance_sampling_ratio/mean": 0.8960647583007812,
"sampling/importance_sampling_ratio/min": 4.473894033019121e-10,
"sampling/sampling_logp_difference/max": 16.850034713745117,
"sampling/sampling_logp_difference/mean": 0.11269309371709824,
"step": 121,
"step_time": 35.07806598499883
},
{
"clip_ratio/high_max": 0.016927083488553762,
"clip_ratio/high_mean": 0.008463541744276881,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012369791744276881,
"entropy": 0.2029730472713709,
"epoch": 0.00244,
"grad_norm": 0.7402886152267456,
"kl": 0.4976696763187647,
"learning_rate": 9.999986303748731e-06,
"loss": -0.0621,
"step": 122,
"step_time": 5.627297465000083
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1806.0,
"completions/max_terminated_length": 1806.0,
"completions/mean_length": 1683.59375,
"completions/mean_terminated_length": 1683.59375,
"completions/min_length": 1471.0,
"completions/min_terminated_length": 1471.0,
"entropy": 0.21090154722332954,
"epoch": 0.00246,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2972943782806396,
"kl": 0.4575018994510174,
"learning_rate": 9.999985983379614e-06,
"loss": -0.2785,
"num_tokens": 4535822.0,
"reward": -3.0389723777770996,
"reward_std": 4.446030616760254,
"rewards/rollout_reward_func/mean": -3.0389723777770996,
"rewards/rollout_reward_func/std": 6.439523220062256,
"sampling/importance_sampling_ratio/max": 1.792663335800171,
"sampling/importance_sampling_ratio/mean": 0.8646963834762573,
"sampling/importance_sampling_ratio/min": 0.10849396139383316,
"sampling/sampling_logp_difference/max": 2.1350996494293213,
"sampling/sampling_logp_difference/mean": 0.07303433865308762,
"step": 123,
"step_time": 37.839723685000536
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.21065166406333447,
"epoch": 0.00248,
"grad_norm": 1.294693946838379,
"kl": 0.4491172023117542,
"learning_rate": 9.999985659306817e-06,
"loss": -0.2798,
"step": 124,
"step_time": 5.816134531000898
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1791.0,
"completions/max_terminated_length": 1791.0,
"completions/mean_length": 1671.8125,
"completions/mean_terminated_length": 1671.8125,
"completions/min_length": 1453.0,
"completions/min_terminated_length": 1453.0,
"entropy": 0.31402457505464554,
"epoch": 0.0025,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7626806497573853,
"kl": 0.3699891809374094,
"learning_rate": 9.999985331530339e-06,
"loss": -0.123,
"num_tokens": 4609806.0,
"reward": -1.0957450866699219,
"reward_std": 8.374095916748047,
"rewards/rollout_reward_func/mean": -1.0957450866699219,
"rewards/rollout_reward_func/std": 10.550527572631836,
"sampling/importance_sampling_ratio/max": 2.8980863094329834,
"sampling/importance_sampling_ratio/mean": 1.0585088729858398,
"sampling/importance_sampling_ratio/min": 1.4836078201818364e-17,
"sampling/sampling_logp_difference/max": 20.844825744628906,
"sampling/sampling_logp_difference/mean": 0.13481880724430084,
"step": 125,
"step_time": 36.72139834099926
},
{
"clip_ratio/high_max": 0.006623641354963183,
"clip_ratio/high_mean": 0.0033118206774815917,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007218070677481592,
"entropy": 0.31360605917871,
"epoch": 0.00252,
"grad_norm": 1.7030891180038452,
"kl": 0.3880602568387985,
"learning_rate": 9.999985000050181e-06,
"loss": -0.1268,
"step": 126,
"step_time": 5.752750929000285
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0018939394503831863,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005800189450383186,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1770.0,
"completions/max_terminated_length": 1770.0,
"completions/mean_length": 1624.90625,
"completions/mean_terminated_length": 1624.90625,
"completions/min_length": 1254.0,
"completions/min_terminated_length": 1254.0,
"entropy": 0.23735753446817398,
"epoch": 0.00254,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.49963116645813,
"kl": 1.4479252882301807,
"learning_rate": 9.999984664866347e-06,
"loss": -0.1985,
"num_tokens": 4682513.0,
"reward": 0.2574213147163391,
"reward_std": 9.919325828552246,
"rewards/rollout_reward_func/mean": 0.2574213147163391,
"rewards/rollout_reward_func/std": 12.862212181091309,
"sampling/importance_sampling_ratio/max": 2.5531129837036133,
"sampling/importance_sampling_ratio/mean": 0.8520662784576416,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.3626585006713867,
"sampling/sampling_logp_difference/mean": 0.09023018181324005,
"step": 127,
"step_time": 36.4248705359978
},
{
"clip_ratio/high_max": 0.011979166883975267,
"clip_ratio/high_mean": 0.005989583441987634,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011848958441987634,
"entropy": 0.2332585696130991,
"epoch": 0.00256,
"grad_norm": 1.9195269346237183,
"kl": 1.4073903393000364,
"learning_rate": 9.999984325978833e-06,
"loss": -0.2016,
"step": 128,
"step_time": 6.607886812998004
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1825.0,
"completions/max_terminated_length": 1825.0,
"completions/mean_length": 1695.625,
"completions/mean_terminated_length": 1695.625,
"completions/min_length": 1334.0,
"completions/min_terminated_length": 1334.0,
"entropy": 0.2307057324796915,
"epoch": 0.00258,
"frac_reward_zero_std": 0.0,
"grad_norm": 6.967021465301514,
"kl": 2.5539040379226208,
"learning_rate": 9.99998398338764e-06,
"loss": -0.1924,
"num_tokens": 4757093.0,
"reward": -2.7488653659820557,
"reward_std": 5.045413017272949,
"rewards/rollout_reward_func/mean": -2.7488653659820557,
"rewards/rollout_reward_func/std": 10.707550048828125,
"sampling/importance_sampling_ratio/max": 2.2723803520202637,
"sampling/importance_sampling_ratio/mean": 0.799378514289856,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 21.946773529052734,
"sampling/sampling_logp_difference/mean": 0.14229635894298553,
"step": 129,
"step_time": 36.29410157899838
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.22786042280495167,
"epoch": 0.0026,
"grad_norm": 2.270787477493286,
"kl": 1.118930522352457,
"learning_rate": 9.99998363709277e-06,
"loss": -0.2081,
"step": 130,
"step_time": 5.818163544999152
},
{
"clip_ratio/high_max": 0.0036764706019312143,
"clip_ratio/high_mean": 0.0018382353009656072,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003791360300965607,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1784.0,
"completions/max_terminated_length": 1784.0,
"completions/mean_length": 1663.96875,
"completions/mean_terminated_length": 1663.96875,
"completions/min_length": 1170.0,
"completions/min_terminated_length": 1170.0,
"entropy": 0.23138621263206005,
"epoch": 0.00262,
"frac_reward_zero_std": 0.0,
"grad_norm": 10.312149047851562,
"kl": 4.53411222063005,
"learning_rate": 9.999983287094222e-06,
"loss": 0.0936,
"num_tokens": 4830857.0,
"reward": -1.0345556735992432,
"reward_std": 4.783634185791016,
"rewards/rollout_reward_func/mean": -1.0345556735992432,
"rewards/rollout_reward_func/std": 6.499684810638428,
"sampling/importance_sampling_ratio/max": 2.2273147106170654,
"sampling/importance_sampling_ratio/mean": 0.8017725944519043,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 22.242347717285156,
"sampling/sampling_logp_difference/mean": 0.13735142350196838,
"step": 131,
"step_time": 37.860813379999854
},
{
"clip_ratio/high_max": 0.026424632407724857,
"clip_ratio/high_mean": 0.013212316203862429,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015165441203862429,
"entropy": 0.23595001175999641,
"epoch": 0.00264,
"grad_norm": 6.295961380004883,
"kl": 2.904046291485429,
"learning_rate": 9.999982933391998e-06,
"loss": 0.0759,
"step": 132,
"step_time": 5.760328608001146
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.004185267956927419,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008091517956927419,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1723.0,
"completions/max_terminated_length": 1723.0,
"completions/mean_length": 1599.625,
"completions/mean_terminated_length": 1599.625,
"completions/min_length": 1124.0,
"completions/min_terminated_length": 1124.0,
"entropy": 0.1828257255256176,
"epoch": 0.00266,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.1880617141723633,
"kl": 0.6531639210879803,
"learning_rate": 9.999982575986095e-06,
"loss": -0.173,
"num_tokens": 4902653.0,
"reward": -1.3747859001159668,
"reward_std": 5.477799415588379,
"rewards/rollout_reward_func/mean": -1.3747859001159668,
"rewards/rollout_reward_func/std": 7.5533223152160645,
"sampling/importance_sampling_ratio/max": 2.5466907024383545,
"sampling/importance_sampling_ratio/mean": 0.995758056640625,
"sampling/importance_sampling_ratio/min": 0.007883523590862751,
"sampling/sampling_logp_difference/max": 2.0956740379333496,
"sampling/sampling_logp_difference/mean": 0.07060299813747406,
"step": 133,
"step_time": 34.61869661600031
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.19106297940015793,
"epoch": 0.00268,
"grad_norm": 1.2263319492340088,
"kl": 0.6410297751426697,
"learning_rate": 9.999982214876516e-06,
"loss": -0.1744,
"step": 134,
"step_time": 6.524274291999973
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1685.0,
"completions/max_terminated_length": 1685.0,
"completions/mean_length": 1521.625,
"completions/mean_terminated_length": 1521.625,
"completions/min_length": 953.0,
"completions/min_terminated_length": 953.0,
"entropy": 0.23908153176307678,
"epoch": 0.0027,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.1356704235076904,
"kl": 0.3956664204597473,
"learning_rate": 9.999981850063262e-06,
"loss": -0.2331,
"num_tokens": 4971778.0,
"reward": 1.5177106857299805,
"reward_std": 4.678309440612793,
"rewards/rollout_reward_func/mean": 1.5177106857299805,
"rewards/rollout_reward_func/std": 16.861400604248047,
"sampling/importance_sampling_ratio/max": 2.699097156524658,
"sampling/importance_sampling_ratio/mean": 0.861909806728363,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 19.98867416381836,
"sampling/sampling_logp_difference/mean": 0.13872608542442322,
"step": 135,
"step_time": 32.25473025500105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"entropy": 0.2471965067088604,
"epoch": 0.00272,
"grad_norm": 2.370650053024292,
"kl": 0.35189586132764816,
"learning_rate": 9.99998148154633e-06,
"loss": -0.2343,
"step": 136,
"step_time": 5.5205114909995245
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1778.0,
"completions/max_terminated_length": 1778.0,
"completions/mean_length": 1687.96875,
"completions/mean_terminated_length": 1687.96875,
"completions/min_length": 1462.0,
"completions/min_terminated_length": 1462.0,
"entropy": 0.25870174542069435,
"epoch": 0.00274,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2739732265472412,
"kl": 0.3620435334742069,
"learning_rate": 9.999981109325725e-06,
"loss": 0.0377,
"num_tokens": 5046449.0,
"reward": -4.809016227722168,
"reward_std": 5.6776933670043945,
"rewards/rollout_reward_func/mean": -4.809016227722168,
"rewards/rollout_reward_func/std": 15.731524467468262,
"sampling/importance_sampling_ratio/max": 2.0674188137054443,
"sampling/importance_sampling_ratio/mean": 0.8131706714630127,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 24.600109100341797,
"sampling/sampling_logp_difference/mean": 0.11974793672561646,
"step": 137,
"step_time": 36.03785672200047
},
{
"clip_ratio/high_max": 0.02734375,
"clip_ratio/high_mean": 0.017578125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.021484375,
"entropy": 0.2636088263243437,
"epoch": 0.00276,
"grad_norm": 0.9780673384666443,
"kl": 0.32067642733454704,
"learning_rate": 9.999980733401442e-06,
"loss": 0.0335,
"step": 138,
"step_time": 5.760654310999598
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1777.0,
"completions/max_terminated_length": 1777.0,
"completions/mean_length": 1652.3125,
"completions/mean_terminated_length": 1652.3125,
"completions/min_length": 1389.0,
"completions/min_terminated_length": 1389.0,
"entropy": 0.21654790081083775,
"epoch": 0.00278,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9906519651412964,
"kl": 0.22392724081873894,
"learning_rate": 9.999980353773486e-06,
"loss": -0.1274,
"num_tokens": 5119986.0,
"reward": -3.179938316345215,
"reward_std": 5.390859603881836,
"rewards/rollout_reward_func/mean": -3.179938316345215,
"rewards/rollout_reward_func/std": 7.339827537536621,
"sampling/importance_sampling_ratio/max": 2.915480613708496,
"sampling/importance_sampling_ratio/mean": 1.0259276628494263,
"sampling/importance_sampling_ratio/min": 3.232071999360181e-13,
"sampling/sampling_logp_difference/max": 29.06264877319336,
"sampling/sampling_logp_difference/mean": 0.10719652473926544,
"step": 139,
"step_time": 35.030210969999644
},
{
"clip_ratio/high_max": 0.015395220601931214,
"clip_ratio/high_mean": 0.009650735300965607,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011603860184550285,
"entropy": 0.22184443473815918,
"epoch": 0.0028,
"grad_norm": 1.378363013267517,
"kl": 0.2069135345518589,
"learning_rate": 9.999979970441856e-06,
"loss": -0.1325,
"step": 140,
"step_time": 6.193294476998744
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0036764706019312143,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0036764706019312143,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1768.0,
"completions/max_terminated_length": 1768.0,
"completions/mean_length": 1623.75,
"completions/mean_terminated_length": 1623.75,
"completions/min_length": 642.0,
"completions/min_terminated_length": 642.0,
"entropy": 0.29241427406668663,
"epoch": 0.00282,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9409116506576538,
"kl": 0.32110051065683365,
"learning_rate": 9.999979583406551e-06,
"loss": -0.1415,
"num_tokens": 5192337.0,
"reward": -3.2484803199768066,
"reward_std": 5.998169422149658,
"rewards/rollout_reward_func/mean": -3.2484803199768066,
"rewards/rollout_reward_func/std": 7.8606648445129395,
"sampling/importance_sampling_ratio/max": 2.497420072555542,
"sampling/importance_sampling_ratio/mean": 1.0857844352722168,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 26.159547805786133,
"sampling/sampling_logp_difference/mean": 0.12232924997806549,
"step": 141,
"step_time": 36.19595617200048
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.009420956019312143,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.015280331019312143,
"entropy": 0.2928194999694824,
"epoch": 0.00284,
"grad_norm": 1.5054811239242554,
"kl": 0.3162485882639885,
"learning_rate": 9.999979192667574e-06,
"loss": -0.1445,
"step": 142,
"step_time": 5.68182536599943
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.007634943351149559,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011541193351149559,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1782.0,
"completions/max_terminated_length": 1782.0,
"completions/mean_length": 1645.90625,
"completions/mean_terminated_length": 1645.90625,
"completions/min_length": 1448.0,
"completions/min_terminated_length": 1448.0,
"entropy": 0.3010127767920494,
"epoch": 0.00286,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9966871738433838,
"kl": 0.22623557038605213,
"learning_rate": 9.999978798224922e-06,
"loss": 0.094,
"num_tokens": 5265626.0,
"reward": -1.248368740081787,
"reward_std": 8.209827423095703,
"rewards/rollout_reward_func/mean": -1.248368740081787,
"rewards/rollout_reward_func/std": 10.719414710998535,
"sampling/importance_sampling_ratio/max": 2.8611063957214355,
"sampling/importance_sampling_ratio/mean": 1.1009293794631958,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 20.22269058227539,
"sampling/sampling_logp_difference/mean": 0.10865116864442825,
"step": 143,
"step_time": 34.8812253310025
},
{
"clip_ratio/high_max": 0.02688419120386243,
"clip_ratio/high_mean": 0.013442095601931214,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013442095601931214,
"entropy": 0.3076608795672655,
"epoch": 0.00288,
"grad_norm": 2.357238531112671,
"kl": 0.22019376046955585,
"learning_rate": 9.999978400078598e-06,
"loss": 0.0904,
"step": 144,
"step_time": 5.731513302997882
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1785.0,
"completions/max_terminated_length": 1785.0,
"completions/mean_length": 1654.75,
"completions/mean_terminated_length": 1654.75,
"completions/min_length": 1508.0,
"completions/min_terminated_length": 1508.0,
"entropy": 0.2691913191229105,
"epoch": 0.0029,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4389311075210571,
"kl": 0.30777561012655497,
"learning_rate": 9.9999779982286e-06,
"loss": -0.0461,
"num_tokens": 5339115.0,
"reward": 4.553075790405273,
"reward_std": 7.375555992126465,
"rewards/rollout_reward_func/mean": 4.553075790405273,
"rewards/rollout_reward_func/std": 9.625307083129883,
"sampling/importance_sampling_ratio/max": 2.137481451034546,
"sampling/importance_sampling_ratio/mean": 1.0123145580291748,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.0253870487213135,
"sampling/sampling_logp_difference/mean": 0.053506266325712204,
"step": 145,
"step_time": 37.3788223370002
},
{
"clip_ratio/high_max": 0.0234375,
"clip_ratio/high_mean": 0.01171875,
"clip_ratio/low_mean": 0.0038470644503831863,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015565814450383186,
"entropy": 0.27271147817373276,
"epoch": 0.00292,
"grad_norm": 1.379260540008545,
"kl": 0.27049021050333977,
"learning_rate": 9.999977592674933e-06,
"loss": -0.0501,
"step": 146,
"step_time": 5.7508253040005
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1769.0,
"completions/max_terminated_length": 1769.0,
"completions/mean_length": 1629.46875,
"completions/mean_terminated_length": 1629.46875,
"completions/min_length": 1117.0,
"completions/min_terminated_length": 1117.0,
"entropy": 0.273640938103199,
"epoch": 0.00294,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9744895696640015,
"kl": 0.21257582679390907,
"learning_rate": 9.999977183417593e-06,
"loss": -0.0741,
"num_tokens": 5412122.0,
"reward": -1.7804348468780518,
"reward_std": 5.91984224319458,
"rewards/rollout_reward_func/mean": -1.7804348468780518,
"rewards/rollout_reward_func/std": 9.313576698303223,
"sampling/importance_sampling_ratio/max": 2.493762493133545,
"sampling/importance_sampling_ratio/mean": 1.099168300628662,
"sampling/importance_sampling_ratio/min": 0.22300267219543457,
"sampling/sampling_logp_difference/max": 1.1365747451782227,
"sampling/sampling_logp_difference/mean": 0.05694221705198288,
"step": 147,
"step_time": 36.152454600000965
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.2737910356372595,
"epoch": 0.00296,
"grad_norm": 1.6924679279327393,
"kl": 0.2047729678452015,
"learning_rate": 9.999976770456581e-06,
"loss": -0.0774,
"step": 148,
"step_time": 5.677298628001154
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1790.0,
"completions/max_terminated_length": 1790.0,
"completions/mean_length": 1715.875,
"completions/mean_terminated_length": 1715.875,
"completions/min_length": 1601.0,
"completions/min_terminated_length": 1601.0,
"entropy": 0.27875737100839615,
"epoch": 0.00298,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.0113556385040283,
"kl": 0.2866439474746585,
"learning_rate": 9.999976353791898e-06,
"loss": -0.05,
"num_tokens": 5487662.0,
"reward": -3.409506320953369,
"reward_std": 4.39523983001709,
"rewards/rollout_reward_func/mean": -3.409506320953369,
"rewards/rollout_reward_func/std": 9.453926086425781,
"sampling/importance_sampling_ratio/max": 2.8223652839660645,
"sampling/importance_sampling_ratio/mean": 0.9090695977210999,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.958104133605957,
"sampling/sampling_logp_difference/mean": 0.0680600255727768,
"step": 149,
"step_time": 36.694275042998925
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.2778263594955206,
"epoch": 0.003,
"grad_norm": 1.462227463722229,
"kl": 0.30287417955696583,
"learning_rate": 9.999975933423546e-06,
"loss": -0.055,
"step": 150,
"step_time": 6.261132174002341
},
{
"clip_ratio/high_max": 0.011979166883975267,
"clip_ratio/high_mean": 0.005989583441987634,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005989583441987634,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1789.0,
"completions/max_terminated_length": 1789.0,
"completions/mean_length": 1670.53125,
"completions/mean_terminated_length": 1670.53125,
"completions/min_length": 1212.0,
"completions/min_terminated_length": 1212.0,
"entropy": 0.2821527421474457,
"epoch": 0.00302,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.420735478401184,
"kl": 0.2623867988586426,
"learning_rate": 9.999975509351522e-06,
"loss": -0.1198,
"num_tokens": 5561719.0,
"reward": 1.7372636795043945,
"reward_std": 7.599005222320557,
"rewards/rollout_reward_func/mean": 1.7372636795043945,
"rewards/rollout_reward_func/std": 15.485095977783203,
"sampling/importance_sampling_ratio/max": 2.6034064292907715,
"sampling/importance_sampling_ratio/mean": 1.0206873416900635,
"sampling/importance_sampling_ratio/min": 0.175743967294693,
"sampling/sampling_logp_difference/max": 1.0962951183319092,
"sampling/sampling_logp_difference/mean": 0.05628112703561783,
"step": 151,
"step_time": 34.65810895300274
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.011979166883975267,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013932291883975267,
"entropy": 0.27934372052550316,
"epoch": 0.00304,
"grad_norm": 1.3764787912368774,
"kl": 0.30363442841917276,
"learning_rate": 9.99997508157583e-06,
"loss": -0.1206,
"step": 152,
"step_time": 5.7744073849989945
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1781.0,
"completions/max_terminated_length": 1781.0,
"completions/mean_length": 1703.21875,
"completions/mean_terminated_length": 1703.21875,
"completions/min_length": 1567.0,
"completions/min_terminated_length": 1567.0,
"entropy": 0.2447413094341755,
"epoch": 0.00306,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.6576489210128784,
"kl": 0.4590331744402647,
"learning_rate": 9.999974650096467e-06,
"loss": -0.1218,
"num_tokens": 5636992.0,
"reward": -0.15775525569915771,
"reward_std": 9.043015480041504,
"rewards/rollout_reward_func/mean": -0.15775525569915771,
"rewards/rollout_reward_func/std": 10.28443431854248,
"sampling/importance_sampling_ratio/max": 2.2586140632629395,
"sampling/importance_sampling_ratio/mean": 0.8791904449462891,
"sampling/importance_sampling_ratio/min": 0.13126111030578613,
"sampling/sampling_logp_difference/max": 1.3272500038146973,
"sampling/sampling_logp_difference/mean": 0.06481630355119705,
"step": 153,
"step_time": 35.60768943099811
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.009765625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"entropy": 0.2421154472976923,
"epoch": 0.00308,
"grad_norm": 1.613027572631836,
"kl": 0.47781567834317684,
"learning_rate": 9.999974214913438e-06,
"loss": -0.1224,
"step": 154,
"step_time": 5.782275483999001
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1742.0,
"completions/max_terminated_length": 1742.0,
"completions/mean_length": 1647.8125,
"completions/mean_terminated_length": 1647.8125,
"completions/min_length": 1372.0,
"completions/min_terminated_length": 1372.0,
"entropy": 0.309613186866045,
"epoch": 0.0031,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3915244340896606,
"kl": 0.34489365108311176,
"learning_rate": 9.99997377602674e-06,
"loss": -0.2332,
"num_tokens": 5710235.0,
"reward": -5.272271156311035,
"reward_std": 4.266254901885986,
"rewards/rollout_reward_func/mean": -5.272271156311035,
"rewards/rollout_reward_func/std": 11.99683666229248,
"sampling/importance_sampling_ratio/max": 2.386578321456909,
"sampling/importance_sampling_ratio/mean": 0.8981663584709167,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 28.397716522216797,
"sampling/sampling_logp_difference/mean": 0.16181710362434387,
"step": 155,
"step_time": 36.79902890199992
},
{
"clip_ratio/high_max": 0.01907169120386243,
"clip_ratio/high_mean": 0.01148897036910057,
"clip_ratio/low_mean": 0.009142287075519562,
"clip_ratio/low_min": 0.002659574383869767,
"clip_ratio/region_mean": 0.020631257444620132,
"entropy": 0.3100100867450237,
"epoch": 0.00312,
"grad_norm": 0.9337635636329651,
"kl": 0.36718725040555,
"learning_rate": 9.999973333436373e-06,
"loss": -0.2342,
"step": 156,
"step_time": 6.479778260998501
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1778.0,
"completions/max_terminated_length": 1778.0,
"completions/mean_length": 1645.84375,
"completions/mean_terminated_length": 1645.84375,
"completions/min_length": 859.0,
"completions/min_terminated_length": 859.0,
"entropy": 0.23720368929207325,
"epoch": 0.00314,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3682451248168945,
"kl": 0.3653513854369521,
"learning_rate": 9.999972887142338e-06,
"loss": -0.1482,
"num_tokens": 5783053.0,
"reward": -5.2926249504089355,
"reward_std": 6.853837013244629,
"rewards/rollout_reward_func/mean": -5.2926249504089355,
"rewards/rollout_reward_func/std": 9.61239242553711,
"sampling/importance_sampling_ratio/max": 1.6078870296478271,
"sampling/importance_sampling_ratio/mean": 0.9465827345848083,
"sampling/importance_sampling_ratio/min": 0.14573633670806885,
"sampling/sampling_logp_difference/max": 1.2857561111450195,
"sampling/sampling_logp_difference/mean": 0.05283693969249725,
"step": 157,
"step_time": 35.09920485400198
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.23178580962121487,
"epoch": 0.00316,
"grad_norm": 1.3167202472686768,
"kl": 0.4019450955092907,
"learning_rate": 9.999972437144638e-06,
"loss": -0.1491,
"step": 158,
"step_time": 5.754554526000902
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1746.0,
"completions/max_terminated_length": 1746.0,
"completions/mean_length": 1676.34375,
"completions/mean_terminated_length": 1676.34375,
"completions/min_length": 1622.0,
"completions/min_terminated_length": 1622.0,
"entropy": 0.25272640213370323,
"epoch": 0.00318,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2554067373275757,
"kl": 0.2935179714113474,
"learning_rate": 9.99997198344327e-06,
"loss": -0.1578,
"num_tokens": 5857549.0,
"reward": 1.5661380290985107,
"reward_std": 8.48967170715332,
"rewards/rollout_reward_func/mean": 1.5661380290985107,
"rewards/rollout_reward_func/std": 10.192009925842285,
"sampling/importance_sampling_ratio/max": 1.9953430891036987,
"sampling/importance_sampling_ratio/mean": 0.9570077061653137,
"sampling/importance_sampling_ratio/min": 0.08553915470838547,
"sampling/sampling_logp_difference/max": 1.1288342475891113,
"sampling/sampling_logp_difference/mean": 0.05689948797225952,
"step": 159,
"step_time": 36.0974576239978
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.009765625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.24649916216731071,
"epoch": 0.0032,
"grad_norm": 1.2487872838974,
"kl": 0.3261691927909851,
"learning_rate": 9.999971526038236e-06,
"loss": -0.1592,
"step": 160,
"step_time": 5.650420889000998
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0038470644503831863,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005800189450383186,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1807.0,
"completions/max_terminated_length": 1807.0,
"completions/mean_length": 1604.75,
"completions/mean_terminated_length": 1604.75,
"completions/min_length": 1071.0,
"completions/min_terminated_length": 1071.0,
"entropy": 0.23597443290054798,
"epoch": 0.00322,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1652355194091797,
"kl": 0.5538041703402996,
"learning_rate": 9.999971064929537e-06,
"loss": -0.1124,
"num_tokens": 5929687.0,
"reward": 5.239955425262451,
"reward_std": 8.498127937316895,
"rewards/rollout_reward_func/mean": 5.239955425262451,
"rewards/rollout_reward_func/std": 10.677428245544434,
"sampling/importance_sampling_ratio/max": 2.1468427181243896,
"sampling/importance_sampling_ratio/mean": 0.8562913537025452,
"sampling/importance_sampling_ratio/min": 0.11029359698295593,
"sampling/sampling_logp_difference/max": 1.4075746536254883,
"sampling/sampling_logp_difference/mean": 0.05289062112569809,
"step": 161,
"step_time": 35.0522555460002
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.009836647659540176,
"clip_ratio/low_min": 0.0078125,
"clip_ratio/region_mean": 0.013742897659540176,
"entropy": 0.23265731893479824,
"epoch": 0.00324,
"grad_norm": 1.0499224662780762,
"kl": 0.5876908944919705,
"learning_rate": 9.999970600117172e-06,
"loss": -0.1161,
"step": 162,
"step_time": 6.270556325998768
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1808.0,
"completions/max_terminated_length": 1808.0,
"completions/mean_length": 1688.3125,
"completions/mean_terminated_length": 1688.3125,
"completions/min_length": 1426.0,
"completions/min_terminated_length": 1426.0,
"entropy": 0.24026566371321678,
"epoch": 0.00326,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.526390552520752,
"kl": 0.6829012483358383,
"learning_rate": 9.999970131601143e-06,
"loss": -0.1204,
"num_tokens": 6004625.0,
"reward": -2.4879937171936035,
"reward_std": 6.033343315124512,
"rewards/rollout_reward_func/mean": -2.4879937171936035,
"rewards/rollout_reward_func/std": 9.8729829788208,
"sampling/importance_sampling_ratio/max": 2.08561372756958,
"sampling/importance_sampling_ratio/mean": 0.930208683013916,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 21.826404571533203,
"sampling/sampling_logp_difference/mean": 0.10662569105625153,
"step": 163,
"step_time": 37.25797335199968
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.23994088359177113,
"epoch": 0.00328,
"grad_norm": 0.99588543176651,
"kl": 0.7377689871937037,
"learning_rate": 9.99996965938145e-06,
"loss": -0.123,
"step": 164,
"step_time": 5.81927943699975
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1800.0,
"completions/max_terminated_length": 1800.0,
"completions/mean_length": 1682.96875,
"completions/mean_terminated_length": 1682.96875,
"completions/min_length": 1526.0,
"completions/min_terminated_length": 1526.0,
"entropy": 0.20219792239367962,
"epoch": 0.0033,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7853528261184692,
"kl": 0.6294286772608757,
"learning_rate": 9.999969183458093e-06,
"loss": -0.1554,
"num_tokens": 6078930.0,
"reward": -0.3497920036315918,
"reward_std": 7.241074085235596,
"rewards/rollout_reward_func/mean": -0.3497920036315918,
"rewards/rollout_reward_func/std": 11.06623363494873,
"sampling/importance_sampling_ratio/max": 2.898749828338623,
"sampling/importance_sampling_ratio/mean": 1.0241472721099854,
"sampling/importance_sampling_ratio/min": 0.15246078372001648,
"sampling/sampling_logp_difference/max": 1.200087070465088,
"sampling/sampling_logp_difference/mean": 0.052497610449790955,
"step": 165,
"step_time": 36.50886810300017
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.19982031919062138,
"epoch": 0.00332,
"grad_norm": 1.6592210531234741,
"kl": 0.6106511801481247,
"learning_rate": 9.999968703831072e-06,
"loss": -0.1618,
"step": 166,
"step_time": 5.802947281998968
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.003791360300965607,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003791360300965607,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1792.0,
"completions/max_terminated_length": 1792.0,
"completions/mean_length": 1664.03125,
"completions/mean_terminated_length": 1664.03125,
"completions/min_length": 1444.0,
"completions/min_terminated_length": 1444.0,
"entropy": 0.25958102196455,
"epoch": 0.00334,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5413814783096313,
"kl": 0.43835699930787086,
"learning_rate": 9.999968220500388e-06,
"loss": -0.0403,
"num_tokens": 6152886.0,
"reward": 2.1668155193328857,
"reward_std": 6.447092056274414,
"rewards/rollout_reward_func/mean": 2.1668155193328857,
"rewards/rollout_reward_func/std": 8.427111625671387,
"sampling/importance_sampling_ratio/max": 2.721165180206299,
"sampling/importance_sampling_ratio/mean": 1.0039149522781372,
"sampling/importance_sampling_ratio/min": 1.5273299985851807e-11,
"sampling/sampling_logp_difference/max": 25.23809814453125,
"sampling/sampling_logp_difference/mean": 0.11750782281160355,
"step": 167,
"step_time": 37.20988352099903
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.2585952654480934,
"epoch": 0.00336,
"grad_norm": 1.6427268981933594,
"kl": 0.41400426626205444,
"learning_rate": 9.99996773346604e-06,
"loss": -0.044,
"step": 168,
"step_time": 6.244365029000619
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1772.0,
"completions/max_terminated_length": 1772.0,
"completions/mean_length": 1701.78125,
"completions/mean_terminated_length": 1701.78125,
"completions/min_length": 1452.0,
"completions/min_terminated_length": 1452.0,
"entropy": 0.2237578984349966,
"epoch": 0.00338,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.704355239868164,
"kl": 0.408824922516942,
"learning_rate": 9.999967242728034e-06,
"loss": 0.018,
"num_tokens": 6227758.0,
"reward": 1.3272455930709839,
"reward_std": 8.560393333435059,
"rewards/rollout_reward_func/mean": 1.3272455930709839,
"rewards/rollout_reward_func/std": 9.793672561645508,
"sampling/importance_sampling_ratio/max": 2.430826187133789,
"sampling/importance_sampling_ratio/mean": 0.9597625136375427,
"sampling/importance_sampling_ratio/min": 2.0419729275522602e-12,
"sampling/sampling_logp_difference/max": 25.280433654785156,
"sampling/sampling_logp_difference/mean": 0.11089442670345306,
"step": 169,
"step_time": 37.15113865700005
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.013671875,
"entropy": 0.22276777774095535,
"epoch": 0.0034,
"grad_norm": 1.461653709411621,
"kl": 0.3720815582200885,
"learning_rate": 9.999966748286364e-06,
"loss": 0.0138,
"step": 170,
"step_time": 5.70916496099926
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1812.0,
"completions/max_terminated_length": 1812.0,
"completions/mean_length": 1649.53125,
"completions/mean_terminated_length": 1649.53125,
"completions/min_length": 1475.0,
"completions/min_terminated_length": 1475.0,
"entropy": 0.24526489153504372,
"epoch": 0.00342,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3005672693252563,
"kl": 0.2525577172636986,
"learning_rate": 9.999966250141033e-06,
"loss": -0.1557,
"num_tokens": 6300886.0,
"reward": -5.017649173736572,
"reward_std": 4.73761510848999,
"rewards/rollout_reward_func/mean": -5.017649173736572,
"rewards/rollout_reward_func/std": 11.356165885925293,
"sampling/importance_sampling_ratio/max": 2.0284202098846436,
"sampling/importance_sampling_ratio/mean": 0.9477963447570801,
"sampling/importance_sampling_ratio/min": 0.1324770450592041,
"sampling/sampling_logp_difference/max": 0.9475624561309814,
"sampling/sampling_logp_difference/mean": 0.05894453451037407,
"step": 171,
"step_time": 36.065134260999
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.24479248374700546,
"epoch": 0.00344,
"grad_norm": 1.2820854187011719,
"kl": 0.25359731540083885,
"learning_rate": 9.999965748292042e-06,
"loss": -0.1577,
"step": 172,
"step_time": 5.793963940999674
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1802.0,
"completions/max_terminated_length": 1802.0,
"completions/mean_length": 1693.0625,
"completions/mean_terminated_length": 1693.0625,
"completions/min_length": 907.0,
"completions/min_terminated_length": 907.0,
"entropy": 0.22402186505496502,
"epoch": 0.00346,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0302447080612183,
"kl": 0.6031353138387203,
"learning_rate": 9.999965242739394e-06,
"loss": -0.1091,
"num_tokens": 6375440.0,
"reward": -1.3529514074325562,
"reward_std": 5.233081340789795,
"rewards/rollout_reward_func/mean": -1.3529514074325562,
"rewards/rollout_reward_func/std": 8.062092781066895,
"sampling/importance_sampling_ratio/max": 2.0994455814361572,
"sampling/importance_sampling_ratio/mean": 0.7845278978347778,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.5858683586120605,
"sampling/sampling_logp_difference/mean": 0.07677553594112396,
"step": 173,
"step_time": 36.896430036998936
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.22430452704429626,
"epoch": 0.00348,
"grad_norm": 0.8624597191810608,
"kl": 0.5716133154928684,
"learning_rate": 9.999964733483082e-06,
"loss": -0.1117,
"step": 174,
"step_time": 5.81097331699857
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1783.0,
"completions/max_terminated_length": 1783.0,
"completions/mean_length": 1665.46875,
"completions/mean_terminated_length": 1665.46875,
"completions/min_length": 1372.0,
"completions/min_terminated_length": 1372.0,
"entropy": 0.22247346490621567,
"epoch": 0.0035,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.365124225616455,
"kl": 0.869195181876421,
"learning_rate": 9.999964220523113e-06,
"loss": -0.1818,
"num_tokens": 6449233.0,
"reward": 0.06757661700248718,
"reward_std": 4.011537551879883,
"rewards/rollout_reward_func/mean": 0.06757661700248718,
"rewards/rollout_reward_func/std": 8.588602066040039,
"sampling/importance_sampling_ratio/max": 1.9519811868667603,
"sampling/importance_sampling_ratio/mean": 0.8214517831802368,
"sampling/importance_sampling_ratio/min": 8.936198135633955e-13,
"sampling/sampling_logp_difference/max": 24.413681030273438,
"sampling/sampling_logp_difference/mean": 0.12217515707015991,
"step": 175,
"step_time": 35.031672232000346
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.22109077498316765,
"epoch": 0.00352,
"grad_norm": 1.0243955850601196,
"kl": 0.8096068780869246,
"learning_rate": 9.999963703859486e-06,
"loss": -0.1845,
"step": 176,
"step_time": 5.7405531860003975
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1797.0,
"completions/max_terminated_length": 1797.0,
"completions/mean_length": 1651.4375,
"completions/mean_terminated_length": 1651.4375,
"completions/min_length": 1455.0,
"completions/min_terminated_length": 1455.0,
"entropy": 0.2104360293596983,
"epoch": 0.00354,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.1368544101715088,
"kl": 0.4305075518786907,
"learning_rate": 9.999963183492201e-06,
"loss": -0.1685,
"num_tokens": 6522462.0,
"reward": -1.636040449142456,
"reward_std": 4.548352241516113,
"rewards/rollout_reward_func/mean": -1.636040449142456,
"rewards/rollout_reward_func/std": 9.328537940979004,
"sampling/importance_sampling_ratio/max": 2.3601131439208984,
"sampling/importance_sampling_ratio/mean": 1.1350016593933105,
"sampling/importance_sampling_ratio/min": 9.077130865529914e-13,
"sampling/sampling_logp_difference/max": 25.7470760345459,
"sampling/sampling_logp_difference/mean": 0.10910745710134506,
"step": 177,
"step_time": 35.85149571300008
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.20941109023988247,
"epoch": 0.00356,
"grad_norm": 1.1521154642105103,
"kl": 0.41772544756531715,
"learning_rate": 9.999962659421257e-06,
"loss": -0.1698,
"step": 178,
"step_time": 6.28170923900052
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1805.0,
"completions/max_terminated_length": 1805.0,
"completions/mean_length": 1698.21875,
"completions/mean_terminated_length": 1698.21875,
"completions/min_length": 1600.0,
"completions/min_terminated_length": 1600.0,
"entropy": 0.24070261418819427,
"epoch": 0.00358,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4422119855880737,
"kl": 0.45182526856660843,
"learning_rate": 9.999962131646657e-06,
"loss": -0.2072,
"num_tokens": 6597489.0,
"reward": 3.5879530906677246,
"reward_std": 5.621713638305664,
"rewards/rollout_reward_func/mean": 3.5879530906677246,
"rewards/rollout_reward_func/std": 9.696173667907715,
"sampling/importance_sampling_ratio/max": 2.181058406829834,
"sampling/importance_sampling_ratio/mean": 0.8959834575653076,
"sampling/importance_sampling_ratio/min": 0.14532947540283203,
"sampling/sampling_logp_difference/max": 1.420717716217041,
"sampling/sampling_logp_difference/mean": 0.06300906836986542,
"step": 179,
"step_time": 36.23942648999855
},
{
"clip_ratio/high_max": 0.006756756920367479,
"clip_ratio/high_mean": 0.0033783784601837397,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00728462846018374,
"entropy": 0.23607005551457405,
"epoch": 0.0036,
"grad_norm": 1.3579888343811035,
"kl": 0.5251178797334433,
"learning_rate": 9.999961600168402e-06,
"loss": -0.2103,
"step": 180,
"step_time": 5.82093875299779
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1816.0,
"completions/max_terminated_length": 1816.0,
"completions/mean_length": 1669.5,
"completions/mean_terminated_length": 1669.5,
"completions/min_length": 1570.0,
"completions/min_terminated_length": 1570.0,
"entropy": 0.18744167499244213,
"epoch": 0.00362,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0111624002456665,
"kl": 0.4290591198951006,
"learning_rate": 9.99996106498649e-06,
"loss": -0.1616,
"num_tokens": 6671509.0,
"reward": 2.143825054168701,
"reward_std": 4.177690029144287,
"rewards/rollout_reward_func/mean": 2.143825054168701,
"rewards/rollout_reward_func/std": 8.129756927490234,
"sampling/importance_sampling_ratio/max": 2.1915409564971924,
"sampling/importance_sampling_ratio/mean": 0.9412689805030823,
"sampling/importance_sampling_ratio/min": 0.09555165469646454,
"sampling/sampling_logp_difference/max": 1.42826509475708,
"sampling/sampling_logp_difference/mean": 0.05709037184715271,
"step": 181,
"step_time": 35.89454559800015
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.18788514845073223,
"epoch": 0.00364,
"grad_norm": 0.9392257332801819,
"kl": 0.4581598751246929,
"learning_rate": 9.999960526100922e-06,
"loss": -0.163,
"step": 182,
"step_time": 5.8134966330007956
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1730.0,
"completions/max_terminated_length": 1730.0,
"completions/mean_length": 1637.65625,
"completions/mean_terminated_length": 1637.65625,
"completions/min_length": 1298.0,
"completions/min_terminated_length": 1298.0,
"entropy": 0.22764514200389385,
"epoch": 0.00366,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.381371259689331,
"kl": 0.5989289321005344,
"learning_rate": 9.9999599835117e-06,
"loss": -0.0257,
"num_tokens": 6744431.0,
"reward": -4.3541483879089355,
"reward_std": 6.479227066040039,
"rewards/rollout_reward_func/mean": -4.3541483879089355,
"rewards/rollout_reward_func/std": 11.574047088623047,
"sampling/importance_sampling_ratio/max": 2.5868866443634033,
"sampling/importance_sampling_ratio/mean": 0.8098483085632324,
"sampling/importance_sampling_ratio/min": 1.047447632102072e-11,
"sampling/sampling_logp_difference/max": 24.79946517944336,
"sampling/sampling_logp_difference/mean": 0.10594826936721802,
"step": 183,
"step_time": 37.10624599499897
},
{
"clip_ratio/high_max": 0.015395220601931214,
"clip_ratio/high_mean": 0.007697610300965607,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007697610300965607,
"entropy": 0.22827458009123802,
"epoch": 0.00368,
"grad_norm": 0.8857015371322632,
"kl": 0.5525546111166477,
"learning_rate": 9.999959437218823e-06,
"loss": -0.0309,
"step": 184,
"step_time": 6.532324665999113
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1829.0,
"completions/max_terminated_length": 1829.0,
"completions/mean_length": 1658.28125,
"completions/mean_terminated_length": 1658.28125,
"completions/min_length": 1517.0,
"completions/min_terminated_length": 1517.0,
"entropy": 0.21369964070618153,
"epoch": 0.0037,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3246303796768188,
"kl": 0.47560007125139236,
"learning_rate": 9.999958887222293e-06,
"loss": -0.2377,
"num_tokens": 6817978.0,
"reward": 2.9859910011291504,
"reward_std": 4.5678887367248535,
"rewards/rollout_reward_func/mean": 2.9859910011291504,
"rewards/rollout_reward_func/std": 8.68184757232666,
"sampling/importance_sampling_ratio/max": 2.1020846366882324,
"sampling/importance_sampling_ratio/mean": 1.0168509483337402,
"sampling/importance_sampling_ratio/min": 0.16904951632022858,
"sampling/sampling_logp_difference/max": 1.180922269821167,
"sampling/sampling_logp_difference/mean": 0.05478543043136597,
"step": 185,
"step_time": 36.360830822000025
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013671875,
"entropy": 0.2150357235223055,
"epoch": 0.00372,
"grad_norm": 1.0169010162353516,
"kl": 0.4974839948117733,
"learning_rate": 9.999958333522109e-06,
"loss": -0.2427,
"step": 186,
"step_time": 5.833081005999702
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1804.0,
"completions/max_terminated_length": 1804.0,
"completions/mean_length": 1707.5625,
"completions/mean_terminated_length": 1707.5625,
"completions/min_length": 1588.0,
"completions/min_terminated_length": 1588.0,
"entropy": 0.22898070700466633,
"epoch": 0.00374,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.298535943031311,
"kl": 0.5328315943479538,
"learning_rate": 9.999957776118273e-06,
"loss": -0.087,
"num_tokens": 6893533.0,
"reward": -0.973763108253479,
"reward_std": 5.657554626464844,
"rewards/rollout_reward_func/mean": -0.973763108253479,
"rewards/rollout_reward_func/std": 13.231979370117188,
"sampling/importance_sampling_ratio/max": 2.4051334857940674,
"sampling/importance_sampling_ratio/mean": 1.0956934690475464,
"sampling/importance_sampling_ratio/min": 0.1452791541814804,
"sampling/sampling_logp_difference/max": 1.2502107620239258,
"sampling/sampling_logp_difference/mean": 0.05563567206263542,
"step": 187,
"step_time": 36.085149069999716
},
{
"clip_ratio/high_max": 0.0234375,
"clip_ratio/high_mean": 0.01171875,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01953125,
"entropy": 0.23145854473114014,
"epoch": 0.00376,
"grad_norm": 1.0598479509353638,
"kl": 0.5177410617470741,
"learning_rate": 9.999957215010786e-06,
"loss": -0.0919,
"step": 188,
"step_time": 5.831472408998707
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1764.0,
"completions/max_terminated_length": 1764.0,
"completions/mean_length": 1684.125,
"completions/mean_terminated_length": 1684.125,
"completions/min_length": 1553.0,
"completions/min_terminated_length": 1553.0,
"entropy": 0.20638220198452473,
"epoch": 0.00378,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.619344711303711,
"kl": 0.3770788684487343,
"learning_rate": 9.999956650199647e-06,
"loss": -0.1053,
"num_tokens": 6968120.0,
"reward": 4.376737594604492,
"reward_std": 4.202037811279297,
"rewards/rollout_reward_func/mean": 4.376737594604492,
"rewards/rollout_reward_func/std": 7.641676425933838,
"sampling/importance_sampling_ratio/max": 2.5485401153564453,
"sampling/importance_sampling_ratio/mean": 1.0775609016418457,
"sampling/importance_sampling_ratio/min": 0.35982266068458557,
"sampling/sampling_logp_difference/max": 0.8528366088867188,
"sampling/sampling_logp_difference/mean": 0.04990018904209137,
"step": 189,
"step_time": 37.8296041270014
},
{
"clip_ratio/high_max": 0.01953125,
"clip_ratio/high_mean": 0.009765625,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013671875,
"entropy": 0.20882763899862766,
"epoch": 0.0038,
"grad_norm": 1.2702538967132568,
"kl": 0.39946580305695534,
"learning_rate": 9.999956081684854e-06,
"loss": -0.1089,
"step": 190,
"step_time": 6.217620228997475
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0020833334419876337,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004036458441987634,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1785.0,
"completions/max_terminated_length": 1785.0,
"completions/mean_length": 1665.28125,
"completions/mean_terminated_length": 1665.28125,
"completions/min_length": 1322.0,
"completions/min_terminated_length": 1322.0,
"entropy": 0.22902198880910873,
"epoch": 0.00382,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5835474729537964,
"kl": 0.5039874613285065,
"learning_rate": 9.999955509466414e-06,
"loss": -0.1075,
"num_tokens": 7042262.0,
"reward": 2.3839476108551025,
"reward_std": 4.715667724609375,
"rewards/rollout_reward_func/mean": 2.3839476108551025,
"rewards/rollout_reward_func/std": 9.231707572937012,
"sampling/importance_sampling_ratio/max": 2.368562936782837,
"sampling/importance_sampling_ratio/mean": 0.8959805965423584,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.2122876644134521,
"sampling/sampling_logp_difference/mean": 0.06825359165668488,
"step": 191,
"step_time": 36.59035418899839
},
{
"clip_ratio/high_max": 0.020052083767950535,
"clip_ratio/high_mean": 0.010026041883975267,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.017838541883975267,
"entropy": 0.22917617112398148,
"epoch": 0.00384,
"grad_norm": 1.1909282207489014,
"kl": 0.5397434663027525,
"learning_rate": 9.999954933544324e-06,
"loss": -0.1104,
"step": 192,
"step_time": 5.772955709000598
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1807.0,
"completions/max_terminated_length": 1807.0,
"completions/mean_length": 1626.8125,
"completions/mean_terminated_length": 1626.8125,
"completions/min_length": 841.0,
"completions/min_terminated_length": 841.0,
"entropy": 0.22650794871151447,
"epoch": 0.00386,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1048918962478638,
"kl": 0.7586736418306828,
"learning_rate": 9.999954353918583e-06,
"loss": -0.16,
"num_tokens": 7115196.0,
"reward": 6.401093482971191,
"reward_std": 8.368595123291016,
"rewards/rollout_reward_func/mean": 6.401093482971191,
"rewards/rollout_reward_func/std": 10.889983177185059,
"sampling/importance_sampling_ratio/max": 2.1145029067993164,
"sampling/importance_sampling_ratio/mean": 0.8738132119178772,
"sampling/importance_sampling_ratio/min": 0.07303983718156815,
"sampling/sampling_logp_difference/max": 1.4383344650268555,
"sampling/sampling_logp_difference/mean": 0.06884820759296417,
"step": 193,
"step_time": 35.971025829999235
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.01953125,
"entropy": 0.22370813973248005,
"epoch": 0.00388,
"grad_norm": 0.8128796219825745,
"kl": 0.7968976274132729,
"learning_rate": 9.999953770589195e-06,
"loss": -0.1637,
"step": 194,
"step_time": 5.7590284980014985
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1784.0,
"completions/max_terminated_length": 1784.0,
"completions/mean_length": 1585.0625,
"completions/mean_terminated_length": 1585.0625,
"completions/min_length": 780.0,
"completions/min_terminated_length": 780.0,
"entropy": 0.1900235153734684,
"epoch": 0.0039,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.860226571559906,
"kl": 0.3734680116176605,
"learning_rate": 9.999953183556157e-06,
"loss": -0.0902,
"num_tokens": 7186577.0,
"reward": 3.927793264389038,
"reward_std": 5.897523880004883,
"rewards/rollout_reward_func/mean": 3.927793264389038,
"rewards/rollout_reward_func/std": 13.196045875549316,
"sampling/importance_sampling_ratio/max": 2.547750949859619,
"sampling/importance_sampling_ratio/mean": 0.9823294878005981,
"sampling/importance_sampling_ratio/min": 0.09533966332674026,
"sampling/sampling_logp_difference/max": 1.204984426498413,
"sampling/sampling_logp_difference/mean": 0.047755300998687744,
"step": 195,
"step_time": 35.797688914998616
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"entropy": 0.18859419785439968,
"epoch": 0.00392,
"grad_norm": 0.834666907787323,
"kl": 0.3848831467330456,
"learning_rate": 9.999952592819472e-06,
"loss": -0.0905,
"step": 196,
"step_time": 5.723597453000366
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1802.0,
"completions/max_terminated_length": 1802.0,
"completions/mean_length": 1687.1875,
"completions/mean_terminated_length": 1687.1875,
"completions/min_length": 1557.0,
"completions/min_terminated_length": 1557.0,
"entropy": 0.217379130423069,
"epoch": 0.00394,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.430611252784729,
"kl": 0.6379734985530376,
"learning_rate": 9.999951998379141e-06,
"loss": -0.123,
"num_tokens": 7260812.0,
"reward": 2.1545767784118652,
"reward_std": 5.987642765045166,
"rewards/rollout_reward_func/mean": 2.1545767784118652,
"rewards/rollout_reward_func/std": 7.767053127288818,
"sampling/importance_sampling_ratio/max": 2.526907444000244,
"sampling/importance_sampling_ratio/mean": 0.8589324951171875,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.872528076171875,
"sampling/sampling_logp_difference/mean": 0.06381456553936005,
"step": 197,
"step_time": 35.551387553001405
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.21903221122920513,
"epoch": 0.00396,
"grad_norm": 1.5187474489212036,
"kl": 0.6173972077667713,
"learning_rate": 9.999951400235163e-06,
"loss": -0.1276,
"step": 198,
"step_time": 5.812083100000564
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1799.0,
"completions/max_terminated_length": 1799.0,
"completions/mean_length": 1667.71875,
"completions/mean_terminated_length": 1667.71875,
"completions/min_length": 769.0,
"completions/min_terminated_length": 769.0,
"entropy": 0.20807519741356373,
"epoch": 0.00398,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3347526788711548,
"kl": 0.47471469454467297,
"learning_rate": 9.999950798387541e-06,
"loss": 0.0517,
"num_tokens": 7334918.0,
"reward": -3.7062830924987793,
"reward_std": 9.045034408569336,
"rewards/rollout_reward_func/mean": -3.7062830924987793,
"rewards/rollout_reward_func/std": 11.927645683288574,
"sampling/importance_sampling_ratio/max": 1.9491627216339111,
"sampling/importance_sampling_ratio/mean": 0.930275559425354,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.7468228340148926,
"sampling/sampling_logp_difference/mean": 0.055586282163858414,
"step": 199,
"step_time": 35.96847454199906
},
{
"clip_ratio/high_max": 0.0234375,
"clip_ratio/high_mean": 0.013671875,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"entropy": 0.2134502585977316,
"epoch": 0.004,
"grad_norm": 1.0929890871047974,
"kl": 0.43012629821896553,
"learning_rate": 9.999950192836272e-06,
"loss": 0.0473,
"step": 200,
"step_time": 5.775383055001839
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1768.0,
"completions/max_terminated_length": 1768.0,
"completions/mean_length": 1598.46875,
"completions/mean_terminated_length": 1598.46875,
"completions/min_length": 1034.0,
"completions/min_terminated_length": 1034.0,
"entropy": 0.22695614211261272,
"epoch": 0.00402,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.23384690284729,
"kl": 0.5254122829064727,
"learning_rate": 9.999949583581358e-06,
"loss": -0.0825,
"num_tokens": 7406851.0,
"reward": 7.607372283935547,
"reward_std": 9.699461936950684,
"rewards/rollout_reward_func/mean": 7.607372283935547,
"rewards/rollout_reward_func/std": 17.210861206054688,
"sampling/importance_sampling_ratio/max": 1.7273722887039185,
"sampling/importance_sampling_ratio/mean": 0.9000965356826782,
"sampling/importance_sampling_ratio/min": 0.3363904654979706,
"sampling/sampling_logp_difference/max": 1.1262538433074951,
"sampling/sampling_logp_difference/mean": 0.05383963882923126,
"step": 201,
"step_time": 35.05857214800017
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.23254447057843208,
"epoch": 0.00404,
"grad_norm": 1.067671298980713,
"kl": 0.5273672332987189,
"learning_rate": 9.999948970622801e-06,
"loss": -0.0877,
"step": 202,
"step_time": 5.642065598000045
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1830.0,
"completions/max_terminated_length": 1830.0,
"completions/mean_length": 1694.34375,
"completions/mean_terminated_length": 1694.34375,
"completions/min_length": 1444.0,
"completions/min_terminated_length": 1444.0,
"entropy": 0.2840390596538782,
"epoch": 0.00406,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5355794429779053,
"kl": 0.39024360105395317,
"learning_rate": 9.9999483539606e-06,
"loss": -0.0927,
"num_tokens": 7481438.0,
"reward": 0.7141335010528564,
"reward_std": 3.9084863662719727,
"rewards/rollout_reward_func/mean": 0.7141335010528564,
"rewards/rollout_reward_func/std": 9.917291641235352,
"sampling/importance_sampling_ratio/max": 2.4801061153411865,
"sampling/importance_sampling_ratio/mean": 1.0379252433776855,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 28.232906341552734,
"sampling/sampling_logp_difference/mean": 0.10223247855901718,
"step": 203,
"step_time": 35.488045403002616
},
{
"clip_ratio/high_max": 0.006623641354963183,
"clip_ratio/high_mean": 0.0033118206774815917,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005264945677481592,
"entropy": 0.28290311247110367,
"epoch": 0.00408,
"grad_norm": 1.391993522644043,
"kl": 0.37290242314338684,
"learning_rate": 9.999947733594757e-06,
"loss": -0.0955,
"step": 204,
"step_time": 5.823711978000574
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1780.0,
"completions/max_terminated_length": 1780.0,
"completions/mean_length": 1677.25,
"completions/mean_terminated_length": 1677.25,
"completions/min_length": 1565.0,
"completions/min_terminated_length": 1565.0,
"entropy": 0.25766652449965477,
"epoch": 0.0041,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6373906135559082,
"kl": 0.7895541861653328,
"learning_rate": 9.99994710952527e-06,
"loss": -0.2043,
"num_tokens": 7555620.0,
"reward": 0.5474326610565186,
"reward_std": 5.27195930480957,
"rewards/rollout_reward_func/mean": 0.5474326610565186,
"rewards/rollout_reward_func/std": 9.263967514038086,
"sampling/importance_sampling_ratio/max": 2.6494393348693848,
"sampling/importance_sampling_ratio/mean": 0.9114236831665039,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.2698535919189453,
"sampling/sampling_logp_difference/mean": 0.07712831348180771,
"step": 205,
"step_time": 37.12327094200191
},
{
"clip_ratio/high_max": 0.01953125,
"clip_ratio/high_mean": 0.009765625,
"clip_ratio/low_mean": 0.009765625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01953125,
"entropy": 0.2550558466464281,
"epoch": 0.00412,
"grad_norm": 1.3040579557418823,
"kl": 0.8384794592857361,
"learning_rate": 9.999946481752143e-06,
"loss": -0.208,
"step": 206,
"step_time": 6.206274641001073
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1770.0,
"completions/max_terminated_length": 1770.0,
"completions/mean_length": 1649.0625,
"completions/mean_terminated_length": 1649.0625,
"completions/min_length": 1216.0,
"completions/min_terminated_length": 1216.0,
"entropy": 0.2342894095927477,
"epoch": 0.00414,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9534094929695129,
"kl": 0.5438143275678158,
"learning_rate": 9.999945850275376e-06,
"loss": -0.1419,
"num_tokens": 7628698.0,
"reward": 4.569943428039551,
"reward_std": 5.696385383605957,
"rewards/rollout_reward_func/mean": 4.569943428039551,
"rewards/rollout_reward_func/std": 11.701920509338379,
"sampling/importance_sampling_ratio/max": 2.2198283672332764,
"sampling/importance_sampling_ratio/mean": 0.913837194442749,
"sampling/importance_sampling_ratio/min": 0.21426241099834442,
"sampling/sampling_logp_difference/max": 0.973109245300293,
"sampling/sampling_logp_difference/mean": 0.05454317480325699,
"step": 207,
"step_time": 34.45527249300176
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.2328789085149765,
"epoch": 0.00416,
"grad_norm": 0.9537069797515869,
"kl": 0.5390445850789547,
"learning_rate": 9.999945215094968e-06,
"loss": -0.1447,
"step": 208,
"step_time": 5.694117715996981
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1822.0,
"completions/max_terminated_length": 1822.0,
"completions/mean_length": 1689.59375,
"completions/mean_terminated_length": 1689.59375,
"completions/min_length": 1565.0,
"completions/min_terminated_length": 1565.0,
"entropy": 0.23806917294859886,
"epoch": 0.00418,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0990620851516724,
"kl": 0.441210076212883,
"learning_rate": 9.99994457621092e-06,
"loss": -0.1143,
"num_tokens": 7703515.0,
"reward": -0.2485913634300232,
"reward_std": 7.676312446594238,
"rewards/rollout_reward_func/mean": -0.2485913634300232,
"rewards/rollout_reward_func/std": 10.19659423828125,
"sampling/importance_sampling_ratio/max": 2.198415994644165,
"sampling/importance_sampling_ratio/mean": 0.9838775396347046,
"sampling/importance_sampling_ratio/min": 0.2651430070400238,
"sampling/sampling_logp_difference/max": 1.1037625074386597,
"sampling/sampling_logp_difference/mean": 0.05238356068730354,
"step": 209,
"step_time": 35.312499495998054
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.23526065610349178,
"epoch": 0.0042,
"grad_norm": 1.1086832284927368,
"kl": 0.4285181984305382,
"learning_rate": 9.999943933623233e-06,
"loss": -0.119,
"step": 210,
"step_time": 5.8305381829995895
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1811.0,
"completions/max_terminated_length": 1811.0,
"completions/mean_length": 1694.34375,
"completions/mean_terminated_length": 1694.34375,
"completions/min_length": 1365.0,
"completions/min_terminated_length": 1365.0,
"entropy": 0.2092603985220194,
"epoch": 0.00422,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.210951328277588,
"kl": 0.7087297588586807,
"learning_rate": 9.999943287331909e-06,
"loss": -0.1151,
"num_tokens": 7778540.0,
"reward": -0.5601588487625122,
"reward_std": 4.062717437744141,
"rewards/rollout_reward_func/mean": -0.5601588487625122,
"rewards/rollout_reward_func/std": 7.223782539367676,
"sampling/importance_sampling_ratio/max": 2.789172410964966,
"sampling/importance_sampling_ratio/mean": 0.9125500917434692,
"sampling/importance_sampling_ratio/min": 1.555470856284824e-11,
"sampling/sampling_logp_difference/max": 23.064640045166016,
"sampling/sampling_logp_difference/mean": 0.10738179087638855,
"step": 211,
"step_time": 36.472802145999594
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.0036764706019312143,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009535845601931214,
"entropy": 0.2104271575808525,
"epoch": 0.00424,
"grad_norm": 1.276593804359436,
"kl": 0.6613003388047218,
"learning_rate": 9.999942637336943e-06,
"loss": -0.1181,
"step": 212,
"step_time": 6.7614001400033885
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1787.0,
"completions/max_terminated_length": 1787.0,
"completions/mean_length": 1655.40625,
"completions/mean_terminated_length": 1655.40625,
"completions/min_length": 1447.0,
"completions/min_terminated_length": 1447.0,
"entropy": 0.255456043407321,
"epoch": 0.00426,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.1123297214508057,
"kl": 0.6608983650803566,
"learning_rate": 9.999941983638343e-06,
"loss": 0.0397,
"num_tokens": 7852267.0,
"reward": -2.758002996444702,
"reward_std": 6.682594299316406,
"rewards/rollout_reward_func/mean": -2.758002996444702,
"rewards/rollout_reward_func/std": 14.27979850769043,
"sampling/importance_sampling_ratio/max": 2.8922348022460938,
"sampling/importance_sampling_ratio/mean": 1.0855534076690674,
"sampling/importance_sampling_ratio/min": 0.13145792484283447,
"sampling/sampling_logp_difference/max": 1.5352246761322021,
"sampling/sampling_logp_difference/mean": 0.06498946994543076,
"step": 213,
"step_time": 35.69279204999839
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.009706439450383186,
"clip_ratio/low_min": 0.0037878789007663727,
"clip_ratio/region_mean": 0.015565814450383186,
"entropy": 0.2532905638217926,
"epoch": 0.00428,
"grad_norm": 1.6830919981002808,
"kl": 0.6681476458907127,
"learning_rate": 9.999941326236106e-06,
"loss": 0.0344,
"step": 214,
"step_time": 5.748447021000175
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0038470644503831863,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005800189450383186,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1804.0,
"completions/max_terminated_length": 1804.0,
"completions/mean_length": 1685.09375,
"completions/mean_terminated_length": 1685.09375,
"completions/min_length": 1064.0,
"completions/min_terminated_length": 1064.0,
"entropy": 0.304389376193285,
"epoch": 0.0043,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0502004623413086,
"kl": 0.5163263864815235,
"learning_rate": 9.999940665130233e-06,
"loss": -0.2013,
"num_tokens": 7926984.0,
"reward": 0.5382822751998901,
"reward_std": 5.5314249992370605,
"rewards/rollout_reward_func/mean": 0.5382822751998901,
"rewards/rollout_reward_func/std": 8.343429565429688,
"sampling/importance_sampling_ratio/max": 2.3640105724334717,
"sampling/importance_sampling_ratio/mean": 1.0280786752700806,
"sampling/importance_sampling_ratio/min": 2.17974208311227e-20,
"sampling/sampling_logp_difference/max": 17.786712646484375,
"sampling/sampling_logp_difference/mean": 0.15175247192382812,
"step": 215,
"step_time": 36.67021942499741
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.30115350522100925,
"epoch": 0.00432,
"grad_norm": 0.9675745964050293,
"kl": 0.5201160311698914,
"learning_rate": 9.999940000320726e-06,
"loss": -0.2036,
"step": 216,
"step_time": 5.841338779997386
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1782.0,
"completions/max_terminated_length": 1782.0,
"completions/mean_length": 1673.5625,
"completions/mean_terminated_length": 1673.5625,
"completions/min_length": 1486.0,
"completions/min_terminated_length": 1486.0,
"entropy": 0.21431122720241547,
"epoch": 0.00434,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.535396933555603,
"kl": 0.516041487455368,
"learning_rate": 9.999939331807582e-06,
"loss": -0.0649,
"num_tokens": 8001770.0,
"reward": 4.304098129272461,
"reward_std": 6.8736419677734375,
"rewards/rollout_reward_func/mean": 4.304098129272461,
"rewards/rollout_reward_func/std": 15.587159156799316,
"sampling/importance_sampling_ratio/max": 2.9547388553619385,
"sampling/importance_sampling_ratio/mean": 1.0765466690063477,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.359184980392456,
"sampling/sampling_logp_difference/mean": 0.05725931376218796,
"step": 217,
"step_time": 35.47372649599674
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.2103865798562765,
"epoch": 0.00436,
"grad_norm": 1.193834662437439,
"kl": 0.5380803644657135,
"learning_rate": 9.999938659590807e-06,
"loss": -0.0676,
"step": 218,
"step_time": 6.670932665001601
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1786.0,
"completions/max_terminated_length": 1786.0,
"completions/mean_length": 1694.21875,
"completions/mean_terminated_length": 1694.21875,
"completions/min_length": 1544.0,
"completions/min_terminated_length": 1544.0,
"entropy": 0.1990877389907837,
"epoch": 0.00438,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2838199138641357,
"kl": 1.5804159492254257,
"learning_rate": 9.999937983670399e-06,
"loss": -0.1898,
"num_tokens": 8076281.0,
"reward": 4.216753005981445,
"reward_std": 4.654492378234863,
"rewards/rollout_reward_func/mean": 4.216753005981445,
"rewards/rollout_reward_func/std": 7.823268413543701,
"sampling/importance_sampling_ratio/max": 2.515824556350708,
"sampling/importance_sampling_ratio/mean": 0.7889621257781982,
"sampling/importance_sampling_ratio/min": 0.07681597769260406,
"sampling/sampling_logp_difference/max": 2.2358238697052,
"sampling/sampling_logp_difference/mean": 0.08003745228052139,
"step": 219,
"step_time": 37.58560269500049
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.19579317048192024,
"epoch": 0.0044,
"grad_norm": 0.9996125102043152,
"kl": 1.5866832248866558,
"learning_rate": 9.999937304046356e-06,
"loss": -0.1923,
"step": 220,
"step_time": 5.752707515996008
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1790.0,
"completions/max_terminated_length": 1790.0,
"completions/mean_length": 1667.9375,
"completions/mean_terminated_length": 1667.9375,
"completions/min_length": 1480.0,
"completions/min_terminated_length": 1480.0,
"entropy": 0.22602487355470657,
"epoch": 0.00442,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.9601463675498962,
"kl": 0.6449089199304581,
"learning_rate": 9.99993662071868e-06,
"loss": -0.1863,
"num_tokens": 8150161.0,
"reward": -3.656409740447998,
"reward_std": 5.3043951988220215,
"rewards/rollout_reward_func/mean": -3.656409740447998,
"rewards/rollout_reward_func/std": 11.171483993530273,
"sampling/importance_sampling_ratio/max": 2.952061653137207,
"sampling/importance_sampling_ratio/mean": 0.7157564163208008,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 25.000808715820312,
"sampling/sampling_logp_difference/mean": 0.11645625531673431,
"step": 221,
"step_time": 37.025948885000616
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.021484375,
"entropy": 0.22284850105643272,
"epoch": 0.00444,
"grad_norm": 0.906544029712677,
"kl": 0.6455521509051323,
"learning_rate": 9.999935933687375e-06,
"loss": -0.1901,
"step": 222,
"step_time": 5.777160473999174
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1813.0,
"completions/max_terminated_length": 1813.0,
"completions/mean_length": 1680.875,
"completions/mean_terminated_length": 1680.875,
"completions/min_length": 1399.0,
"completions/min_terminated_length": 1399.0,
"entropy": 0.20937101170420647,
"epoch": 0.00446,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.123390555381775,
"kl": 0.7408150210976601,
"learning_rate": 9.99993524295244e-06,
"loss": -0.1761,
"num_tokens": 8224415.0,
"reward": -1.7068707942962646,
"reward_std": 6.148789882659912,
"rewards/rollout_reward_func/mean": -1.7068707942962646,
"rewards/rollout_reward_func/std": 11.295839309692383,
"sampling/importance_sampling_ratio/max": 2.4562361240386963,
"sampling/importance_sampling_ratio/mean": 0.7759705781936646,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.6416139602661133,
"sampling/sampling_logp_difference/mean": 0.07305672764778137,
"step": 223,
"step_time": 36.99016052300249
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.009765625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.2060265801846981,
"epoch": 0.00448,
"grad_norm": 1.0533183813095093,
"kl": 0.7319801151752472,
"learning_rate": 9.999934548513875e-06,
"loss": -0.1797,
"step": 224,
"step_time": 5.814677470998504
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0026041667442768812,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004557291744276881,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1785.0,
"completions/max_terminated_length": 1785.0,
"completions/mean_length": 1656.375,
"completions/mean_terminated_length": 1656.375,
"completions/min_length": 697.0,
"completions/min_terminated_length": 697.0,
"entropy": 0.18023328483104706,
"epoch": 0.0045,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.3207415342330933,
"kl": 0.9109714720398188,
"learning_rate": 9.999933850371681e-06,
"loss": -0.2514,
"num_tokens": 8297842.0,
"reward": 0.6653532981872559,
"reward_std": 4.2065510749816895,
"rewards/rollout_reward_func/mean": 0.6653532981872559,
"rewards/rollout_reward_func/std": 7.791869163513184,
"sampling/importance_sampling_ratio/max": 2.713641405105591,
"sampling/importance_sampling_ratio/mean": 0.8001816868782043,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.8015203475952148,
"sampling/sampling_logp_difference/mean": 0.07049349695444107,
"step": 225,
"step_time": 35.520827905998885
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.004557291744276881,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006510416744276881,
"entropy": 0.1753668338060379,
"epoch": 0.00452,
"grad_norm": 1.195987343788147,
"kl": 0.9277527779340744,
"learning_rate": 9.999933148525858e-06,
"loss": -0.2553,
"step": 226,
"step_time": 5.7582581370006665
},
{
"clip_ratio/high_max": 0.0036764706019312143,
"clip_ratio/high_mean": 0.0018382353009656072,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0037913601845502853,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1834.0,
"completions/max_terminated_length": 1834.0,
"completions/mean_length": 1666.375,
"completions/mean_terminated_length": 1666.375,
"completions/min_length": 1242.0,
"completions/min_terminated_length": 1242.0,
"entropy": 0.20567630976438522,
"epoch": 0.00454,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.6567699909210205,
"kl": 0.8131851889193058,
"learning_rate": 9.999932442976408e-06,
"loss": 0.2988,
"num_tokens": 8371968.0,
"reward": 1.6306254863739014,
"reward_std": 4.1761884689331055,
"rewards/rollout_reward_func/mean": 1.6306254863739014,
"rewards/rollout_reward_func/std": 13.871889114379883,
"sampling/importance_sampling_ratio/max": 2.815732717514038,
"sampling/importance_sampling_ratio/mean": 0.9413543939590454,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 26.72653579711914,
"sampling/sampling_logp_difference/mean": 0.11560939252376556,
"step": 227,
"step_time": 35.89721404099873
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.20439518056809902,
"epoch": 0.00456,
"grad_norm": 1.5760877132415771,
"kl": 0.8568692058324814,
"learning_rate": 9.99993173372333e-06,
"loss": 0.2944,
"step": 228,
"step_time": 5.827364698001475
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1787.0,
"completions/max_terminated_length": 1787.0,
"completions/mean_length": 1671.65625,
"completions/mean_terminated_length": 1671.65625,
"completions/min_length": 1521.0,
"completions/min_terminated_length": 1521.0,
"entropy": 0.22973632253706455,
"epoch": 0.00458,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1046069860458374,
"kl": 1.1489849016070366,
"learning_rate": 9.999931020766626e-06,
"loss": -0.3328,
"num_tokens": 8446037.0,
"reward": 3.500030279159546,
"reward_std": 7.045429229736328,
"rewards/rollout_reward_func/mean": 3.500030279159546,
"rewards/rollout_reward_func/std": 8.772632598876953,
"sampling/importance_sampling_ratio/max": 2.272545099258423,
"sampling/importance_sampling_ratio/mean": 0.9003783464431763,
"sampling/importance_sampling_ratio/min": 7.296939094625365e-12,
"sampling/sampling_logp_difference/max": 24.617910385131836,
"sampling/sampling_logp_difference/mean": 0.12970557808876038,
"step": 229,
"step_time": 37.32937073400353
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.009650735184550285,
"clip_ratio/low_mean": 0.009765625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.019416360184550285,
"entropy": 0.22551355883479118,
"epoch": 0.0046,
"grad_norm": 1.1626572608947754,
"kl": 1.2178469970822334,
"learning_rate": 9.999930304106296e-06,
"loss": -0.3359,
"step": 230,
"step_time": 5.758782369999608
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1806.0,
"completions/max_terminated_length": 1806.0,
"completions/mean_length": 1643.5,
"completions/mean_terminated_length": 1643.5,
"completions/min_length": 1231.0,
"completions/min_terminated_length": 1231.0,
"entropy": 0.18366078101098537,
"epoch": 0.00462,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5905568599700928,
"kl": 0.48303202725946903,
"learning_rate": 9.99992958374234e-06,
"loss": -0.0905,
"num_tokens": 8519605.0,
"reward": 6.464164733886719,
"reward_std": 3.211686372756958,
"rewards/rollout_reward_func/mean": 6.464164733886719,
"rewards/rollout_reward_func/std": 16.602075576782227,
"sampling/importance_sampling_ratio/max": 2.2648463249206543,
"sampling/importance_sampling_ratio/mean": 0.9129816293716431,
"sampling/importance_sampling_ratio/min": 1.7741487651274626e-18,
"sampling/sampling_logp_difference/max": 25.275915145874023,
"sampling/sampling_logp_difference/mean": 0.17626813054084778,
"step": 231,
"step_time": 35.11706399999821
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.003791360300965607,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007697610300965607,
"entropy": 0.1828291267156601,
"epoch": 0.00464,
"grad_norm": 1.4188190698623657,
"kl": 0.4656708240509033,
"learning_rate": 9.999928859674762e-06,
"loss": -0.0945,
"step": 232,
"step_time": 5.8302720290066645
},
{
"clip_ratio/high_max": 0.005681818351149559,
"clip_ratio/high_mean": 0.0028409091755747795,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0028409091755747795,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1836.0,
"completions/max_terminated_length": 1836.0,
"completions/mean_length": 1539.03125,
"completions/mean_terminated_length": 1539.03125,
"completions/min_length": 372.0,
"completions/min_terminated_length": 372.0,
"entropy": 0.20367534644901752,
"epoch": 0.00466,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.0687413215637207,
"kl": 1.4640729799866676,
"learning_rate": 9.999928131903557e-06,
"loss": 0.0491,
"num_tokens": 8589842.0,
"reward": 7.841023921966553,
"reward_std": 7.656625747680664,
"rewards/rollout_reward_func/mean": 7.841023921966553,
"rewards/rollout_reward_func/std": 13.105290412902832,
"sampling/importance_sampling_ratio/max": 2.7232446670532227,
"sampling/importance_sampling_ratio/mean": 0.7804238796234131,
"sampling/importance_sampling_ratio/min": 0.13633084297180176,
"sampling/sampling_logp_difference/max": 2.269442558288574,
"sampling/sampling_logp_difference/mean": 0.06575377285480499,
"step": 233,
"step_time": 33.579131972999676
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.0026041667442768812,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008463541744276881,
"entropy": 0.20685471966862679,
"epoch": 0.00468,
"grad_norm": 2.082902193069458,
"kl": 1.0169252753257751,
"learning_rate": 9.999927400428733e-06,
"loss": 0.0434,
"step": 234,
"step_time": 5.8269990400003735
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.009765625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1803.0,
"completions/max_terminated_length": 1803.0,
"completions/mean_length": 1706.3125,
"completions/mean_terminated_length": 1706.3125,
"completions/min_length": 1573.0,
"completions/min_terminated_length": 1573.0,
"entropy": 0.1788796652108431,
"epoch": 0.0047,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1087870597839355,
"kl": 0.5621245931833982,
"learning_rate": 9.999926665250287e-06,
"loss": -0.0253,
"num_tokens": 8664926.0,
"reward": 2.5220413208007812,
"reward_std": 3.3794925212860107,
"rewards/rollout_reward_func/mean": 2.5220413208007812,
"rewards/rollout_reward_func/std": 5.743170261383057,
"sampling/importance_sampling_ratio/max": 2.7984251976013184,
"sampling/importance_sampling_ratio/mean": 0.8231557607650757,
"sampling/importance_sampling_ratio/min": 0.2042286992073059,
"sampling/sampling_logp_difference/max": 1.3143759965896606,
"sampling/sampling_logp_difference/mean": 0.058986593037843704,
"step": 235,
"step_time": 35.771082822997414
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.009765625,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"entropy": 0.18589365482330322,
"epoch": 0.00472,
"grad_norm": 1.253989338874817,
"kl": 0.5263552982360125,
"learning_rate": 9.999925926368217e-06,
"loss": -0.028,
"step": 236,
"step_time": 5.811419122999723
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1790.0,
"completions/max_terminated_length": 1790.0,
"completions/mean_length": 1694.59375,
"completions/mean_terminated_length": 1694.59375,
"completions/min_length": 1606.0,
"completions/min_terminated_length": 1606.0,
"entropy": 0.17474745213985443,
"epoch": 0.00474,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3517323732376099,
"kl": 0.8328725062310696,
"learning_rate": 9.999925183782528e-06,
"loss": -0.1822,
"num_tokens": 8739844.0,
"reward": 0.9312727451324463,
"reward_std": 4.400314807891846,
"rewards/rollout_reward_func/mean": 0.9312727451324463,
"rewards/rollout_reward_func/std": 12.9136381149292,
"sampling/importance_sampling_ratio/max": 1.6590906381607056,
"sampling/importance_sampling_ratio/mean": 0.8407135605812073,
"sampling/importance_sampling_ratio/min": 0.12429223209619522,
"sampling/sampling_logp_difference/max": 1.6658029556274414,
"sampling/sampling_logp_difference/mean": 0.05644724518060684,
"step": 237,
"step_time": 36.696694154996294
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.01171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"entropy": 0.1794893555343151,
"epoch": 0.00476,
"grad_norm": 1.0692886114120483,
"kl": 0.8188136555254459,
"learning_rate": 9.99992443749322e-06,
"loss": -0.184,
"step": 238,
"step_time": 5.769766430996242
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1819.0,
"completions/max_terminated_length": 1819.0,
"completions/mean_length": 1614.9375,
"completions/mean_terminated_length": 1614.9375,
"completions/min_length": 806.0,
"completions/min_terminated_length": 806.0,
"entropy": 0.18347695656120777,
"epoch": 0.00478,
"frac_reward_zero_std": 0.125,
"grad_norm": 2.0552101135253906,
"kl": 0.3097874727100134,
"learning_rate": 9.99992368750029e-06,
"loss": 0.0694,
"num_tokens": 8812269.0,
"reward": 8.014856338500977,
"reward_std": 4.783290863037109,
"rewards/rollout_reward_func/mean": 8.014856338500977,
"rewards/rollout_reward_func/std": 12.856328964233398,
"sampling/importance_sampling_ratio/max": 2.936600923538208,
"sampling/importance_sampling_ratio/mean": 1.0911171436309814,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.1101388931274414,
"sampling/sampling_logp_difference/mean": 0.040012530982494354,
"step": 239,
"step_time": 33.62449117500364
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.006138392956927419,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011997767956927419,
"entropy": 0.1877030562609434,
"epoch": 0.0048,
"grad_norm": 1.3781535625457764,
"kl": 0.3060177452862263,
"learning_rate": 9.999922933803743e-06,
"loss": 0.0649,
"step": 240,
"step_time": 6.31635257200287
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1782.0,
"completions/max_terminated_length": 1782.0,
"completions/mean_length": 1668.1875,
"completions/mean_terminated_length": 1668.1875,
"completions/min_length": 1092.0,
"completions/min_terminated_length": 1092.0,
"entropy": 0.2379392758011818,
"epoch": 0.00482,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.326453685760498,
"kl": 0.6201226636767387,
"learning_rate": 9.999922176403579e-06,
"loss": -0.1752,
"num_tokens": 8886070.0,
"reward": -0.9512331485748291,
"reward_std": 5.470861911773682,
"rewards/rollout_reward_func/mean": -0.9512331485748291,
"rewards/rollout_reward_func/std": 10.099501609802246,
"sampling/importance_sampling_ratio/max": 2.10463285446167,
"sampling/importance_sampling_ratio/mean": 0.9869784116744995,
"sampling/importance_sampling_ratio/min": 1.0831281184453534e-11,
"sampling/sampling_logp_difference/max": 23.32485580444336,
"sampling/sampling_logp_difference/mean": 0.11264218389987946,
"step": 241,
"step_time": 36.34408466700006
},
{
"clip_ratio/high_max": 0.011488970601931214,
"clip_ratio/high_mean": 0.007697610300965607,
"clip_ratio/low_mean": 0.013020833488553762,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.02071844378951937,
"entropy": 0.2382805310189724,
"epoch": 0.00484,
"grad_norm": 0.9912970662117004,
"kl": 0.5726640149950981,
"learning_rate": 9.999921415299796e-06,
"loss": -0.1802,
"step": 242,
"step_time": 5.751053724003214
},
{
"clip_ratio/high_max": 0.0234375,
"clip_ratio/high_mean": 0.01171875,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.017578125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1797.0,
"completions/max_terminated_length": 1797.0,
"completions/mean_length": 1690.5,
"completions/mean_terminated_length": 1690.5,
"completions/min_length": 1430.0,
"completions/min_terminated_length": 1430.0,
"entropy": 0.19938462413847446,
"epoch": 0.00486,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.312639594078064,
"kl": 0.6614471711218357,
"learning_rate": 9.9999206504924e-06,
"loss": -0.1751,
"num_tokens": 8960640.0,
"reward": 1.9128804206848145,
"reward_std": 4.455126762390137,
"rewards/rollout_reward_func/mean": 1.9128804206848145,
"rewards/rollout_reward_func/std": 7.3313374519348145,
"sampling/importance_sampling_ratio/max": 2.6208341121673584,
"sampling/importance_sampling_ratio/mean": 1.0078068971633911,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.7909104824066162,
"sampling/sampling_logp_difference/mean": 0.060140155255794525,
"step": 243,
"step_time": 36.51225224099835
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.01171875,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.017578125,
"entropy": 0.19580959528684616,
"epoch": 0.00488,
"grad_norm": 1.1415836811065674,
"kl": 0.6649853922426701,
"learning_rate": 9.999919881981385e-06,
"loss": -0.1771,
"step": 244,
"step_time": 5.780806564001978
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1801.0,
"completions/max_terminated_length": 1801.0,
"completions/mean_length": 1675.125,
"completions/mean_terminated_length": 1675.125,
"completions/min_length": 1362.0,
"completions/min_terminated_length": 1362.0,
"entropy": 0.24602361768484116,
"epoch": 0.0049,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2397838830947876,
"kl": 0.5857466273009777,
"learning_rate": 9.99991910976676e-06,
"loss": -0.041,
"num_tokens": 9034635.0,
"reward": 1.560486912727356,
"reward_std": 6.24477481842041,
"rewards/rollout_reward_func/mean": 1.560486912727356,
"rewards/rollout_reward_func/std": 11.325467109680176,
"sampling/importance_sampling_ratio/max": 1.8394286632537842,
"sampling/importance_sampling_ratio/mean": 0.8830543756484985,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 20.549360275268555,
"sampling/sampling_logp_difference/mean": 0.12986382842063904,
"step": 245,
"step_time": 36.87764433600023
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.24585570394992828,
"epoch": 0.00492,
"grad_norm": 1.0776090621948242,
"kl": 0.6381884589791298,
"learning_rate": 9.999918333848517e-06,
"loss": -0.0432,
"step": 246,
"step_time": 6.26423046400123
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1804.0,
"completions/max_terminated_length": 1804.0,
"completions/mean_length": 1672.25,
"completions/mean_terminated_length": 1672.25,
"completions/min_length": 1498.0,
"completions/min_terminated_length": 1498.0,
"entropy": 0.1877462100237608,
"epoch": 0.00494,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.3833686113357544,
"kl": 0.9569054283201694,
"learning_rate": 9.999917554226663e-06,
"loss": -0.0526,
"num_tokens": 9108890.0,
"reward": 4.796204090118408,
"reward_std": 5.669317722320557,
"rewards/rollout_reward_func/mean": 4.796204090118408,
"rewards/rollout_reward_func/std": 10.93161678314209,
"sampling/importance_sampling_ratio/max": 2.1446852684020996,
"sampling/importance_sampling_ratio/mean": 0.9228631258010864,
"sampling/importance_sampling_ratio/min": 0.04307766631245613,
"sampling/sampling_logp_difference/max": 1.969184160232544,
"sampling/sampling_logp_difference/mean": 0.06175312399864197,
"step": 247,
"step_time": 35.40395965700009
},
{
"clip_ratio/high_max": 0.0234375,
"clip_ratio/high_mean": 0.01171875,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"entropy": 0.18796673975884914,
"epoch": 0.00496,
"grad_norm": 1.1867882013320923,
"kl": 0.8879027515649796,
"learning_rate": 9.999916770901197e-06,
"loss": -0.0583,
"step": 248,
"step_time": 5.787131861001399
},
{
"clip_ratio/high_max": 0.011488970601931214,
"clip_ratio/high_mean": 0.005744485300965607,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005744485300965607,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1790.0,
"completions/max_terminated_length": 1790.0,
"completions/mean_length": 1709.9375,
"completions/mean_terminated_length": 1709.9375,
"completions/min_length": 1346.0,
"completions/min_terminated_length": 1346.0,
"entropy": 0.18857184424996376,
"epoch": 0.00498,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.3715571165084839,
"kl": 0.6793716847896576,
"learning_rate": 9.999915983872118e-06,
"loss": 0.0529,
"num_tokens": 9184343.0,
"reward": 8.366767883300781,
"reward_std": 4.157901287078857,
"rewards/rollout_reward_func/mean": 8.366767883300781,
"rewards/rollout_reward_func/std": 10.230462074279785,
"sampling/importance_sampling_ratio/max": 1.9824284315109253,
"sampling/importance_sampling_ratio/mean": 0.9958308339118958,
"sampling/importance_sampling_ratio/min": 9.667034967658639e-13,
"sampling/sampling_logp_difference/max": 26.4067325592041,
"sampling/sampling_logp_difference/mean": 0.10002212971448898,
"step": 249,
"step_time": 34.97354119299962
},
{
"clip_ratio/high_max": 0.023207720601931214,
"clip_ratio/high_mean": 0.013556985300965607,
"clip_ratio/low_mean": 0.009650735300965607,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.023207720601931214,
"entropy": 0.18937593139708042,
"epoch": 0.005,
"grad_norm": 0.9603456854820251,
"kl": 0.6774471215903759,
"learning_rate": 9.99991519313943e-06,
"loss": 0.0489,
"step": 250,
"step_time": 5.79304310400039
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1829.0,
"completions/max_terminated_length": 1829.0,
"completions/mean_length": 1708.8125,
"completions/mean_terminated_length": 1708.8125,
"completions/min_length": 1359.0,
"completions/min_terminated_length": 1359.0,
"entropy": 0.1984053999185562,
"epoch": 0.00502,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.047070026397705,
"kl": 1.8021456748247147,
"learning_rate": 9.999914398703129e-06,
"loss": -0.038,
"num_tokens": 9259484.0,
"reward": 6.946834087371826,
"reward_std": 5.76743221282959,
"rewards/rollout_reward_func/mean": 6.946834087371826,
"rewards/rollout_reward_func/std": 14.136740684509277,
"sampling/importance_sampling_ratio/max": 2.717007875442505,
"sampling/importance_sampling_ratio/mean": 1.0246163606643677,
"sampling/importance_sampling_ratio/min": 6.007295355603404e-11,
"sampling/sampling_logp_difference/max": 23.782129287719727,
"sampling/sampling_logp_difference/mean": 0.11047248542308807,
"step": 251,
"step_time": 37.13627710699984
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.009765625,
"clip_ratio/low_mean": 0.005744485300965607,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015510110300965607,
"entropy": 0.20306510664522648,
"epoch": 0.00504,
"grad_norm": 1.5997041463851929,
"kl": 1.5759989619255066,
"learning_rate": 9.99991360056322e-06,
"loss": -0.0419,
"step": 252,
"step_time": 5.818557085000066
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013671875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1787.0,
"completions/max_terminated_length": 1787.0,
"completions/mean_length": 1716.0,
"completions/mean_terminated_length": 1716.0,
"completions/min_length": 1596.0,
"completions/min_terminated_length": 1596.0,
"entropy": 0.24246199801564217,
"epoch": 0.00506,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3612204790115356,
"kl": 0.5908343940973282,
"learning_rate": 9.999912798719703e-06,
"loss": -0.078,
"num_tokens": 9334973.0,
"reward": 6.846713542938232,
"reward_std": 8.991424560546875,
"rewards/rollout_reward_func/mean": 6.846713542938232,
"rewards/rollout_reward_func/std": 9.810685157775879,
"sampling/importance_sampling_ratio/max": 2.0176548957824707,
"sampling/importance_sampling_ratio/mean": 0.9388402104377747,
"sampling/importance_sampling_ratio/min": 0.17772360146045685,
"sampling/sampling_logp_difference/max": 1.4709601402282715,
"sampling/sampling_logp_difference/mean": 0.06256502121686935,
"step": 253,
"step_time": 36.63974985200002
},
{
"clip_ratio/high_max": 0.01953125,
"clip_ratio/high_mean": 0.015625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.017578125,
"entropy": 0.24471604451537132,
"epoch": 0.00508,
"grad_norm": 1.3345394134521484,
"kl": 0.5763493180274963,
"learning_rate": 9.999911993172577e-06,
"loss": -0.081,
"step": 254,
"step_time": 5.774125941998136
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1767.0,
"completions/max_terminated_length": 1767.0,
"completions/mean_length": 1612.78125,
"completions/mean_terminated_length": 1612.78125,
"completions/min_length": 612.0,
"completions/min_terminated_length": 612.0,
"entropy": 0.19570082426071167,
"epoch": 0.0051,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4826679229736328,
"kl": 0.6293303966522217,
"learning_rate": 9.999911183921846e-06,
"loss": -0.1805,
"num_tokens": 9407379.0,
"reward": 3.33709716796875,
"reward_std": 4.991109848022461,
"rewards/rollout_reward_func/mean": 3.33709716796875,
"rewards/rollout_reward_func/std": 10.105610847473145,
"sampling/importance_sampling_ratio/max": 2.305222511291504,
"sampling/importance_sampling_ratio/mean": 0.985419750213623,
"sampling/importance_sampling_ratio/min": 0.12469647079706192,
"sampling/sampling_logp_difference/max": 1.334002137184143,
"sampling/sampling_logp_difference/mean": 0.039491329342126846,
"step": 255,
"step_time": 35.05917253399275
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.19735204800963402,
"epoch": 0.00512,
"grad_norm": 1.344710111618042,
"kl": 0.5909395255148411,
"learning_rate": 9.999910370967508e-06,
"loss": -0.184,
"step": 256,
"step_time": 5.756917624001289
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1787.0,
"completions/max_terminated_length": 1787.0,
"completions/mean_length": 1546.15625,
"completions/mean_terminated_length": 1546.15625,
"completions/min_length": 656.0,
"completions/min_terminated_length": 656.0,
"entropy": 0.1884195413440466,
"epoch": 0.00514,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.3824015855789185,
"kl": 0.3445777054876089,
"learning_rate": 9.999909554309565e-06,
"loss": -0.0226,
"num_tokens": 9477160.0,
"reward": 1.7944698333740234,
"reward_std": 7.204285621643066,
"rewards/rollout_reward_func/mean": 1.7944698333740234,
"rewards/rollout_reward_func/std": 15.566790580749512,
"sampling/importance_sampling_ratio/max": 1.7864809036254883,
"sampling/importance_sampling_ratio/mean": 0.9369933605194092,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.003820776939392,
"sampling/sampling_logp_difference/mean": 0.04316379129886627,
"step": 257,
"step_time": 36.06274823200329
},
{
"clip_ratio/high_max": 0.013020833488553762,
"clip_ratio/high_mean": 0.008463541977107525,
"clip_ratio/low_mean": 0.004557291744276881,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013020833721384406,
"entropy": 0.19156392104923725,
"epoch": 0.00516,
"grad_norm": 1.3535003662109375,
"kl": 0.3384551331400871,
"learning_rate": 9.999908733948019e-06,
"loss": -0.0275,
"step": 258,
"step_time": 5.753975410003477
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1785.0,
"completions/max_terminated_length": 1785.0,
"completions/mean_length": 1711.4375,
"completions/mean_terminated_length": 1711.4375,
"completions/min_length": 1467.0,
"completions/min_terminated_length": 1467.0,
"entropy": 0.22483949549496174,
"epoch": 0.00518,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9391635656356812,
"kl": 0.404471006244421,
"learning_rate": 9.999907909882866e-06,
"loss": -0.0004,
"num_tokens": 9552804.0,
"reward": -1.872456431388855,
"reward_std": 3.4258460998535156,
"rewards/rollout_reward_func/mean": -1.872456431388855,
"rewards/rollout_reward_func/std": 10.646695137023926,
"sampling/importance_sampling_ratio/max": 2.742706537246704,
"sampling/importance_sampling_ratio/mean": 0.9974928498268127,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.1340551376342773,
"sampling/sampling_logp_difference/mean": 0.04761877655982971,
"step": 259,
"step_time": 35.838881236002635
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.22717351838946342,
"epoch": 0.0052,
"grad_norm": 1.7140471935272217,
"kl": 0.41914040222764015,
"learning_rate": 9.999907082114113e-06,
"loss": -0.0055,
"step": 260,
"step_time": 5.778732164999383
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1773.0,
"completions/max_terminated_length": 1773.0,
"completions/mean_length": 1634.71875,
"completions/mean_terminated_length": 1634.71875,
"completions/min_length": 1312.0,
"completions/min_terminated_length": 1312.0,
"entropy": 0.22770886309444904,
"epoch": 0.00522,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3558635711669922,
"kl": 0.603220384567976,
"learning_rate": 9.999906250641757e-06,
"loss": -0.0501,
"num_tokens": 9625522.0,
"reward": 8.935173988342285,
"reward_std": 8.57618522644043,
"rewards/rollout_reward_func/mean": 8.935173988342285,
"rewards/rollout_reward_func/std": 14.713119506835938,
"sampling/importance_sampling_ratio/max": 1.785384178161621,
"sampling/importance_sampling_ratio/mean": 0.9235571622848511,
"sampling/importance_sampling_ratio/min": 4.5317635797921906e-17,
"sampling/sampling_logp_difference/max": 21.840742111206055,
"sampling/sampling_logp_difference/mean": 0.10905331373214722,
"step": 261,
"step_time": 34.28906107799594
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.2287269402295351,
"epoch": 0.00524,
"grad_norm": 1.2795958518981934,
"kl": 0.6055284570902586,
"learning_rate": 9.9999054154658e-06,
"loss": -0.0522,
"step": 262,
"step_time": 5.695764250996945
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1779.0,
"completions/max_terminated_length": 1779.0,
"completions/mean_length": 1626.8125,
"completions/mean_terminated_length": 1626.8125,
"completions/min_length": 286.0,
"completions/min_terminated_length": 286.0,
"entropy": 0.25801716931164265,
"epoch": 0.00526,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.5012454986572266,
"kl": 1.5967573653906584,
"learning_rate": 9.999904576586242e-06,
"loss": -0.1271,
"num_tokens": 9698251.0,
"reward": 6.984275817871094,
"reward_std": 8.85349178314209,
"rewards/rollout_reward_func/mean": 6.984275817871094,
"rewards/rollout_reward_func/std": 9.481241226196289,
"sampling/importance_sampling_ratio/max": 2.636712074279785,
"sampling/importance_sampling_ratio/mean": 0.9890180826187134,
"sampling/importance_sampling_ratio/min": 6.474818459167864e-10,
"sampling/sampling_logp_difference/max": 21.65591049194336,
"sampling/sampling_logp_difference/mean": 0.08674205839633942,
"step": 263,
"step_time": 37.276569184001346
},
{
"clip_ratio/high_max": 0.011488970601931214,
"clip_ratio/high_mean": 0.007697610300965607,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009650735300965607,
"entropy": 0.26125726476311684,
"epoch": 0.00528,
"grad_norm": 2.118901491165161,
"kl": 1.4512584786862135,
"learning_rate": 9.999903734003084e-06,
"loss": -0.1336,
"step": 264,
"step_time": 5.745041152002159
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1776.0,
"completions/max_terminated_length": 1776.0,
"completions/mean_length": 1624.78125,
"completions/mean_terminated_length": 1624.78125,
"completions/min_length": 893.0,
"completions/min_terminated_length": 893.0,
"entropy": 0.22747422195971012,
"epoch": 0.0053,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3240665197372437,
"kl": 0.2730750413611531,
"learning_rate": 9.999902887716329e-06,
"loss": -0.2542,
"num_tokens": 9771080.0,
"reward": -2.6601641178131104,
"reward_std": 5.340880870819092,
"rewards/rollout_reward_func/mean": -2.6601641178131104,
"rewards/rollout_reward_func/std": 8.568302154541016,
"sampling/importance_sampling_ratio/max": 2.1538095474243164,
"sampling/importance_sampling_ratio/mean": 1.0698328018188477,
"sampling/importance_sampling_ratio/min": 2.7574214611965437e-18,
"sampling/sampling_logp_difference/max": 25.709339141845703,
"sampling/sampling_logp_difference/mean": 0.1604633778333664,
"step": 265,
"step_time": 35.148489481998695
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.004356971243396401,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004356971243396401,
"entropy": 0.22533436864614487,
"epoch": 0.00532,
"grad_norm": 1.1646156311035156,
"kl": 0.27127676364034414,
"learning_rate": 9.999902037725978e-06,
"loss": -0.2582,
"step": 266,
"step_time": 5.661973868998757
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1758.0,
"completions/max_terminated_length": 1758.0,
"completions/mean_length": 1673.6875,
"completions/mean_terminated_length": 1673.6875,
"completions/min_length": 1527.0,
"completions/min_terminated_length": 1527.0,
"entropy": 0.2219645120203495,
"epoch": 0.00534,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.302032709121704,
"kl": 0.7969283424317837,
"learning_rate": 9.999901184032026e-06,
"loss": -0.0049,
"num_tokens": 9845301.0,
"reward": 4.670074939727783,
"reward_std": 4.829067230224609,
"rewards/rollout_reward_func/mean": 4.670074939727783,
"rewards/rollout_reward_func/std": 12.953495025634766,
"sampling/importance_sampling_ratio/max": 2.7272145748138428,
"sampling/importance_sampling_ratio/mean": 0.9917882680892944,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.0262956619262695,
"sampling/sampling_logp_difference/mean": 0.052393145859241486,
"step": 267,
"step_time": 35.151963327001795
},
{
"clip_ratio/high_max": 0.01953125,
"clip_ratio/high_mean": 0.01171875,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.017578125,
"entropy": 0.21915441751480103,
"epoch": 0.00536,
"grad_norm": 1.4033211469650269,
"kl": 0.8538418840616941,
"learning_rate": 9.999900326634479e-06,
"loss": -0.0088,
"step": 268,
"step_time": 6.602220959000988
},
{
"clip_ratio/high_max": 0.008333333767950535,
"clip_ratio/high_mean": 0.004166666883975267,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006119791883975267,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1815.0,
"completions/max_terminated_length": 1815.0,
"completions/mean_length": 1656.53125,
"completions/mean_terminated_length": 1656.53125,
"completions/min_length": 1121.0,
"completions/min_terminated_length": 1121.0,
"entropy": 0.22007588855922222,
"epoch": 0.00538,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6505411863327026,
"kl": 0.35358257219195366,
"learning_rate": 9.999899465533338e-06,
"loss": -0.0248,
"num_tokens": 9919144.0,
"reward": -0.7222501039505005,
"reward_std": 6.3980913162231445,
"rewards/rollout_reward_func/mean": -0.7222501039505005,
"rewards/rollout_reward_func/std": 10.620811462402344,
"sampling/importance_sampling_ratio/max": 2.1663742065429688,
"sampling/importance_sampling_ratio/mean": 0.9528242349624634,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.454941749572754,
"sampling/sampling_logp_difference/mean": 0.04491892457008362,
"step": 269,
"step_time": 35.42834161200153
},
{
"clip_ratio/high_max": 0.008370535913854837,
"clip_ratio/high_mean": 0.006268601398915052,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006268601398915052,
"entropy": 0.22122930735349655,
"epoch": 0.0054,
"grad_norm": 1.4210882186889648,
"kl": 0.3376936595886946,
"learning_rate": 9.999898600728599e-06,
"loss": -0.0281,
"step": 270,
"step_time": 5.819409946001542
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0018382353009656072,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0018382353009656072,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1794.0,
"completions/max_terminated_length": 1794.0,
"completions/mean_length": 1610.53125,
"completions/mean_terminated_length": 1610.53125,
"completions/min_length": 267.0,
"completions/min_terminated_length": 267.0,
"entropy": 0.2343030981719494,
"epoch": 0.00542,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.257897138595581,
"kl": 0.5152953676879406,
"learning_rate": 9.99989773222027e-06,
"loss": -0.1168,
"num_tokens": 9991321.0,
"reward": 5.170373439788818,
"reward_std": 8.166898727416992,
"rewards/rollout_reward_func/mean": 5.170373439788818,
"rewards/rollout_reward_func/std": 11.35286808013916,
"sampling/importance_sampling_ratio/max": 2.139758825302124,
"sampling/importance_sampling_ratio/mean": 0.9269818663597107,
"sampling/importance_sampling_ratio/min": 4.740564885086229e-11,
"sampling/sampling_logp_difference/max": 23.737991333007812,
"sampling/sampling_logp_difference/mean": 0.08782394975423813,
"step": 271,
"step_time": 33.72024956799942
},
{
"clip_ratio/high_max": 0.014062500093132257,
"clip_ratio/high_mean": 0.008869485231116414,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008869485231116414,
"entropy": 0.23815979063510895,
"epoch": 0.00544,
"grad_norm": 1.2076594829559326,
"kl": 0.5196094233542681,
"learning_rate": 9.999896860008346e-06,
"loss": -0.1191,
"step": 272,
"step_time": 5.770830136003497
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1821.0,
"completions/max_terminated_length": 1821.0,
"completions/mean_length": 1693.09375,
"completions/mean_terminated_length": 1693.09375,
"completions/min_length": 1545.0,
"completions/min_terminated_length": 1545.0,
"entropy": 0.24442918226122856,
"epoch": 0.00546,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7162142992019653,
"kl": 0.4226865954697132,
"learning_rate": 9.999895984092831e-06,
"loss": -0.0679,
"num_tokens": 10066041.0,
"reward": -1.6974170207977295,
"reward_std": 6.5335187911987305,
"rewards/rollout_reward_func/mean": -1.6974170207977295,
"rewards/rollout_reward_func/std": 12.975322723388672,
"sampling/importance_sampling_ratio/max": 2.2359983921051025,
"sampling/importance_sampling_ratio/mean": 1.0108340978622437,
"sampling/importance_sampling_ratio/min": 0.21830794215202332,
"sampling/sampling_logp_difference/max": 0.9789800643920898,
"sampling/sampling_logp_difference/mean": 0.05331780016422272,
"step": 273,
"step_time": 37.78565727399837
},
{
"clip_ratio/high_max": 0.02734375,
"clip_ratio/high_mean": 0.015625,
"clip_ratio/low_mean": 0.013671875,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.029296875,
"entropy": 0.24502579309046268,
"epoch": 0.00548,
"grad_norm": 1.5775851011276245,
"kl": 0.41350132040679455,
"learning_rate": 9.999895104473725e-06,
"loss": -0.0729,
"step": 274,
"step_time": 6.474478788997658
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1760.0,
"completions/max_terminated_length": 1760.0,
"completions/mean_length": 1534.1875,
"completions/mean_terminated_length": 1534.1875,
"completions/min_length": 670.0,
"completions/min_terminated_length": 670.0,
"entropy": 0.22560410387814045,
"epoch": 0.0055,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2990355491638184,
"kl": 0.43298108130693436,
"learning_rate": 9.99989422115103e-06,
"loss": -0.004,
"num_tokens": 10135257.0,
"reward": -7.943665027618408,
"reward_std": 6.444216251373291,
"rewards/rollout_reward_func/mean": -7.943665027618408,
"rewards/rollout_reward_func/std": 17.156274795532227,
"sampling/importance_sampling_ratio/max": 1.7411776781082153,
"sampling/importance_sampling_ratio/mean": 0.9610604643821716,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.30423903465271,
"sampling/sampling_logp_difference/mean": 0.03806076943874359,
"step": 275,
"step_time": 31.760855431999516
},
{
"clip_ratio/high_max": 0.0052083334885537624,
"clip_ratio/high_mean": 0.0026041667442768812,
"clip_ratio/low_mean": 0.006510416744276881,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009114583488553762,
"entropy": 0.2212864998728037,
"epoch": 0.00552,
"grad_norm": 1.0952664613723755,
"kl": 0.43978017941117287,
"learning_rate": 9.999893334124745e-06,
"loss": -0.0056,
"step": 276,
"step_time": 5.71456800100168
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1782.0,
"completions/max_terminated_length": 1782.0,
"completions/mean_length": 1633.125,
"completions/mean_terminated_length": 1633.125,
"completions/min_length": 641.0,
"completions/min_terminated_length": 641.0,
"entropy": 0.2532899435609579,
"epoch": 0.00554,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.429874062538147,
"kl": 0.6599202789366245,
"learning_rate": 9.99989244339487e-06,
"loss": -0.2194,
"num_tokens": 10208192.0,
"reward": -1.6404476165771484,
"reward_std": 7.98501443862915,
"rewards/rollout_reward_func/mean": -1.6404476165771484,
"rewards/rollout_reward_func/std": 10.619029998779297,
"sampling/importance_sampling_ratio/max": 2.4701716899871826,
"sampling/importance_sampling_ratio/mean": 0.91197669506073,
"sampling/importance_sampling_ratio/min": 0.14245294034481049,
"sampling/sampling_logp_difference/max": 1.5070490837097168,
"sampling/sampling_logp_difference/mean": 0.05014316365122795,
"step": 277,
"step_time": 36.03027680699779
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.005744485300965607,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009650735300965607,
"entropy": 0.24842147901654243,
"epoch": 0.00556,
"grad_norm": 1.1689871549606323,
"kl": 0.6852821763604879,
"learning_rate": 9.999891548961409e-06,
"loss": -0.2256,
"step": 278,
"step_time": 5.809108069001013
},
{
"clip_ratio/high_max": 0.010156250093132257,
"clip_ratio/high_mean": 0.005078125046566129,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005078125046566129,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1799.0,
"completions/max_terminated_length": 1799.0,
"completions/mean_length": 1578.4375,
"completions/mean_terminated_length": 1578.4375,
"completions/min_length": 272.0,
"completions/min_terminated_length": 272.0,
"entropy": 0.1794843440875411,
"epoch": 0.00558,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.132340908050537,
"kl": 0.5266461782157421,
"learning_rate": 9.999890650824362e-06,
"loss": -0.032,
"num_tokens": 10279208.0,
"reward": 7.001322269439697,
"reward_std": 10.167577743530273,
"rewards/rollout_reward_func/mean": 7.001322269439697,
"rewards/rollout_reward_func/std": 19.354318618774414,
"sampling/importance_sampling_ratio/max": 1.6495786905288696,
"sampling/importance_sampling_ratio/mean": 0.8468691110610962,
"sampling/importance_sampling_ratio/min": 0.30830448865890503,
"sampling/sampling_logp_difference/max": 1.2979681491851807,
"sampling/sampling_logp_difference/mean": 0.04318168759346008,
"step": 279,
"step_time": 35.29856429800202
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.17729349061846733,
"epoch": 0.0056,
"grad_norm": 0.9538998007774353,
"kl": 0.5198981948196888,
"learning_rate": 9.999889748983727e-06,
"loss": -0.034,
"step": 280,
"step_time": 6.418356822001442
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1810.0,
"completions/max_terminated_length": 1810.0,
"completions/mean_length": 1660.09375,
"completions/mean_terminated_length": 1660.09375,
"completions/min_length": 1468.0,
"completions/min_terminated_length": 1468.0,
"entropy": 0.22451071441173553,
"epoch": 0.00562,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.166965365409851,
"kl": 0.46352874115109444,
"learning_rate": 9.999888843439508e-06,
"loss": -0.0105,
"num_tokens": 10352876.0,
"reward": 0.566585898399353,
"reward_std": 3.541024684906006,
"rewards/rollout_reward_func/mean": 0.566585898399353,
"rewards/rollout_reward_func/std": 5.582573890686035,
"sampling/importance_sampling_ratio/max": 2.2927284240722656,
"sampling/importance_sampling_ratio/mean": 0.962428092956543,
"sampling/importance_sampling_ratio/min": 0.35809534788131714,
"sampling/sampling_logp_difference/max": 0.8381476402282715,
"sampling/sampling_logp_difference/mean": 0.04447294771671295,
"step": 281,
"step_time": 38.91772174799917
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 0.2204912230372429,
"epoch": 0.00564,
"grad_norm": 1.1434154510498047,
"kl": 0.4616197645664215,
"learning_rate": 9.999887934191706e-06,
"loss": -0.0105,
"step": 282,
"step_time": 5.863146633000724
},
{
"clip_ratio/high_max": 0.0036764706019312143,
"clip_ratio/high_mean": 0.0018382353009656072,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0018382353009656072,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1792.0,
"completions/max_terminated_length": 1792.0,
"completions/mean_length": 1650.9375,
"completions/mean_terminated_length": 1650.9375,
"completions/min_length": 1334.0,
"completions/min_terminated_length": 1334.0,
"entropy": 0.1282420428469777,
"epoch": 0.00566,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.2290998697280884,
"kl": 0.8328725211322308,
"learning_rate": 9.99988702124032e-06,
"loss": -0.0243,
"num_tokens": 10425939.0,
"reward": 3.5679147243499756,
"reward_std": 4.225099086761475,
"rewards/rollout_reward_func/mean": 3.5679147243499756,
"rewards/rollout_reward_func/std": 6.293881893157959,
"sampling/importance_sampling_ratio/max": 1.7022098302841187,
"sampling/importance_sampling_ratio/mean": 0.8366431593894958,
"sampling/importance_sampling_ratio/min": 3.1675653853341368e-12,
"sampling/sampling_logp_difference/max": 26.352497100830078,
"sampling/sampling_logp_difference/mean": 0.08530725538730621,
"step": 283,
"step_time": 36.011625641003775
},
{
"clip_ratio/high_max": 0.007582720601931214,
"clip_ratio/high_mean": 0.003791360300965607,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005744485300965607,
"entropy": 0.12602030392736197,
"epoch": 0.00568,
"grad_norm": 1.02045476436615,
"kl": 0.7334638945758343,
"learning_rate": 9.99988610458535e-06,
"loss": -0.0291,
"step": 284,
"step_time": 5.740358126000501
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.004126082407310605,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004126082407310605,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1760.0,
"completions/max_terminated_length": 1760.0,
"completions/mean_length": 1634.625,
"completions/mean_terminated_length": 1634.625,
"completions/min_length": 1178.0,
"completions/min_terminated_length": 1178.0,
"entropy": 0.18211907986551523,
"epoch": 0.0057,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.0849609375,
"kl": 0.623099785298109,
"learning_rate": 9.999885184226803e-06,
"loss": -0.0595,
"num_tokens": 10498682.0,
"reward": -1.8584654331207275,
"reward_std": 3.9287822246551514,
"rewards/rollout_reward_func/mean": -1.8584654331207275,
"rewards/rollout_reward_func/std": 12.91176986694336,
"sampling/importance_sampling_ratio/max": 2.443671464920044,
"sampling/importance_sampling_ratio/mean": 1.0301685333251953,
"sampling/importance_sampling_ratio/min": 0.1954619437456131,
"sampling/sampling_logp_difference/max": 1.7249445915222168,
"sampling/sampling_logp_difference/mean": 0.034441880881786346,
"step": 285,
"step_time": 35.48645443800342
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.1817738525569439,
"epoch": 0.00572,
"grad_norm": 1.5344396829605103,
"kl": 0.6733249872922897,
"learning_rate": 9.999884260164672e-06,
"loss": -0.0645,
"step": 286,
"step_time": 5.6632803509983205
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1776.0,
"completions/max_terminated_length": 1776.0,
"completions/mean_length": 1682.5,
"completions/mean_terminated_length": 1682.5,
"completions/min_length": 1124.0,
"completions/min_terminated_length": 1124.0,
"entropy": 0.16152677033096552,
"epoch": 0.00574,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1796907186508179,
"kl": 0.7577519491314888,
"learning_rate": 9.999883332398963e-06,
"loss": -0.1018,
"num_tokens": 10573032.0,
"reward": 3.490713119506836,
"reward_std": 6.548834800720215,
"rewards/rollout_reward_func/mean": 3.490713119506836,
"rewards/rollout_reward_func/std": 9.092996597290039,
"sampling/importance_sampling_ratio/max": 1.64301335811615,
"sampling/importance_sampling_ratio/mean": 0.9178614020347595,
"sampling/importance_sampling_ratio/min": 0.23202794790267944,
"sampling/sampling_logp_difference/max": 0.8612185120582581,
"sampling/sampling_logp_difference/mean": 0.03624614328145981,
"step": 287,
"step_time": 35.95332784499624
},
{
"clip_ratio/high_max": 0.020089285913854837,
"clip_ratio/high_mean": 0.010044642956927419,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011997767956927419,
"entropy": 0.16308189649134874,
"epoch": 0.00576,
"grad_norm": 0.9740341901779175,
"kl": 0.7837924063205719,
"learning_rate": 9.999882400929674e-06,
"loss": -0.1047,
"step": 288,
"step_time": 5.759164831997623
},
{
"clip_ratio/high_max": 0.013392857275903225,
"clip_ratio/high_mean": 0.0066964286379516125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008649553637951612,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1803.0,
"completions/max_terminated_length": 1803.0,
"completions/mean_length": 1571.25,
"completions/mean_terminated_length": 1571.25,
"completions/min_length": 297.0,
"completions/min_terminated_length": 297.0,
"entropy": 0.1844630278646946,
"epoch": 0.00578,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.0486916303634644,
"kl": 0.8353649526834488,
"learning_rate": 9.99988146575681e-06,
"loss": -0.0224,
"num_tokens": 10643454.0,
"reward": 2.7757768630981445,
"reward_std": 6.543875694274902,
"rewards/rollout_reward_func/mean": 2.7757768630981445,
"rewards/rollout_reward_func/std": 10.743654251098633,
"sampling/importance_sampling_ratio/max": 2.0809102058410645,
"sampling/importance_sampling_ratio/mean": 0.8418534994125366,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.082690954208374,
"sampling/sampling_logp_difference/mean": 0.04390227794647217,
"step": 289,
"step_time": 34.91289532799965
},
{
"clip_ratio/high_max": 0.017299107275903225,
"clip_ratio/high_mean": 0.008649553637951612,
"clip_ratio/low_mean": 0.0031250000465661287,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011774553684517741,
"entropy": 0.18694760277867317,
"epoch": 0.0058,
"grad_norm": 1.0219008922576904,
"kl": 0.8292638063430786,
"learning_rate": 9.999880526880366e-06,
"loss": -0.0233,
"step": 290,
"step_time": 5.847304276998329
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1817.0,
"completions/max_terminated_length": 1817.0,
"completions/mean_length": 1715.9375,
"completions/mean_terminated_length": 1715.9375,
"completions/min_length": 1599.0,
"completions/min_terminated_length": 1599.0,
"entropy": 0.21534875221550465,
"epoch": 0.00582,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4715776443481445,
"kl": 0.5219082608819008,
"learning_rate": 9.999879584300349e-06,
"loss": -0.1126,
"num_tokens": 10719360.0,
"reward": 4.551000118255615,
"reward_std": 5.435481071472168,
"rewards/rollout_reward_func/mean": 4.551000118255615,
"rewards/rollout_reward_func/std": 8.276861190795898,
"sampling/importance_sampling_ratio/max": 1.947494387626648,
"sampling/importance_sampling_ratio/mean": 1.0701745748519897,
"sampling/importance_sampling_ratio/min": 0.39692434668540955,
"sampling/sampling_logp_difference/max": 0.9076583385467529,
"sampling/sampling_logp_difference/mean": 0.038155265152454376,
"step": 291,
"step_time": 37.32454300199788
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"entropy": 0.21467485465109348,
"epoch": 0.00584,
"grad_norm": 1.3881194591522217,
"kl": 0.511255357414484,
"learning_rate": 9.999878638016756e-06,
"loss": -0.1134,
"step": 292,
"step_time": 5.892084002003685
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1781.0,
"completions/max_terminated_length": 1781.0,
"completions/mean_length": 1696.09375,
"completions/mean_terminated_length": 1696.09375,
"completions/min_length": 1594.0,
"completions/min_terminated_length": 1594.0,
"entropy": 0.14797488693147898,
"epoch": 0.00586,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.0587458610534668,
"kl": 0.39671463891863823,
"learning_rate": 9.99987768802959e-06,
"loss": 0.025,
"num_tokens": 10794350.0,
"reward": 5.8876543045043945,
"reward_std": 4.631452560424805,
"rewards/rollout_reward_func/mean": 5.8876543045043945,
"rewards/rollout_reward_func/std": 9.654105186462402,
"sampling/importance_sampling_ratio/max": 1.8918991088867188,
"sampling/importance_sampling_ratio/mean": 1.0400714874267578,
"sampling/importance_sampling_ratio/min": 0.2940644919872284,
"sampling/sampling_logp_difference/max": 0.8748996257781982,
"sampling/sampling_logp_difference/mean": 0.02706913650035858,
"step": 293,
"step_time": 36.40785624799901
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.14734902698546648,
"epoch": 0.00588,
"grad_norm": 1.0053006410598755,
"kl": 0.3986317291855812,
"learning_rate": 9.99987673433885e-06,
"loss": 0.0219,
"step": 294,
"step_time": 5.827138864997323
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1826.0,
"completions/max_terminated_length": 1826.0,
"completions/mean_length": 1693.59375,
"completions/mean_terminated_length": 1693.59375,
"completions/min_length": 1599.0,
"completions/min_terminated_length": 1599.0,
"entropy": 0.19574224390089512,
"epoch": 0.0059,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.5421663522720337,
"kl": 0.4815644100308418,
"learning_rate": 9.999875776944539e-06,
"loss": -0.16,
"num_tokens": 10869166.0,
"reward": 3.4602537155151367,
"reward_std": 4.304999351501465,
"rewards/rollout_reward_func/mean": 3.4602537155151367,
"rewards/rollout_reward_func/std": 7.722873210906982,
"sampling/importance_sampling_ratio/max": 1.93669593334198,
"sampling/importance_sampling_ratio/mean": 1.0370523929595947,
"sampling/importance_sampling_ratio/min": 0.21605992317199707,
"sampling/sampling_logp_difference/max": 1.3471612930297852,
"sampling/sampling_logp_difference/mean": 0.036538854241371155,
"step": 295,
"step_time": 36.86570157999995
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.01171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"entropy": 0.19377287477254868,
"epoch": 0.00592,
"grad_norm": 1.3266241550445557,
"kl": 0.5180523283779621,
"learning_rate": 9.999874815846656e-06,
"loss": -0.1637,
"step": 296,
"step_time": 6.44450496600075
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1785.0,
"completions/max_terminated_length": 1785.0,
"completions/mean_length": 1699.59375,
"completions/mean_terminated_length": 1699.59375,
"completions/min_length": 1464.0,
"completions/min_terminated_length": 1464.0,
"entropy": 0.22864390537142754,
"epoch": 0.00594,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4940208196640015,
"kl": 0.3851666897535324,
"learning_rate": 9.999873851045202e-06,
"loss": -0.0427,
"num_tokens": 10944209.0,
"reward": 0.1467803716659546,
"reward_std": 6.4113945960998535,
"rewards/rollout_reward_func/mean": 0.1467803716659546,
"rewards/rollout_reward_func/std": 11.033947944641113,
"sampling/importance_sampling_ratio/max": 2.8933799266815186,
"sampling/importance_sampling_ratio/mean": 1.0202676057815552,
"sampling/importance_sampling_ratio/min": 1.6225042143158674e-11,
"sampling/sampling_logp_difference/max": 24.00332260131836,
"sampling/sampling_logp_difference/mean": 0.08574045449495316,
"step": 297,
"step_time": 36.83895620099793
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"entropy": 0.22310753911733627,
"epoch": 0.00596,
"grad_norm": 1.392610788345337,
"kl": 0.3954205773770809,
"learning_rate": 9.999872882540181e-06,
"loss": -0.0431,
"step": 298,
"step_time": 5.832336111001496
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1768.0,
"completions/max_terminated_length": 1768.0,
"completions/mean_length": 1678.71875,
"completions/mean_terminated_length": 1678.71875,
"completions/min_length": 1547.0,
"completions/min_terminated_length": 1547.0,
"entropy": 0.18405075185000896,
"epoch": 0.00598,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8699392080307007,
"kl": 0.937393918633461,
"learning_rate": 9.999871910331592e-06,
"loss": 0.0681,
"num_tokens": 11018366.0,
"reward": 3.816401958465576,
"reward_std": 3.8979978561401367,
"rewards/rollout_reward_func/mean": 3.816401958465576,
"rewards/rollout_reward_func/std": 7.810318946838379,
"sampling/importance_sampling_ratio/max": 2.2833974361419678,
"sampling/importance_sampling_ratio/mean": 1.0459489822387695,
"sampling/importance_sampling_ratio/min": 0.1880369782447815,
"sampling/sampling_logp_difference/max": 1.4234182834625244,
"sampling/sampling_logp_difference/mean": 0.048677269369363785,
"step": 299,
"step_time": 36.90797988000122
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0038470644503831863,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011659564450383186,
"entropy": 0.18204295076429844,
"epoch": 0.006,
"grad_norm": 1.5673085451126099,
"kl": 0.9782428927719593,
"learning_rate": 9.999870934419434e-06,
"loss": 0.0681,
"step": 300,
"step_time": 5.689609519002261
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009765625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1801.0,
"completions/max_terminated_length": 1801.0,
"completions/mean_length": 1530.09375,
"completions/mean_terminated_length": 1530.09375,
"completions/min_length": 200.0,
"completions/min_terminated_length": 200.0,
"entropy": 0.18989801779389381,
"epoch": 0.00602,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.7532182931900024,
"kl": 0.6885460149496794,
"learning_rate": 9.999869954803708e-06,
"loss": -0.0852,
"num_tokens": 11088088.0,
"reward": -2.989968776702881,
"reward_std": 3.5043578147888184,
"rewards/rollout_reward_func/mean": -2.989968776702881,
"rewards/rollout_reward_func/std": 11.289237976074219,
"sampling/importance_sampling_ratio/max": 2.31423020362854,
"sampling/importance_sampling_ratio/mean": 0.9575661420822144,
"sampling/importance_sampling_ratio/min": 0.04499343782663345,
"sampling/sampling_logp_difference/max": 1.339418888092041,
"sampling/sampling_logp_difference/mean": 0.05697993189096451,
"step": 301,
"step_time": 33.824989040998844
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.18932953476905823,
"epoch": 0.00604,
"grad_norm": 2.035459041595459,
"kl": 0.6695586815476418,
"learning_rate": 9.999868971484418e-06,
"loss": -0.0882,
"step": 302,
"step_time": 6.704577034002796
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1802.0,
"completions/max_terminated_length": 1802.0,
"completions/mean_length": 1654.34375,
"completions/mean_terminated_length": 1654.34375,
"completions/min_length": 1392.0,
"completions/min_terminated_length": 1392.0,
"entropy": 0.16859493404626846,
"epoch": 0.00606,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.2433701753616333,
"kl": 0.3330823089927435,
"learning_rate": 9.999867984461565e-06,
"loss": -0.1347,
"num_tokens": 11161407.0,
"reward": 13.324233055114746,
"reward_std": 5.227249622344971,
"rewards/rollout_reward_func/mean": 13.324233055114746,
"rewards/rollout_reward_func/std": 18.111255645751953,
"sampling/importance_sampling_ratio/max": 2.574556827545166,
"sampling/importance_sampling_ratio/mean": 1.0179166793823242,
"sampling/importance_sampling_ratio/min": 4.849272411824878e-19,
"sampling/sampling_logp_difference/max": 21.414751052856445,
"sampling/sampling_logp_difference/mean": 0.11061778664588928,
"step": 303,
"step_time": 35.15028436499961
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.16938963159918785,
"epoch": 0.00608,
"grad_norm": 1.2953479290008545,
"kl": 0.32322092168033123,
"learning_rate": 9.999866993735148e-06,
"loss": -0.1376,
"step": 304,
"step_time": 5.814720251999461
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1763.0,
"completions/max_terminated_length": 1763.0,
"completions/mean_length": 1696.5,
"completions/mean_terminated_length": 1696.5,
"completions/min_length": 1519.0,
"completions/min_terminated_length": 1519.0,
"entropy": 0.17914600856602192,
"epoch": 0.0061,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.047422409057617,
"kl": 0.9124260395765305,
"learning_rate": 9.99986599930517e-06,
"loss": 0.0776,
"num_tokens": 11236260.0,
"reward": -3.4361958503723145,
"reward_std": 5.964578628540039,
"rewards/rollout_reward_func/mean": -3.4361958503723145,
"rewards/rollout_reward_func/std": 12.477890014648438,
"sampling/importance_sampling_ratio/max": 1.9305469989776611,
"sampling/importance_sampling_ratio/mean": 1.063968300819397,
"sampling/importance_sampling_ratio/min": 0.14159531891345978,
"sampling/sampling_logp_difference/max": 1.7825407981872559,
"sampling/sampling_logp_difference/mean": 0.03782513365149498,
"step": 305,
"step_time": 36.04515673100104
},
{
"clip_ratio/high_max": 0.01953125,
"clip_ratio/high_mean": 0.009765625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.1830028723925352,
"epoch": 0.00612,
"grad_norm": 2.021735906600952,
"kl": 0.8423058073967695,
"learning_rate": 9.999865001171628e-06,
"loss": 0.0726,
"step": 306,
"step_time": 5.666386218999833
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1779.0,
"completions/max_terminated_length": 1779.0,
"completions/mean_length": 1692.84375,
"completions/mean_terminated_length": 1692.84375,
"completions/min_length": 1583.0,
"completions/min_terminated_length": 1583.0,
"entropy": 0.26686032861471176,
"epoch": 0.00614,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5039609670639038,
"kl": 0.6252574324607849,
"learning_rate": 9.999863999334527e-06,
"loss": -0.043,
"num_tokens": 11311146.0,
"reward": 0.6570155620574951,
"reward_std": 3.567687749862671,
"rewards/rollout_reward_func/mean": 0.6570155620574951,
"rewards/rollout_reward_func/std": 5.788002014160156,
"sampling/importance_sampling_ratio/max": 1.9915469884872437,
"sampling/importance_sampling_ratio/mean": 0.9949770569801331,
"sampling/importance_sampling_ratio/min": 0.28174740076065063,
"sampling/sampling_logp_difference/max": 1.0719170570373535,
"sampling/sampling_logp_difference/mean": 0.04079074412584305,
"step": 307,
"step_time": 36.82269417800126
},
{
"clip_ratio/high_max": 0.017153532709926367,
"clip_ratio/high_mean": 0.010529891587793827,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010529891587793827,
"entropy": 0.2710601706057787,
"epoch": 0.00616,
"grad_norm": 1.2901983261108398,
"kl": 0.6167760603129864,
"learning_rate": 9.999862993793865e-06,
"loss": -0.0489,
"step": 308,
"step_time": 6.247776632999376
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1789.0,
"completions/max_terminated_length": 1789.0,
"completions/mean_length": 1684.09375,
"completions/mean_terminated_length": 1684.09375,
"completions/min_length": 1566.0,
"completions/min_terminated_length": 1566.0,
"entropy": 0.19205584935843945,
"epoch": 0.00618,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.8880062103271484,
"kl": 0.5570718720555305,
"learning_rate": 9.999861984549646e-06,
"loss": 0.0028,
"num_tokens": 11385685.0,
"reward": 1.6213198900222778,
"reward_std": 5.165748596191406,
"rewards/rollout_reward_func/mean": 1.6213198900222778,
"rewards/rollout_reward_func/std": 10.621575355529785,
"sampling/importance_sampling_ratio/max": 1.731754183769226,
"sampling/importance_sampling_ratio/mean": 0.9596688747406006,
"sampling/importance_sampling_ratio/min": 0.27524271607398987,
"sampling/sampling_logp_difference/max": 0.9177696704864502,
"sampling/sampling_logp_difference/mean": 0.040488895028829575,
"step": 309,
"step_time": 36.28644880200045
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.005859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01171875,
"entropy": 0.19744597002863884,
"epoch": 0.0062,
"grad_norm": 0.904339075088501,
"kl": 0.5681257173418999,
"learning_rate": 9.99986097160187e-06,
"loss": -0.0008,
"step": 310,
"step_time": 5.7859738810002455
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1817.0,
"completions/max_terminated_length": 1817.0,
"completions/mean_length": 1667.03125,
"completions/mean_terminated_length": 1667.03125,
"completions/min_length": 1510.0,
"completions/min_terminated_length": 1510.0,
"entropy": 0.19374503754079342,
"epoch": 0.00622,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.271896243095398,
"kl": 0.5406992174685001,
"learning_rate": 9.999859954950535e-06,
"loss": 0.0642,
"num_tokens": 11459322.0,
"reward": 4.310371398925781,
"reward_std": 5.449921607971191,
"rewards/rollout_reward_func/mean": 4.310371398925781,
"rewards/rollout_reward_func/std": 10.012842178344727,
"sampling/importance_sampling_ratio/max": 2.3058712482452393,
"sampling/importance_sampling_ratio/mean": 1.0564281940460205,
"sampling/importance_sampling_ratio/min": 0.29696083068847656,
"sampling/sampling_logp_difference/max": 1.0833686590194702,
"sampling/sampling_logp_difference/mean": 0.03807468339800835,
"step": 311,
"step_time": 39.552110792998064
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.01171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01953125,
"entropy": 0.19619728438556194,
"epoch": 0.00624,
"grad_norm": 1.1710026264190674,
"kl": 0.5339053124189377,
"learning_rate": 9.999858934595648e-06,
"loss": 0.0602,
"step": 312,
"step_time": 5.784206759000881
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1820.0,
"completions/max_terminated_length": 1820.0,
"completions/mean_length": 1676.375,
"completions/mean_terminated_length": 1676.375,
"completions/min_length": 884.0,
"completions/min_terminated_length": 884.0,
"entropy": 0.23158937133848667,
"epoch": 0.00626,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2677243947982788,
"kl": 0.8580169584602118,
"learning_rate": 9.999857910537204e-06,
"loss": -0.112,
"num_tokens": 11533666.0,
"reward": 4.958746433258057,
"reward_std": 8.339164733886719,
"rewards/rollout_reward_func/mean": 4.958746433258057,
"rewards/rollout_reward_func/std": 11.135608673095703,
"sampling/importance_sampling_ratio/max": 2.853358745574951,
"sampling/importance_sampling_ratio/mean": 0.9754152297973633,
"sampling/importance_sampling_ratio/min": 7.993090099672372e-18,
"sampling/sampling_logp_difference/max": 19.442655563354492,
"sampling/sampling_logp_difference/mean": 0.12714992463588715,
"step": 313,
"step_time": 36.64092221500323
},
{
"clip_ratio/high_max": 0.020432692486792803,
"clip_ratio/high_mean": 0.012169471243396401,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0160757212433964,
"entropy": 0.22766240686178207,
"epoch": 0.00628,
"grad_norm": 1.0967687368392944,
"kl": 0.8410563804209232,
"learning_rate": 9.999856882775207e-06,
"loss": -0.1182,
"step": 314,
"step_time": 6.334955206000814
},
{
"clip_ratio/high_max": 0.004464285913854837,
"clip_ratio/high_mean": 0.004185267724096775,
"clip_ratio/low_mean": 0.0020833334419876337,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006268601166084409,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1786.0,
"completions/max_terminated_length": 1786.0,
"completions/mean_length": 1605.5625,
"completions/mean_terminated_length": 1605.5625,
"completions/min_length": 1013.0,
"completions/min_terminated_length": 1013.0,
"entropy": 0.2169735934585333,
"epoch": 0.0063,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8618820905685425,
"kl": 0.4524347148835659,
"learning_rate": 9.999855851309658e-06,
"loss": -0.033,
"num_tokens": 11605988.0,
"reward": 4.734364032745361,
"reward_std": 8.99662971496582,
"rewards/rollout_reward_func/mean": 4.734364032745361,
"rewards/rollout_reward_func/std": 19.494327545166016,
"sampling/importance_sampling_ratio/max": 2.5793893337249756,
"sampling/importance_sampling_ratio/mean": 1.028910756111145,
"sampling/importance_sampling_ratio/min": 0.28705474734306335,
"sampling/sampling_logp_difference/max": 1.0486016273498535,
"sampling/sampling_logp_difference/mean": 0.04716923087835312,
"step": 315,
"step_time": 32.90857644800053
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.007942708441987634,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.013802083441987634,
"entropy": 0.21472137793898582,
"epoch": 0.00632,
"grad_norm": 1.891473412513733,
"kl": 0.4403880871832371,
"learning_rate": 9.999854816140558e-06,
"loss": -0.0362,
"step": 316,
"step_time": 5.758545238999432
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1774.0,
"completions/max_terminated_length": 1774.0,
"completions/mean_length": 1642.8125,
"completions/mean_terminated_length": 1642.8125,
"completions/min_length": 1517.0,
"completions/min_terminated_length": 1517.0,
"entropy": 0.2397665549069643,
"epoch": 0.00634,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6888549327850342,
"kl": 0.5931244716048241,
"learning_rate": 9.999853777267907e-06,
"loss": -0.1138,
"num_tokens": 11679099.0,
"reward": -2.320553779602051,
"reward_std": 8.914901733398438,
"rewards/rollout_reward_func/mean": -2.320553779602051,
"rewards/rollout_reward_func/std": 15.233270645141602,
"sampling/importance_sampling_ratio/max": 2.8582732677459717,
"sampling/importance_sampling_ratio/mean": 1.0699962377548218,
"sampling/importance_sampling_ratio/min": 4.828970068867372e-14,
"sampling/sampling_logp_difference/max": 30.196020126342773,
"sampling/sampling_logp_difference/mean": 0.11480410397052765,
"step": 317,
"step_time": 35.76147220499843
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.009765625,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.013671875,
"entropy": 0.24086195416748524,
"epoch": 0.00636,
"grad_norm": 1.3944815397262573,
"kl": 0.5775415897369385,
"learning_rate": 9.999852734691707e-06,
"loss": -0.1189,
"step": 318,
"step_time": 6.165533587000027
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1805.0,
"completions/max_terminated_length": 1805.0,
"completions/mean_length": 1617.375,
"completions/mean_terminated_length": 1617.375,
"completions/min_length": 1386.0,
"completions/min_terminated_length": 1386.0,
"entropy": 0.2378612495958805,
"epoch": 0.00638,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.602665662765503,
"kl": 0.5662538819015026,
"learning_rate": 9.999851688411959e-06,
"loss": 0.0126,
"num_tokens": 11751577.0,
"reward": 5.039939880371094,
"reward_std": 6.29449462890625,
"rewards/rollout_reward_func/mean": 5.039939880371094,
"rewards/rollout_reward_func/std": 8.958051681518555,
"sampling/importance_sampling_ratio/max": 2.9241015911102295,
"sampling/importance_sampling_ratio/mean": 1.070831060409546,
"sampling/importance_sampling_ratio/min": 0.13934408128261566,
"sampling/sampling_logp_difference/max": 1.6112802028656006,
"sampling/sampling_logp_difference/mean": 0.06019854545593262,
"step": 319,
"step_time": 36.286148504001176
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.00390625,
"clip_ratio/region_mean": 0.009765625,
"entropy": 0.23569470085203648,
"epoch": 0.0064,
"grad_norm": 1.5978208780288696,
"kl": 0.5462238825857639,
"learning_rate": 9.999850638428662e-06,
"loss": 0.0095,
"step": 320,
"step_time": 6.246102994995454
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2001.0,
"completions/max_terminated_length": 2001.0,
"completions/mean_length": 1882.96875,
"completions/mean_terminated_length": 1882.96875,
"completions/min_length": 1658.0,
"completions/min_terminated_length": 1658.0,
"entropy": 0.20035061426460743,
"epoch": 0.00642,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2168484926223755,
"kl": 0.5520738363265991,
"learning_rate": 9.99984958474182e-06,
"loss": 0.0107,
"num_tokens": 11832614.0,
"reward": 1.5634541511535645,
"reward_std": 4.704677581787109,
"rewards/rollout_reward_func/mean": 1.5634541511535645,
"rewards/rollout_reward_func/std": 6.349411964416504,
"sampling/importance_sampling_ratio/max": 1.8423182964324951,
"sampling/importance_sampling_ratio/mean": 0.8929311633110046,
"sampling/importance_sampling_ratio/min": 2.0857898741510894e-11,
"sampling/sampling_logp_difference/max": 21.57457160949707,
"sampling/sampling_logp_difference/mean": 0.08621137589216232,
"step": 321,
"step_time": 38.96727589300099
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.00685307034291327,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008589181466959417,
"entropy": 0.2013362180441618,
"epoch": 0.00644,
"grad_norm": 1.1177294254302979,
"kl": 0.5334508754312992,
"learning_rate": 9.999848527351434e-06,
"loss": 0.0072,
"step": 322,
"step_time": 6.2201312969991704
},
{
"clip_ratio/high_max": 0.0062806373462080956,
"clip_ratio/high_mean": 0.0031403186731040478,
"clip_ratio/low_mean": 0.0030381944961845875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006178513169288635,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1941.0,
"completions/max_terminated_length": 1941.0,
"completions/mean_length": 1820.03125,
"completions/mean_terminated_length": 1820.03125,
"completions/min_length": 1447.0,
"completions/min_terminated_length": 1447.0,
"entropy": 0.1882778126746416,
"epoch": 0.00646,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5726302862167358,
"kl": 0.49259715899825096,
"learning_rate": 9.999847466257501e-06,
"loss": -0.0134,
"num_tokens": 11911368.0,
"reward": 9.643357276916504,
"reward_std": 7.6050519943237305,
"rewards/rollout_reward_func/mean": 9.643357276916504,
"rewards/rollout_reward_func/std": 13.524218559265137,
"sampling/importance_sampling_ratio/max": 2.3086092472076416,
"sampling/importance_sampling_ratio/mean": 0.9916382431983948,
"sampling/importance_sampling_ratio/min": 1.4195225812507194e-19,
"sampling/sampling_logp_difference/max": 22.288389205932617,
"sampling/sampling_logp_difference/mean": 0.1148686632514,
"step": 323,
"step_time": 38.506465823003964
},
{
"clip_ratio/high_max": 0.016697304090484977,
"clip_ratio/high_mean": 0.008348652045242488,
"clip_ratio/low_mean": 0.0069444444961845875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015293096657842398,
"entropy": 0.1865296196192503,
"epoch": 0.00648,
"grad_norm": 1.318217158317566,
"kl": 0.4571525566279888,
"learning_rate": 9.999846401460027e-06,
"loss": -0.0185,
"step": 324,
"step_time": 6.599780938004187
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.005208333372138441,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666744276881,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1973.0,
"completions/max_terminated_length": 1973.0,
"completions/mean_length": 1880.78125,
"completions/mean_terminated_length": 1880.78125,
"completions/min_length": 1628.0,
"completions/min_terminated_length": 1628.0,
"entropy": 0.1915543656796217,
"epoch": 0.0065,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4827704429626465,
"kl": 0.4624154604971409,
"learning_rate": 9.999845332959009e-06,
"loss": 0.1599,
"num_tokens": 11991754.0,
"reward": 7.904694557189941,
"reward_std": 11.988627433776855,
"rewards/rollout_reward_func/mean": 7.904694557189941,
"rewards/rollout_reward_func/std": 14.8968505859375,
"sampling/importance_sampling_ratio/max": 2.7791874408721924,
"sampling/importance_sampling_ratio/mean": 1.0366264581680298,
"sampling/importance_sampling_ratio/min": 0.18443353474140167,
"sampling/sampling_logp_difference/max": 1.049269437789917,
"sampling/sampling_logp_difference/mean": 0.03614144027233124,
"step": 325,
"step_time": 38.895849883001574
},
{
"clip_ratio/high_max": 0.024305555736646056,
"clip_ratio/high_mean": 0.013888889108784497,
"clip_ratio/low_mean": 0.015625000232830644,
"clip_ratio/low_min": 0.0069444444961845875,
"clip_ratio/region_mean": 0.029513889458030462,
"entropy": 0.1898924522101879,
"epoch": 0.00652,
"grad_norm": 1.1585363149642944,
"kl": 0.4361311122775078,
"learning_rate": 9.999844260754452e-06,
"loss": 0.1548,
"step": 326,
"step_time": 6.17809536400091
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1976.0,
"completions/max_terminated_length": 1976.0,
"completions/mean_length": 1838.875,
"completions/mean_terminated_length": 1838.875,
"completions/min_length": 1662.0,
"completions/min_terminated_length": 1662.0,
"entropy": 0.15969707537442446,
"epoch": 0.00654,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8774885535240173,
"kl": 0.4521552287042141,
"learning_rate": 9.999843184846355e-06,
"loss": -0.253,
"num_tokens": 12071372.0,
"reward": 5.943417549133301,
"reward_std": 6.620966911315918,
"rewards/rollout_reward_func/mean": 5.943417549133301,
"rewards/rollout_reward_func/std": 13.051533699035645,
"sampling/importance_sampling_ratio/max": 2.3298087120056152,
"sampling/importance_sampling_ratio/mean": 1.028981328010559,
"sampling/importance_sampling_ratio/min": 5.5275731908333015e-12,
"sampling/sampling_logp_difference/max": 24.37394905090332,
"sampling/sampling_logp_difference/mean": 0.08017994463443756,
"step": 327,
"step_time": 36.11553010699936
},
{
"clip_ratio/high_max": 0.010438166558742523,
"clip_ratio/high_mean": 0.005219083279371262,
"clip_ratio/low_mean": 0.005219083279371262,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010438166558742523,
"entropy": 0.1542342221364379,
"epoch": 0.00656,
"grad_norm": 0.9030295014381409,
"kl": 0.4645962119102478,
"learning_rate": 9.999842105234718e-06,
"loss": -0.2544,
"step": 328,
"step_time": 6.171020930996747
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1962.0,
"completions/max_terminated_length": 1962.0,
"completions/mean_length": 1846.5,
"completions/mean_terminated_length": 1846.5,
"completions/min_length": 1606.0,
"completions/min_terminated_length": 1606.0,
"entropy": 0.1713305152952671,
"epoch": 0.00658,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3594589233398438,
"kl": 0.41948344372212887,
"learning_rate": 9.999841021919543e-06,
"loss": -0.1754,
"num_tokens": 12150665.0,
"reward": 6.950708866119385,
"reward_std": 8.736629486083984,
"rewards/rollout_reward_func/mean": 6.950708866119385,
"rewards/rollout_reward_func/std": 12.616031646728516,
"sampling/importance_sampling_ratio/max": 2.0336740016937256,
"sampling/importance_sampling_ratio/mean": 1.0751011371612549,
"sampling/importance_sampling_ratio/min": 8.473521770314615e-14,
"sampling/sampling_logp_difference/max": 30.0931339263916,
"sampling/sampling_logp_difference/mean": 0.08403386175632477,
"step": 329,
"step_time": 37.165045843999906
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010325292591005564,
"clip_ratio/low_min": 0.006761695956811309,
"clip_ratio/region_mean": 0.010325292591005564,
"entropy": 0.16616453044116497,
"epoch": 0.0066,
"grad_norm": 1.1647483110427856,
"kl": 0.4359829295426607,
"learning_rate": 9.999839934900832e-06,
"loss": -0.179,
"step": 330,
"step_time": 6.586650393001037
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1966.0,
"completions/max_terminated_length": 1966.0,
"completions/mean_length": 1874.34375,
"completions/mean_terminated_length": 1874.34375,
"completions/min_length": 1775.0,
"completions/min_terminated_length": 1775.0,
"entropy": 0.1693086363375187,
"epoch": 0.00662,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4499868154525757,
"kl": 0.6940734013915062,
"learning_rate": 9.999838844178584e-06,
"loss": -0.1581,
"num_tokens": 12231256.0,
"reward": 9.051456451416016,
"reward_std": 5.1797261238098145,
"rewards/rollout_reward_func/mean": 9.051456451416016,
"rewards/rollout_reward_func/std": 9.840641975402832,
"sampling/importance_sampling_ratio/max": 1.9182367324829102,
"sampling/importance_sampling_ratio/mean": 0.8677526116371155,
"sampling/importance_sampling_ratio/min": 0.21601787209510803,
"sampling/sampling_logp_difference/max": 1.2895514965057373,
"sampling/sampling_logp_difference/mean": 0.04901476204395294,
"step": 331,
"step_time": 40.41131205700003
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.01215277798473835,
"clip_ratio/low_min": 0.0069444444961845875,
"clip_ratio/region_mean": 0.013888889108784497,
"entropy": 0.1622045375406742,
"epoch": 0.00664,
"grad_norm": 1.332248330116272,
"kl": 0.7565370872616768,
"learning_rate": 9.999837749752804e-06,
"loss": -0.16,
"step": 332,
"step_time": 6.1724167310003395
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2012.0,
"completions/max_terminated_length": 2012.0,
"completions/mean_length": 1871.0625,
"completions/mean_terminated_length": 1871.0625,
"completions/min_length": 1509.0,
"completions/min_terminated_length": 1509.0,
"entropy": 0.15144985355436802,
"epoch": 0.00666,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.2533303499221802,
"kl": 0.31190009601414204,
"learning_rate": 9.999836651623489e-06,
"loss": 0.0864,
"num_tokens": 12311600.0,
"reward": 2.6044352054595947,
"reward_std": 5.959412574768066,
"rewards/rollout_reward_func/mean": 2.6044352054595947,
"rewards/rollout_reward_func/std": 14.666269302368164,
"sampling/importance_sampling_ratio/max": 1.9139474630355835,
"sampling/importance_sampling_ratio/mean": 0.953168511390686,
"sampling/importance_sampling_ratio/min": 1.1792734045952113e-14,
"sampling/sampling_logp_difference/max": 17.824750900268555,
"sampling/sampling_logp_difference/mean": 0.07734020054340363,
"step": 333,
"step_time": 39.06632298299701
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0029861110961064696,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006458333344198763,
"entropy": 0.1482429150491953,
"epoch": 0.00668,
"grad_norm": 1.049380898475647,
"kl": 0.31447366066277027,
"learning_rate": 9.99983554979064e-06,
"loss": 0.0833,
"step": 334,
"step_time": 6.242611449002652
},
{
"clip_ratio/high_max": 0.01736111124046147,
"clip_ratio/high_mean": 0.008680555620230734,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666744276881,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2005.0,
"completions/max_terminated_length": 2005.0,
"completions/mean_length": 1884.21875,
"completions/mean_terminated_length": 1884.21875,
"completions/min_length": 1752.0,
"completions/min_terminated_length": 1752.0,
"entropy": 0.1837616004049778,
"epoch": 0.0067,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5765018463134766,
"kl": 0.6941291987895966,
"learning_rate": 9.999834444254261e-06,
"loss": -0.0101,
"num_tokens": 12392452.0,
"reward": 2.9032256603240967,
"reward_std": 8.3020601272583,
"rewards/rollout_reward_func/mean": 2.9032256603240967,
"rewards/rollout_reward_func/std": 10.438464164733887,
"sampling/importance_sampling_ratio/max": 2.3026082515716553,
"sampling/importance_sampling_ratio/mean": 0.8688405752182007,
"sampling/importance_sampling_ratio/min": 0.06608447432518005,
"sampling/sampling_logp_difference/max": 1.0330348014831543,
"sampling/sampling_logp_difference/mean": 0.04626619443297386,
"step": 335,
"step_time": 38.017552094001076
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008680555620230734,
"entropy": 0.1806764118373394,
"epoch": 0.00672,
"grad_norm": 1.4784622192382812,
"kl": 0.6689932979643345,
"learning_rate": 9.999833335014352e-06,
"loss": -0.0126,
"step": 336,
"step_time": 6.23428071699891
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0069444444961845875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1985.0,
"completions/max_terminated_length": 1985.0,
"completions/mean_length": 1826.21875,
"completions/mean_terminated_length": 1826.21875,
"completions/min_length": 1612.0,
"completions/min_terminated_length": 1612.0,
"entropy": 0.16721994522958994,
"epoch": 0.00674,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.0283710956573486,
"kl": 0.5365333259105682,
"learning_rate": 9.999832222070915e-06,
"loss": -0.0433,
"num_tokens": 12471329.0,
"reward": 5.5746378898620605,
"reward_std": 9.998064041137695,
"rewards/rollout_reward_func/mean": 5.5746378898620605,
"rewards/rollout_reward_func/std": 12.20602035522461,
"sampling/importance_sampling_ratio/max": 1.3266735076904297,
"sampling/importance_sampling_ratio/mean": 0.7684075832366943,
"sampling/importance_sampling_ratio/min": 2.1960254242760603e-20,
"sampling/sampling_logp_difference/max": 29.37782096862793,
"sampling/sampling_logp_difference/mean": 0.16268780827522278,
"step": 337,
"step_time": 37.611019209001824
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.0069444444961845875,
"clip_ratio/low_mean": 0.0069444444961845875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013888889108784497,
"entropy": 0.16553544532507658,
"epoch": 0.00676,
"grad_norm": 1.0066049098968506,
"kl": 0.5513498038053513,
"learning_rate": 9.999831105423947e-06,
"loss": -0.0434,
"step": 338,
"step_time": 6.205691562003267
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1959.0,
"completions/max_terminated_length": 1959.0,
"completions/mean_length": 1858.40625,
"completions/mean_terminated_length": 1858.40625,
"completions/min_length": 1771.0,
"completions/min_terminated_length": 1771.0,
"entropy": 0.13813489489257336,
"epoch": 0.00678,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.2954949140548706,
"kl": 0.46643203496932983,
"learning_rate": 9.999829985073454e-06,
"loss": -0.0691,
"num_tokens": 12551129.0,
"reward": 4.567227363586426,
"reward_std": 4.1049957275390625,
"rewards/rollout_reward_func/mean": 4.567227363586426,
"rewards/rollout_reward_func/std": 9.307647705078125,
"sampling/importance_sampling_ratio/max": 2.0119011402130127,
"sampling/importance_sampling_ratio/mean": 1.006131887435913,
"sampling/importance_sampling_ratio/min": 0.2738450765609741,
"sampling/sampling_logp_difference/max": 1.310835838317871,
"sampling/sampling_logp_difference/mean": 0.03615020588040352,
"step": 339,
"step_time": 38.483633726000335
},
{
"clip_ratio/high_max": 0.013888888992369175,
"clip_ratio/high_mean": 0.0069444444961845875,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0069444444961845875,
"entropy": 0.13802602514624596,
"epoch": 0.0068,
"grad_norm": 1.058447003364563,
"kl": 0.43485408648848534,
"learning_rate": 9.999828861019437e-06,
"loss": -0.0715,
"step": 340,
"step_time": 6.148799068003427
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005208333372138441,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1973.0,
"completions/max_terminated_length": 1973.0,
"completions/mean_length": 1823.65625,
"completions/mean_terminated_length": 1823.65625,
"completions/min_length": 274.0,
"completions/min_terminated_length": 274.0,
"entropy": 0.14465732499957085,
"epoch": 0.00682,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9615039825439453,
"kl": 1.5087429098784924,
"learning_rate": 9.999827733261892e-06,
"loss": 0.0479,
"num_tokens": 12629652.0,
"reward": 2.9783196449279785,
"reward_std": 6.66061544418335,
"rewards/rollout_reward_func/mean": 2.9783196449279785,
"rewards/rollout_reward_func/std": 10.88323974609375,
"sampling/importance_sampling_ratio/max": 1.377437949180603,
"sampling/importance_sampling_ratio/mean": 0.8548128604888916,
"sampling/importance_sampling_ratio/min": 0.24347716569900513,
"sampling/sampling_logp_difference/max": 1.1083741188049316,
"sampling/sampling_logp_difference/mean": 0.030560657382011414,
"step": 341,
"step_time": 38.09668472299927
},
{
"clip_ratio/high_max": 0.01609848509542644,
"clip_ratio/high_mean": 0.011521464679390192,
"clip_ratio/low_mean": 0.009785353671759367,
"clip_ratio/low_min": 0.0034722222480922937,
"clip_ratio/region_mean": 0.02130681835114956,
"entropy": 0.14760607294738293,
"epoch": 0.00684,
"grad_norm": 1.4034626483917236,
"kl": 1.1786439195275307,
"learning_rate": 9.999826601800824e-06,
"loss": 0.0401,
"step": 342,
"step_time": 6.157314127001882
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1998.0,
"completions/max_terminated_length": 1998.0,
"completions/mean_length": 1769.34375,
"completions/mean_terminated_length": 1769.34375,
"completions/min_length": 286.0,
"completions/min_terminated_length": 286.0,
"entropy": 0.15295273158699274,
"epoch": 0.00686,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.4741613864898682,
"kl": 0.3553418479859829,
"learning_rate": 9.999825466636233e-06,
"loss": -0.0553,
"num_tokens": 12707171.0,
"reward": 10.171146392822266,
"reward_std": 4.205057144165039,
"rewards/rollout_reward_func/mean": 10.171146392822266,
"rewards/rollout_reward_func/std": 19.68203353881836,
"sampling/importance_sampling_ratio/max": 1.8806638717651367,
"sampling/importance_sampling_ratio/mean": 1.025322437286377,
"sampling/importance_sampling_ratio/min": 0.20550324022769928,
"sampling/sampling_logp_difference/max": 0.8229107856750488,
"sampling/sampling_logp_difference/mean": 0.025263220071792603,
"step": 343,
"step_time": 37.784647938999115
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0069444444961845875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666744276881,
"entropy": 0.156339006498456,
"epoch": 0.00688,
"grad_norm": 1.2938135862350464,
"kl": 0.3518723715096712,
"learning_rate": 9.999824327768121e-06,
"loss": -0.0577,
"step": 344,
"step_time": 6.225835901999744
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0012499999720603228,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0012499999720603228,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2030.0,
"completions/max_terminated_length": 2030.0,
"completions/mean_length": 1841.6875,
"completions/mean_terminated_length": 1841.6875,
"completions/min_length": 1579.0,
"completions/min_terminated_length": 1579.0,
"entropy": 0.15647383406758308,
"epoch": 0.0069,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5851750373840332,
"kl": 0.4456126391887665,
"learning_rate": 9.99982318519649e-06,
"loss": 0.0497,
"num_tokens": 12786883.0,
"reward": 4.234543323516846,
"reward_std": 7.14749813079834,
"rewards/rollout_reward_func/mean": 4.234543323516846,
"rewards/rollout_reward_func/std": 11.683938980102539,
"sampling/importance_sampling_ratio/max": 2.441897392272949,
"sampling/importance_sampling_ratio/mean": 0.9281522035598755,
"sampling/importance_sampling_ratio/min": 5.669828341647121e-19,
"sampling/sampling_logp_difference/max": 24.185270309448242,
"sampling/sampling_logp_difference/mean": 0.1490144282579422,
"step": 345,
"step_time": 38.19426675700015
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.0069444444961845875,
"clip_ratio/low_mean": 0.005208333372138441,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012152777868323028,
"entropy": 0.1573875080794096,
"epoch": 0.00692,
"grad_norm": 1.2656595706939697,
"kl": 0.42021266743540764,
"learning_rate": 9.999822038921339e-06,
"loss": 0.0471,
"step": 346,
"step_time": 6.839767702995232
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005208333372138441,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2032.0,
"completions/max_terminated_length": 2032.0,
"completions/mean_length": 1904.75,
"completions/mean_terminated_length": 1904.75,
"completions/min_length": 1528.0,
"completions/min_terminated_length": 1528.0,
"entropy": 0.14398007839918137,
"epoch": 0.00694,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1126422882080078,
"kl": 0.39460230618715286,
"learning_rate": 9.99982088894267e-06,
"loss": -0.0541,
"num_tokens": 12868503.0,
"reward": 4.4482550621032715,
"reward_std": 6.246906757354736,
"rewards/rollout_reward_func/mean": 4.4482550621032715,
"rewards/rollout_reward_func/std": 11.603610038757324,
"sampling/importance_sampling_ratio/max": 1.5690655708312988,
"sampling/importance_sampling_ratio/mean": 0.9412245750427246,
"sampling/importance_sampling_ratio/min": 0.37497684359550476,
"sampling/sampling_logp_difference/max": 1.0045862197875977,
"sampling/sampling_logp_difference/mean": 0.0292842797935009,
"step": 347,
"step_time": 36.1456481869991
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0069444444961845875,
"entropy": 0.14603436551988125,
"epoch": 0.00696,
"grad_norm": 1.0782148838043213,
"kl": 0.4032270349562168,
"learning_rate": 9.999819735260483e-06,
"loss": -0.0577,
"step": 348,
"step_time": 6.739517917001649
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2039.0,
"completions/max_terminated_length": 2039.0,
"completions/mean_length": 1886.75,
"completions/mean_terminated_length": 1886.75,
"completions/min_length": 1634.0,
"completions/min_terminated_length": 1634.0,
"entropy": 0.1570840571075678,
"epoch": 0.00698,
"frac_reward_zero_std": 0.25,
"grad_norm": 2.835125684738159,
"kl": 1.7887609116733074,
"learning_rate": 9.999818577874782e-06,
"loss": -0.1581,
"num_tokens": 12949689.0,
"reward": 9.66476821899414,
"reward_std": 5.8628249168396,
"rewards/rollout_reward_func/mean": 9.66476821899414,
"rewards/rollout_reward_func/std": 10.391725540161133,
"sampling/importance_sampling_ratio/max": 1.674692153930664,
"sampling/importance_sampling_ratio/mean": 0.9785152673721313,
"sampling/importance_sampling_ratio/min": 0.15405651926994324,
"sampling/sampling_logp_difference/max": 1.66461181640625,
"sampling/sampling_logp_difference/mean": 0.033790212124586105,
"step": 349,
"step_time": 38.58619033600189
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"entropy": 0.15997820533812046,
"epoch": 0.007,
"grad_norm": 2.284522771835327,
"kl": 1.5180134773254395,
"learning_rate": 9.999817416785565e-06,
"loss": -0.1585,
"step": 350,
"step_time": 6.308404929000972
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1867.0,
"completions/max_terminated_length": 1867.0,
"completions/mean_length": 1791.1875,
"completions/mean_terminated_length": 1791.1875,
"completions/min_length": 1090.0,
"completions/min_terminated_length": 1090.0,
"entropy": 0.17174389213323593,
"epoch": 0.00702,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4891026020050049,
"kl": 1.113628275692463,
"learning_rate": 9.999816251992836e-06,
"loss": -0.0978,
"num_tokens": 13027385.0,
"reward": 2.3329522609710693,
"reward_std": 7.594513416290283,
"rewards/rollout_reward_func/mean": 2.3329522609710693,
"rewards/rollout_reward_func/std": 12.151657104492188,
"sampling/importance_sampling_ratio/max": 1.5326077938079834,
"sampling/importance_sampling_ratio/mean": 0.874591052532196,
"sampling/importance_sampling_ratio/min": 0.13500602543354034,
"sampling/sampling_logp_difference/max": 1.4848179817199707,
"sampling/sampling_logp_difference/mean": 0.03452059626579285,
"step": 351,
"step_time": 38.5899223109991
},
{
"clip_ratio/high_max": 0.013888888992369175,
"clip_ratio/high_mean": 0.0069444444961845875,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0069444444961845875,
"entropy": 0.17624721303582191,
"epoch": 0.00704,
"grad_norm": 0.9316499829292297,
"kl": 0.7885981798171997,
"learning_rate": 9.999815083496593e-06,
"loss": -0.1029,
"step": 352,
"step_time": 6.3719571820001875
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1968.0,
"completions/max_terminated_length": 1968.0,
"completions/mean_length": 1887.71875,
"completions/mean_terminated_length": 1887.71875,
"completions/min_length": 1804.0,
"completions/min_terminated_length": 1804.0,
"entropy": 0.18038272112607956,
"epoch": 0.00706,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4058349132537842,
"kl": 0.5268550999462605,
"learning_rate": 9.99981391129684e-06,
"loss": -0.1831,
"num_tokens": 13108436.0,
"reward": 7.138474941253662,
"reward_std": 9.193456649780273,
"rewards/rollout_reward_func/mean": 7.138474941253662,
"rewards/rollout_reward_func/std": 11.277852058410645,
"sampling/importance_sampling_ratio/max": 2.048379898071289,
"sampling/importance_sampling_ratio/mean": 0.9898632168769836,
"sampling/importance_sampling_ratio/min": 0.31908681988716125,
"sampling/sampling_logp_difference/max": 1.2619354724884033,
"sampling/sampling_logp_difference/mean": 0.04246928542852402,
"step": 353,
"step_time": 36.779780131000734
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.005208333372138441,
"clip_ratio/low_min": 0.0034722222480922937,
"clip_ratio/region_mean": 0.010416666744276881,
"entropy": 0.18332521431148052,
"epoch": 0.00708,
"grad_norm": 1.734959363937378,
"kl": 0.6400851979851723,
"learning_rate": 9.999812735393578e-06,
"loss": -0.1861,
"step": 354,
"step_time": 6.657307188001141
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2001.0,
"completions/max_terminated_length": 2001.0,
"completions/mean_length": 1855.8125,
"completions/mean_terminated_length": 1855.8125,
"completions/min_length": 1417.0,
"completions/min_terminated_length": 1417.0,
"entropy": 0.15774557553231716,
"epoch": 0.0071,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1801131963729858,
"kl": 0.28479940071702003,
"learning_rate": 9.999811555786805e-06,
"loss": -0.0644,
"num_tokens": 13188201.0,
"reward": 6.061254501342773,
"reward_std": 11.283492088317871,
"rewards/rollout_reward_func/mean": 6.061254501342773,
"rewards/rollout_reward_func/std": 20.36028480529785,
"sampling/importance_sampling_ratio/max": 1.5566022396087646,
"sampling/importance_sampling_ratio/mean": 0.9729775786399841,
"sampling/importance_sampling_ratio/min": 0.22793923318386078,
"sampling/sampling_logp_difference/max": 0.8676626682281494,
"sampling/sampling_logp_difference/mean": 0.031204868108034134,
"step": 355,
"step_time": 35.57014237199837
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005208333372138441,
"entropy": 0.15768986009061337,
"epoch": 0.00712,
"grad_norm": 1.0285789966583252,
"kl": 0.2967808712273836,
"learning_rate": 9.999810372476526e-06,
"loss": -0.0677,
"step": 356,
"step_time": 6.207597920001717
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2018.0,
"completions/max_terminated_length": 2018.0,
"completions/mean_length": 1912.25,
"completions/mean_terminated_length": 1912.25,
"completions/min_length": 1727.0,
"completions/min_terminated_length": 1727.0,
"entropy": 0.15802463050931692,
"epoch": 0.00714,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.4909172058105469,
"kl": 0.6946889795362949,
"learning_rate": 9.99980918546274e-06,
"loss": -0.064,
"num_tokens": 13269840.0,
"reward": 8.720025062561035,
"reward_std": 8.605905532836914,
"rewards/rollout_reward_func/mean": 8.720025062561035,
"rewards/rollout_reward_func/std": 13.064443588256836,
"sampling/importance_sampling_ratio/max": 2.113987922668457,
"sampling/importance_sampling_ratio/mean": 0.9241708517074585,
"sampling/importance_sampling_ratio/min": 0.17220593988895416,
"sampling/sampling_logp_difference/max": 1.0862641334533691,
"sampling/sampling_logp_difference/mean": 0.03762374818325043,
"step": 357,
"step_time": 39.26908052300132
},
{
"clip_ratio/high_max": 0.013888888992369175,
"clip_ratio/high_mean": 0.0069444444961845875,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008680555620230734,
"entropy": 0.15692736767232418,
"epoch": 0.00716,
"grad_norm": 1.3770283460617065,
"kl": 0.682164192199707,
"learning_rate": 9.999807994745449e-06,
"loss": -0.0657,
"step": 358,
"step_time": 6.277640965996397
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0069444444961845875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1936.0,
"completions/max_terminated_length": 1936.0,
"completions/mean_length": 1804.21875,
"completions/mean_terminated_length": 1804.21875,
"completions/min_length": 1351.0,
"completions/min_terminated_length": 1351.0,
"entropy": 0.15549181029200554,
"epoch": 0.00718,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.9340437650680542,
"kl": 0.30343155562877655,
"learning_rate": 9.999806800324652e-06,
"loss": -0.0304,
"num_tokens": 13347887.0,
"reward": -1.006221055984497,
"reward_std": 8.046869277954102,
"rewards/rollout_reward_func/mean": -1.006221055984497,
"rewards/rollout_reward_func/std": 12.83417797088623,
"sampling/importance_sampling_ratio/max": 1.6891496181488037,
"sampling/importance_sampling_ratio/mean": 0.9606433510780334,
"sampling/importance_sampling_ratio/min": 3.726897696704201e-12,
"sampling/sampling_logp_difference/max": 25.532629013061523,
"sampling/sampling_logp_difference/mean": 0.06780634075403214,
"step": 359,
"step_time": 37.24431940499744
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.007161458255723119,
"clip_ratio/low_min": 0.0034722222480922937,
"clip_ratio/region_mean": 0.010633680503815413,
"entropy": 0.15154966711997986,
"epoch": 0.0072,
"grad_norm": 0.7721959948539734,
"kl": 0.30717028118669987,
"learning_rate": 9.999805602200355e-06,
"loss": -0.0332,
"step": 360,
"step_time": 6.086639444998582
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1945.0,
"completions/max_terminated_length": 1945.0,
"completions/mean_length": 1784.75,
"completions/mean_terminated_length": 1784.75,
"completions/min_length": 897.0,
"completions/min_terminated_length": 897.0,
"entropy": 0.1756394449621439,
"epoch": 0.00722,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9486719369888306,
"kl": 0.5046870671212673,
"learning_rate": 9.999804400372553e-06,
"loss": 0.0591,
"num_tokens": 13425212.0,
"reward": 1.9548215866088867,
"reward_std": 7.599405765533447,
"rewards/rollout_reward_func/mean": 1.9548215866088867,
"rewards/rollout_reward_func/std": 11.06623363494873,
"sampling/importance_sampling_ratio/max": 2.453582525253296,
"sampling/importance_sampling_ratio/mean": 1.1315981149673462,
"sampling/importance_sampling_ratio/min": 0.2927221357822418,
"sampling/sampling_logp_difference/max": 0.9818147420883179,
"sampling/sampling_logp_difference/mean": 0.02989993989467621,
"step": 361,
"step_time": 36.09027150499969
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.006944444612599909,
"clip_ratio/low_mean": 0.005208333372138441,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01215277798473835,
"entropy": 0.17177880927920341,
"epoch": 0.00724,
"grad_norm": 1.6040363311767578,
"kl": 0.5513662751764059,
"learning_rate": 9.999803194841253e-06,
"loss": 0.0551,
"step": 362,
"step_time": 6.112345547999212
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005208333372138441,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2001.0,
"completions/max_terminated_length": 2001.0,
"completions/mean_length": 1889.8125,
"completions/mean_terminated_length": 1889.8125,
"completions/min_length": 1789.0,
"completions/min_terminated_length": 1789.0,
"entropy": 0.1901466716080904,
"epoch": 0.00726,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.559001922607422,
"kl": 1.604296986013651,
"learning_rate": 9.999801985606451e-06,
"loss": -0.0359,
"num_tokens": 13506437.0,
"reward": 12.421402931213379,
"reward_std": 7.902709484100342,
"rewards/rollout_reward_func/mean": 12.421402931213379,
"rewards/rollout_reward_func/std": 11.493805885314941,
"sampling/importance_sampling_ratio/max": 2.2942121028900146,
"sampling/importance_sampling_ratio/mean": 0.9992972612380981,
"sampling/importance_sampling_ratio/min": 0.1637697070837021,
"sampling/sampling_logp_difference/max": 1.1397209167480469,
"sampling/sampling_logp_difference/mean": 0.038260094821453094,
"step": 363,
"step_time": 38.05201607199888
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0069444444961845875,
"entropy": 0.18919656239449978,
"epoch": 0.00728,
"grad_norm": 2.6743459701538086,
"kl": 1.4820591136813164,
"learning_rate": 9.999800772668154e-06,
"loss": -0.038,
"step": 364,
"step_time": 6.24283971099976
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1969.0,
"completions/max_terminated_length": 1969.0,
"completions/mean_length": 1859.03125,
"completions/mean_terminated_length": 1859.03125,
"completions/min_length": 1485.0,
"completions/min_terminated_length": 1485.0,
"entropy": 0.16470451280474663,
"epoch": 0.0073,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2065528631210327,
"kl": 0.41300537437200546,
"learning_rate": 9.999799556026358e-06,
"loss": -0.0763,
"num_tokens": 13586301.0,
"reward": 11.647848129272461,
"reward_std": 5.665955543518066,
"rewards/rollout_reward_func/mean": 11.647848129272461,
"rewards/rollout_reward_func/std": 16.600069046020508,
"sampling/importance_sampling_ratio/max": 1.5297025442123413,
"sampling/importance_sampling_ratio/mean": 1.0060566663742065,
"sampling/importance_sampling_ratio/min": 0.3258414566516876,
"sampling/sampling_logp_difference/max": 0.5945889353752136,
"sampling/sampling_logp_difference/mean": 0.02704280987381935,
"step": 365,
"step_time": 38.81224459800069
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.005208333372138441,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666744276881,
"entropy": 0.16255084611475468,
"epoch": 0.00732,
"grad_norm": 0.9401413202285767,
"kl": 0.4195688497275114,
"learning_rate": 9.999798335681066e-06,
"loss": -0.0813,
"step": 366,
"step_time": 6.165453665998939
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1964.0,
"completions/max_terminated_length": 1964.0,
"completions/mean_length": 1833.46875,
"completions/mean_terminated_length": 1833.46875,
"completions/min_length": 1624.0,
"completions/min_terminated_length": 1624.0,
"entropy": 0.17754663713276386,
"epoch": 0.00734,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9974569082260132,
"kl": 1.130824089050293,
"learning_rate": 9.99979711163228e-06,
"loss": -0.0937,
"num_tokens": 13665217.0,
"reward": 5.195849418640137,
"reward_std": 10.393113136291504,
"rewards/rollout_reward_func/mean": 5.195849418640137,
"rewards/rollout_reward_func/std": 12.448369026184082,
"sampling/importance_sampling_ratio/max": 1.792784571647644,
"sampling/importance_sampling_ratio/mean": 0.8298041820526123,
"sampling/importance_sampling_ratio/min": 4.1797170108673343e-13,
"sampling/sampling_logp_difference/max": 27.452041625976562,
"sampling/sampling_logp_difference/mean": 0.1302998661994934,
"step": 367,
"step_time": 36.412708622001446
},
{
"clip_ratio/high_max": 0.010233918204903603,
"clip_ratio/high_mean": 0.005116959102451801,
"clip_ratio/low_mean": 0.0016891892300918698,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006806148332543671,
"entropy": 0.17670008912682533,
"epoch": 0.00736,
"grad_norm": 1.4283668994903564,
"kl": 0.8111900072544813,
"learning_rate": 9.999795883880002e-06,
"loss": -0.0963,
"step": 368,
"step_time": 6.141397028004576
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2000.0,
"completions/max_terminated_length": 2000.0,
"completions/mean_length": 1903.15625,
"completions/mean_terminated_length": 1903.15625,
"completions/min_length": 1771.0,
"completions/min_terminated_length": 1771.0,
"entropy": 0.14921395294368267,
"epoch": 0.00738,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.2093360424041748,
"kl": 0.44058969616889954,
"learning_rate": 9.999794652424228e-06,
"loss": 0.0713,
"num_tokens": 13746632.0,
"reward": 11.435522079467773,
"reward_std": 3.840212345123291,
"rewards/rollout_reward_func/mean": 11.435522079467773,
"rewards/rollout_reward_func/std": 13.30839729309082,
"sampling/importance_sampling_ratio/max": 2.1941890716552734,
"sampling/importance_sampling_ratio/mean": 0.9962909817695618,
"sampling/importance_sampling_ratio/min": 0.30016008019447327,
"sampling/sampling_logp_difference/max": 0.8217124938964844,
"sampling/sampling_logp_difference/mean": 0.029038339853286743,
"step": 369,
"step_time": 37.36318951800058
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.0069444444961845875,
"clip_ratio/low_min": 0.0034722222480922937,
"clip_ratio/region_mean": 0.01215277798473835,
"entropy": 0.14946964103728533,
"epoch": 0.0074,
"grad_norm": 1.5157506465911865,
"kl": 0.4827451854944229,
"learning_rate": 9.999793417264967e-06,
"loss": 0.0668,
"step": 370,
"step_time": 6.7108045629993285
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005208333372138441,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2005.0,
"completions/max_terminated_length": 2005.0,
"completions/mean_length": 1862.59375,
"completions/mean_terminated_length": 1862.59375,
"completions/min_length": 1764.0,
"completions/min_terminated_length": 1764.0,
"entropy": 0.1261264430359006,
"epoch": 0.00742,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1795105934143066,
"kl": 0.7585394252091646,
"learning_rate": 9.999792178402215e-06,
"loss": -0.0116,
"num_tokens": 13827147.0,
"reward": 7.017940521240234,
"reward_std": 5.937340259552002,
"rewards/rollout_reward_func/mean": 7.017940521240234,
"rewards/rollout_reward_func/std": 14.372506141662598,
"sampling/importance_sampling_ratio/max": 1.6552984714508057,
"sampling/importance_sampling_ratio/mean": 0.9012160301208496,
"sampling/importance_sampling_ratio/min": 0.2452741116285324,
"sampling/sampling_logp_difference/max": 1.3038992881774902,
"sampling/sampling_logp_difference/mean": 0.02553473599255085,
"step": 371,
"step_time": 37.5200551709986
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0069444444961845875,
"entropy": 0.12507631909102201,
"epoch": 0.00744,
"grad_norm": 1.228213906288147,
"kl": 0.8153098970651627,
"learning_rate": 9.999790935835974e-06,
"loss": -0.0135,
"step": 372,
"step_time": 6.234621702000368
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1977.0,
"completions/max_terminated_length": 1977.0,
"completions/mean_length": 1880.25,
"completions/mean_terminated_length": 1880.25,
"completions/min_length": 1754.0,
"completions/min_terminated_length": 1754.0,
"entropy": 0.11569633707404137,
"epoch": 0.00746,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.3457225561141968,
"kl": 0.887390311807394,
"learning_rate": 9.999789689566245e-06,
"loss": -0.0406,
"num_tokens": 13907765.0,
"reward": 12.74990463256836,
"reward_std": 5.970038414001465,
"rewards/rollout_reward_func/mean": 12.74990463256836,
"rewards/rollout_reward_func/std": 13.113883018493652,
"sampling/importance_sampling_ratio/max": 1.5790919065475464,
"sampling/importance_sampling_ratio/mean": 0.9145287275314331,
"sampling/importance_sampling_ratio/min": 2.7017030648258944e-13,
"sampling/sampling_logp_difference/max": 28.88240623474121,
"sampling/sampling_logp_difference/mean": 0.07740553468465805,
"step": 373,
"step_time": 37.90379121299884
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0069444444961845875,
"entropy": 0.11660288367420435,
"epoch": 0.00748,
"grad_norm": 1.325151801109314,
"kl": 0.7547343485057354,
"learning_rate": 9.999788439593031e-06,
"loss": -0.0431,
"step": 374,
"step_time": 6.616651901998921
},
{
"clip_ratio/high_max": 0.011111111380159855,
"clip_ratio/high_mean": 0.0055555556900799274,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007291666814126074,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2020.0,
"completions/max_terminated_length": 2020.0,
"completions/mean_length": 1847.59375,
"completions/mean_terminated_length": 1847.59375,
"completions/min_length": 1001.0,
"completions/min_terminated_length": 1001.0,
"entropy": 0.17600342631340027,
"epoch": 0.0075,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.814244031906128,
"kl": 0.8092060312628746,
"learning_rate": 9.999787185916332e-06,
"loss": 0.0845,
"num_tokens": 13987733.0,
"reward": 5.100118160247803,
"reward_std": 9.183606147766113,
"rewards/rollout_reward_func/mean": 5.100118160247803,
"rewards/rollout_reward_func/std": 14.02592658996582,
"sampling/importance_sampling_ratio/max": 1.662461519241333,
"sampling/importance_sampling_ratio/mean": 0.9842205047607422,
"sampling/importance_sampling_ratio/min": 0.22752133011817932,
"sampling/sampling_logp_difference/max": 0.6112067699432373,
"sampling/sampling_logp_difference/mean": 0.027879422530531883,
"step": 375,
"step_time": 36.84108008799922
},
{
"clip_ratio/high_max": 0.013888888992369175,
"clip_ratio/high_mean": 0.0069444444961845875,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666744276881,
"entropy": 0.17556224018335342,
"epoch": 0.00752,
"grad_norm": 1.3789916038513184,
"kl": 0.7915375307202339,
"learning_rate": 9.999785928536149e-06,
"loss": 0.0834,
"step": 376,
"step_time": 6.73390211300466
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2020.0,
"completions/max_terminated_length": 2020.0,
"completions/mean_length": 1849.84375,
"completions/mean_terminated_length": 1849.84375,
"completions/min_length": 1106.0,
"completions/min_terminated_length": 1106.0,
"entropy": 0.1529197357594967,
"epoch": 0.00754,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.711385726928711,
"kl": 0.8515200167894363,
"learning_rate": 9.999784667452484e-06,
"loss": -0.1104,
"num_tokens": 14067130.0,
"reward": 4.861234664916992,
"reward_std": 7.06196403503418,
"rewards/rollout_reward_func/mean": 4.861234664916992,
"rewards/rollout_reward_func/std": 13.409940719604492,
"sampling/importance_sampling_ratio/max": 1.5862915515899658,
"sampling/importance_sampling_ratio/mean": 0.9584461450576782,
"sampling/importance_sampling_ratio/min": 0.14316152036190033,
"sampling/sampling_logp_difference/max": 1.0026142597198486,
"sampling/sampling_logp_difference/mean": 0.03037048876285553,
"step": 377,
"step_time": 36.54281629899742
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0069444444961845875,
"entropy": 0.15627212449908257,
"epoch": 0.00756,
"grad_norm": 1.1648268699645996,
"kl": 0.8427795469760895,
"learning_rate": 9.999783402665337e-06,
"loss": -0.1098,
"step": 378,
"step_time": 6.265584359001878
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1941.0,
"completions/max_terminated_length": 1941.0,
"completions/mean_length": 1823.875,
"completions/mean_terminated_length": 1823.1290283203125,
"completions/min_length": 1391.0,
"completions/min_terminated_length": 1391.0,
"entropy": 0.14846901781857014,
"epoch": 0.00758,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3327255249023438,
"kl": 0.391092361882329,
"learning_rate": 9.999782134174711e-06,
"loss": 0.0327,
"num_tokens": 14146165.0,
"reward": -0.792694091796875,
"reward_std": 8.190790176391602,
"rewards/rollout_reward_func/mean": -0.792694091796875,
"rewards/rollout_reward_func/std": 14.544111251831055,
"sampling/importance_sampling_ratio/max": 1.8940695524215698,
"sampling/importance_sampling_ratio/mean": 0.9296935200691223,
"sampling/importance_sampling_ratio/min": 0.392696738243103,
"sampling/sampling_logp_difference/max": 0.906486988067627,
"sampling/sampling_logp_difference/mean": 0.025111418217420578,
"step": 379,
"step_time": 35.90425824399972
},
{
"clip_ratio/high_max": 0.007378472248092294,
"clip_ratio/high_mean": 0.005425347131676972,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007161458255723119,
"entropy": 0.1500188522040844,
"epoch": 0.0076,
"grad_norm": 1.1629754304885864,
"kl": 0.3764154892414808,
"learning_rate": 9.999780861980606e-06,
"loss": 0.0307,
"step": 380,
"step_time": 6.591761033001603
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0012499999720603228,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0012499999720603228,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1956.0,
"completions/max_terminated_length": 1956.0,
"completions/mean_length": 1859.5,
"completions/mean_terminated_length": 1859.5,
"completions/min_length": 1667.0,
"completions/min_terminated_length": 1667.0,
"entropy": 0.16048664320260286,
"epoch": 0.00762,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.7579671144485474,
"kl": 0.33646042086184025,
"learning_rate": 9.999779586083026e-06,
"loss": -0.198,
"num_tokens": 14226076.0,
"reward": 5.697696685791016,
"reward_std": 4.209428787231445,
"rewards/rollout_reward_func/mean": 5.697696685791016,
"rewards/rollout_reward_func/std": 13.195571899414062,
"sampling/importance_sampling_ratio/max": 1.7836365699768066,
"sampling/importance_sampling_ratio/mean": 1.0036367177963257,
"sampling/importance_sampling_ratio/min": 1.983860382622171e-20,
"sampling/sampling_logp_difference/max": 24.610870361328125,
"sampling/sampling_logp_difference/mean": 0.09851048141717911,
"step": 381,
"step_time": 37.674988107999525
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0012499999720603228,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0047222222201526165,
"entropy": 0.16013844590634108,
"epoch": 0.00764,
"grad_norm": 1.1893305778503418,
"kl": 0.32176281698048115,
"learning_rate": 9.999778306481967e-06,
"loss": -0.2009,
"step": 382,
"step_time": 6.6418502909982635
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2022.0,
"completions/max_terminated_length": 2022.0,
"completions/mean_length": 1888.46875,
"completions/mean_terminated_length": 1888.46875,
"completions/min_length": 1740.0,
"completions/min_terminated_length": 1740.0,
"entropy": 0.1790903713554144,
"epoch": 0.00766,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2004278898239136,
"kl": 0.6035262942314148,
"learning_rate": 9.999777023177434e-06,
"loss": -0.0882,
"num_tokens": 14306910.0,
"reward": 5.2071757316589355,
"reward_std": 9.08076286315918,
"rewards/rollout_reward_func/mean": 5.2071757316589355,
"rewards/rollout_reward_func/std": 11.35473918914795,
"sampling/importance_sampling_ratio/max": 1.9644986391067505,
"sampling/importance_sampling_ratio/mean": 0.9806682467460632,
"sampling/importance_sampling_ratio/min": 0.4284982681274414,
"sampling/sampling_logp_difference/max": 0.6676270961761475,
"sampling/sampling_logp_difference/mean": 0.029503734782338142,
"step": 383,
"step_time": 38.31665977399825
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.0069444444961845875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012152777868323028,
"entropy": 0.17896115221083164,
"epoch": 0.00768,
"grad_norm": 1.2121306657791138,
"kl": 0.6167402379214764,
"learning_rate": 9.999775736169428e-06,
"loss": -0.0902,
"step": 384,
"step_time": 6.277814937000585
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2019.0,
"completions/max_terminated_length": 2019.0,
"completions/mean_length": 1908.3125,
"completions/mean_terminated_length": 1908.3125,
"completions/min_length": 1820.0,
"completions/min_terminated_length": 1820.0,
"entropy": 0.1345885144546628,
"epoch": 0.0077,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.196207880973816,
"kl": 0.572353832423687,
"learning_rate": 9.99977444545795e-06,
"loss": -0.1616,
"num_tokens": 14389050.0,
"reward": 7.360363006591797,
"reward_std": 7.1288251876831055,
"rewards/rollout_reward_func/mean": 7.360363006591797,
"rewards/rollout_reward_func/std": 11.084699630737305,
"sampling/importance_sampling_ratio/max": 1.8938790559768677,
"sampling/importance_sampling_ratio/mean": 1.0094572305679321,
"sampling/importance_sampling_ratio/min": 0.4132111072540283,
"sampling/sampling_logp_difference/max": 1.0937575101852417,
"sampling/sampling_logp_difference/mean": 0.02933787927031517,
"step": 385,
"step_time": 39.433605923999494
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0069444444961845875,
"entropy": 0.13119102269411087,
"epoch": 0.00772,
"grad_norm": 0.9113112688064575,
"kl": 0.5914033465087414,
"learning_rate": 9.999773151043e-06,
"loss": -0.1639,
"step": 386,
"step_time": 6.731554907000827
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1996.0,
"completions/max_terminated_length": 1996.0,
"completions/mean_length": 1888.1875,
"completions/mean_terminated_length": 1888.1875,
"completions/min_length": 1796.0,
"completions/min_terminated_length": 1796.0,
"entropy": 0.14891765639185905,
"epoch": 0.00774,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.6409744024276733,
"kl": 0.8021159842610359,
"learning_rate": 9.999771852924581e-06,
"loss": -0.0082,
"num_tokens": 14469970.0,
"reward": 6.270630359649658,
"reward_std": 8.007278442382812,
"rewards/rollout_reward_func/mean": 6.270630359649658,
"rewards/rollout_reward_func/std": 12.005971908569336,
"sampling/importance_sampling_ratio/max": 1.5516520738601685,
"sampling/importance_sampling_ratio/mean": 0.9231661558151245,
"sampling/importance_sampling_ratio/min": 0.45910143852233887,
"sampling/sampling_logp_difference/max": 0.6149110794067383,
"sampling/sampling_logp_difference/mean": 0.027469176799058914,
"step": 387,
"step_time": 36.89286300000276
},
{
"clip_ratio/high_max": 0.013888888992369175,
"clip_ratio/high_mean": 0.008680555620230734,
"clip_ratio/low_mean": 0.008680555620230734,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01736111124046147,
"entropy": 0.142063251696527,
"epoch": 0.00776,
"grad_norm": 1.1793173551559448,
"kl": 0.7793877236545086,
"learning_rate": 9.999770551102692e-06,
"loss": -0.0135,
"step": 388,
"step_time": 6.243275303997507
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0069444444961845875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1997.0,
"completions/max_terminated_length": 1997.0,
"completions/mean_length": 1927.0625,
"completions/mean_terminated_length": 1927.0625,
"completions/min_length": 1840.0,
"completions/min_terminated_length": 1840.0,
"entropy": 0.15283752977848053,
"epoch": 0.00778,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9657156467437744,
"kl": 0.3845696784555912,
"learning_rate": 9.999769245577337e-06,
"loss": -0.2581,
"num_tokens": 14552332.0,
"reward": 10.149282455444336,
"reward_std": 9.99736213684082,
"rewards/rollout_reward_func/mean": 10.149282455444336,
"rewards/rollout_reward_func/std": 15.273449897766113,
"sampling/importance_sampling_ratio/max": 2.829928398132324,
"sampling/importance_sampling_ratio/mean": 1.1655187606811523,
"sampling/importance_sampling_ratio/min": 0.29218825697898865,
"sampling/sampling_logp_difference/max": 1.0828132629394531,
"sampling/sampling_logp_difference/mean": 0.03259303420782089,
"step": 389,
"step_time": 35.86770247999448
},
{
"clip_ratio/high_max": 0.013888888992369175,
"clip_ratio/high_mean": 0.0069444444961845875,
"clip_ratio/low_mean": 0.005208333372138441,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012152777868323028,
"entropy": 0.15223741345107555,
"epoch": 0.0078,
"grad_norm": 1.65627121925354,
"kl": 0.39045586064457893,
"learning_rate": 9.999767936348516e-06,
"loss": -0.2629,
"step": 390,
"step_time": 6.266667372001393
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0069444444961845875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2014.0,
"completions/max_terminated_length": 2014.0,
"completions/mean_length": 1907.625,
"completions/mean_terminated_length": 1907.625,
"completions/min_length": 1783.0,
"completions/min_terminated_length": 1783.0,
"entropy": 0.14941192418336868,
"epoch": 0.00782,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.701319694519043,
"kl": 0.4128375668078661,
"learning_rate": 9.999766623416231e-06,
"loss": -0.0513,
"num_tokens": 14633831.0,
"reward": 12.501077651977539,
"reward_std": 6.659691333770752,
"rewards/rollout_reward_func/mean": 12.501077651977539,
"rewards/rollout_reward_func/std": 15.406060218811035,
"sampling/importance_sampling_ratio/max": 1.780453085899353,
"sampling/importance_sampling_ratio/mean": 1.020936131477356,
"sampling/importance_sampling_ratio/min": 0.2042788863182068,
"sampling/sampling_logp_difference/max": 0.9652459621429443,
"sampling/sampling_logp_difference/mean": 0.027463870123028755,
"step": 391,
"step_time": 37.91702412400082
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0069444444961845875,
"entropy": 0.1454289434477687,
"epoch": 0.00784,
"grad_norm": 1.6639035940170288,
"kl": 0.4206230044364929,
"learning_rate": 9.999765306780483e-06,
"loss": -0.0547,
"step": 392,
"step_time": 6.735865983999247
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2019.0,
"completions/max_terminated_length": 2019.0,
"completions/mean_length": 1883.46875,
"completions/mean_terminated_length": 1883.46875,
"completions/min_length": 1663.0,
"completions/min_terminated_length": 1663.0,
"entropy": 0.1706612892448902,
"epoch": 0.00786,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9211045503616333,
"kl": 0.5242457240819931,
"learning_rate": 9.999763986441271e-06,
"loss": -0.2397,
"num_tokens": 14714639.0,
"reward": 5.736878395080566,
"reward_std": 10.025382995605469,
"rewards/rollout_reward_func/mean": 5.736878395080566,
"rewards/rollout_reward_func/std": 15.901412963867188,
"sampling/importance_sampling_ratio/max": 2.1215860843658447,
"sampling/importance_sampling_ratio/mean": 0.9944747686386108,
"sampling/importance_sampling_ratio/min": 0.3210683763027191,
"sampling/sampling_logp_difference/max": 1.1642231941223145,
"sampling/sampling_logp_difference/mean": 0.03731720149517059,
"step": 393,
"step_time": 38.35620686900438
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0069444444961845875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666744276881,
"entropy": 0.16822869144380093,
"epoch": 0.00788,
"grad_norm": 1.7900792360305786,
"kl": 0.563917126506567,
"learning_rate": 9.999762662398599e-06,
"loss": -0.2418,
"step": 394,
"step_time": 6.265376314000605
},
{
"clip_ratio/high_max": 0.013888888992369175,
"clip_ratio/high_mean": 0.0069444444961845875,
"clip_ratio/low_mean": 0.0020833334419876337,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009027778054587543,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2005.0,
"completions/max_terminated_length": 2005.0,
"completions/mean_length": 1748.75,
"completions/mean_terminated_length": 1748.75,
"completions/min_length": 639.0,
"completions/min_terminated_length": 639.0,
"entropy": 0.15377911366522312,
"epoch": 0.0079,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.2754374742507935,
"kl": 0.6537161991000175,
"learning_rate": 9.999761334652469e-06,
"loss": 0.037,
"num_tokens": 14791302.0,
"reward": 3.2958414554595947,
"reward_std": 6.388683319091797,
"rewards/rollout_reward_func/mean": 3.2958414554595947,
"rewards/rollout_reward_func/std": 14.59126091003418,
"sampling/importance_sampling_ratio/max": 1.8531484603881836,
"sampling/importance_sampling_ratio/mean": 0.9730424880981445,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.1739468574523926,
"sampling/sampling_logp_difference/mean": 0.030640186741948128,
"step": 395,
"step_time": 34.660050579997915
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008680555620230734,
"entropy": 0.15109152905642986,
"epoch": 0.00792,
"grad_norm": 1.3109846115112305,
"kl": 0.6577336862683296,
"learning_rate": 9.999760003202882e-06,
"loss": 0.0344,
"step": 396,
"step_time": 6.238461292003194
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005208333372138441,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1997.0,
"completions/max_terminated_length": 1997.0,
"completions/mean_length": 1870.09375,
"completions/mean_terminated_length": 1870.09375,
"completions/min_length": 1662.0,
"completions/min_terminated_length": 1662.0,
"entropy": 0.15775778237730265,
"epoch": 0.00794,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4070671796798706,
"kl": 0.6230524256825447,
"learning_rate": 9.999758668049834e-06,
"loss": -0.0548,
"num_tokens": 14871639.0,
"reward": 10.42796802520752,
"reward_std": 8.001551628112793,
"rewards/rollout_reward_func/mean": 10.42796802520752,
"rewards/rollout_reward_func/std": 13.233992576599121,
"sampling/importance_sampling_ratio/max": 1.9538705348968506,
"sampling/importance_sampling_ratio/mean": 0.9155502319335938,
"sampling/importance_sampling_ratio/min": 9.152499071130027e-12,
"sampling/sampling_logp_difference/max": 24.498069763183594,
"sampling/sampling_logp_difference/mean": 0.10757236927747726,
"step": 397,
"step_time": 37.941033778999554
},
{
"clip_ratio/high_max": 0.006761695956811309,
"clip_ratio/high_mean": 0.0033808479784056544,
"clip_ratio/low_mean": 0.005208333372138441,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008589181350544095,
"entropy": 0.15627853386104107,
"epoch": 0.00796,
"grad_norm": 1.5508379936218262,
"kl": 0.6376640871167183,
"learning_rate": 9.999757329193334e-06,
"loss": -0.0563,
"step": 398,
"step_time": 6.230076591997204
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1998.0,
"completions/max_terminated_length": 1998.0,
"completions/mean_length": 1835.84375,
"completions/mean_terminated_length": 1835.84375,
"completions/min_length": 1691.0,
"completions/min_terminated_length": 1691.0,
"entropy": 0.13853330817073584,
"epoch": 0.00798,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.629981517791748,
"kl": 0.5573707222938538,
"learning_rate": 9.999755986633378e-06,
"loss": 0.0474,
"num_tokens": 14950879.0,
"reward": 2.0207910537719727,
"reward_std": 7.94326114654541,
"rewards/rollout_reward_func/mean": 2.0207910537719727,
"rewards/rollout_reward_func/std": 15.21265983581543,
"sampling/importance_sampling_ratio/max": 1.5181382894515991,
"sampling/importance_sampling_ratio/mean": 0.9647513628005981,
"sampling/importance_sampling_ratio/min": 0.04904457926750183,
"sampling/sampling_logp_difference/max": 1.2537705898284912,
"sampling/sampling_logp_difference/mean": 0.03419237583875656,
"step": 399,
"step_time": 37.17983689799985
},
{
"clip_ratio/high_max": 0.01736111124046147,
"clip_ratio/high_mean": 0.008680555620230734,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012152777868323028,
"entropy": 0.14058438315987587,
"epoch": 0.008,
"grad_norm": 0.6722509264945984,
"kl": 0.5397394970059395,
"learning_rate": 9.999754640369969e-06,
"loss": 0.0452,
"step": 400,
"step_time": 6.206834619999427
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1975.0,
"completions/max_terminated_length": 1975.0,
"completions/mean_length": 1883.46875,
"completions/mean_terminated_length": 1882.0,
"completions/min_length": 1799.0,
"completions/min_terminated_length": 1799.0,
"entropy": 0.13088950607925653,
"epoch": 0.00802,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.2955660820007324,
"kl": 0.44351304322481155,
"learning_rate": 9.99975329040311e-06,
"loss": -0.0213,
"num_tokens": 15031668.0,
"reward": 7.0081658363342285,
"reward_std": 4.4600934982299805,
"rewards/rollout_reward_func/mean": 7.0081658363342285,
"rewards/rollout_reward_func/std": 6.16019344329834,
"sampling/importance_sampling_ratio/max": 1.7051112651824951,
"sampling/importance_sampling_ratio/mean": 1.0153717994689941,
"sampling/importance_sampling_ratio/min": 0.27369314432144165,
"sampling/sampling_logp_difference/max": 1.2073643207550049,
"sampling/sampling_logp_difference/mean": 0.02993970364332199,
"step": 401,
"step_time": 37.11414306700135
},
{
"clip_ratio/high_max": 0.01736111124046147,
"clip_ratio/high_mean": 0.010416666744276881,
"clip_ratio/low_mean": 0.005208333372138441,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625000116415322,
"entropy": 0.13208830822259188,
"epoch": 0.00804,
"grad_norm": 1.393228530883789,
"kl": 0.4656967334449291,
"learning_rate": 9.9997519367328e-06,
"loss": -0.0256,
"step": 402,
"step_time": 6.654871525000999
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2016.0,
"completions/max_terminated_length": 2016.0,
"completions/mean_length": 1908.125,
"completions/mean_terminated_length": 1908.125,
"completions/min_length": 1816.0,
"completions/min_terminated_length": 1816.0,
"entropy": 0.10632855352014303,
"epoch": 0.00806,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.1967393159866333,
"kl": 0.6505869217216969,
"learning_rate": 9.999750579359042e-06,
"loss": 0.0394,
"num_tokens": 15113625.0,
"reward": 7.391304969787598,
"reward_std": 4.511048316955566,
"rewards/rollout_reward_func/mean": 7.391304969787598,
"rewards/rollout_reward_func/std": 12.610478401184082,
"sampling/importance_sampling_ratio/max": 1.6840932369232178,
"sampling/importance_sampling_ratio/mean": 1.005039930343628,
"sampling/importance_sampling_ratio/min": 0.2297038733959198,
"sampling/sampling_logp_difference/max": 1.0730221271514893,
"sampling/sampling_logp_difference/mean": 0.023349303752183914,
"step": 403,
"step_time": 37.35070921799888
},
{
"clip_ratio/high_max": 0.013888888992369175,
"clip_ratio/high_mean": 0.0069444444961845875,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0069444444961845875,
"entropy": 0.10929813794791698,
"epoch": 0.00808,
"grad_norm": 0.7991507649421692,
"kl": 0.564202968031168,
"learning_rate": 9.999749218281836e-06,
"loss": 0.0374,
"step": 404,
"step_time": 6.741287571996509
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1994.0,
"completions/max_terminated_length": 1994.0,
"completions/mean_length": 1666.90625,
"completions/mean_terminated_length": 1666.90625,
"completions/min_length": 657.0,
"completions/min_terminated_length": 657.0,
"entropy": 0.15686850529164076,
"epoch": 0.0081,
"frac_reward_zero_std": 0.125,
"grad_norm": 3.0047717094421387,
"kl": 0.44304889999330044,
"learning_rate": 9.999747853501184e-06,
"loss": 0.0822,
"num_tokens": 15187523.0,
"reward": 12.523664474487305,
"reward_std": 6.427609443664551,
"rewards/rollout_reward_func/mean": 12.523664474487305,
"rewards/rollout_reward_func/std": 15.645023345947266,
"sampling/importance_sampling_ratio/max": 2.3568639755249023,
"sampling/importance_sampling_ratio/mean": 1.1489346027374268,
"sampling/importance_sampling_ratio/min": 0.22255395352840424,
"sampling/sampling_logp_difference/max": 0.6474602222442627,
"sampling/sampling_logp_difference/mean": 0.030321069061756134,
"step": 405,
"step_time": 33.77085295799952
},
{
"clip_ratio/high_max": 0.0130876072216779,
"clip_ratio/high_mean": 0.008279914851300418,
"clip_ratio/low_mean": 0.0069444444961845875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015224359231069684,
"entropy": 0.16008016001433134,
"epoch": 0.00812,
"grad_norm": 1.3276525735855103,
"kl": 0.4322241246700287,
"learning_rate": 9.999746485017087e-06,
"loss": 0.08,
"step": 406,
"step_time": 6.167374660999485
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1979.0,
"completions/max_terminated_length": 1979.0,
"completions/mean_length": 1787.96875,
"completions/mean_terminated_length": 1787.96875,
"completions/min_length": 675.0,
"completions/min_terminated_length": 675.0,
"entropy": 0.1374366506934166,
"epoch": 0.00814,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.628904938697815,
"kl": 0.4086805544793606,
"learning_rate": 9.999745112829547e-06,
"loss": -0.128,
"num_tokens": 15265477.0,
"reward": 9.787149429321289,
"reward_std": 11.904574394226074,
"rewards/rollout_reward_func/mean": 9.787149429321289,
"rewards/rollout_reward_func/std": 14.022072792053223,
"sampling/importance_sampling_ratio/max": 2.199753999710083,
"sampling/importance_sampling_ratio/mean": 1.1246109008789062,
"sampling/importance_sampling_ratio/min": 0.6423606276512146,
"sampling/sampling_logp_difference/max": 0.681222677230835,
"sampling/sampling_logp_difference/mean": 0.022375933825969696,
"step": 407,
"step_time": 36.12690065800416
},
{
"clip_ratio/high_max": 0.013888888992369175,
"clip_ratio/high_mean": 0.0069444444961845875,
"clip_ratio/low_mean": 0.0069444444961845875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.013888888992369175,
"entropy": 0.13620022125542164,
"epoch": 0.00816,
"grad_norm": 1.2189370393753052,
"kl": 0.39171487279236317,
"learning_rate": 9.999743736938565e-06,
"loss": -0.1329,
"step": 408,
"step_time": 6.64129104499807
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1995.0,
"completions/max_terminated_length": 1995.0,
"completions/mean_length": 1771.5,
"completions/mean_terminated_length": 1771.5,
"completions/min_length": 696.0,
"completions/min_terminated_length": 696.0,
"entropy": 0.16864918265491724,
"epoch": 0.00818,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.9522514939308167,
"kl": 0.5428624674677849,
"learning_rate": 9.999742357344142e-06,
"loss": -0.0073,
"num_tokens": 15342877.0,
"reward": 9.139594078063965,
"reward_std": 3.8079919815063477,
"rewards/rollout_reward_func/mean": 9.139594078063965,
"rewards/rollout_reward_func/std": 14.156567573547363,
"sampling/importance_sampling_ratio/max": 2.7841055393218994,
"sampling/importance_sampling_ratio/mean": 0.9111255407333374,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.997159481048584,
"sampling/sampling_logp_difference/mean": 0.03262122720479965,
"step": 409,
"step_time": 36.596215775998644
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005208333372138441,
"entropy": 0.16705580614507198,
"epoch": 0.0082,
"grad_norm": 1.053119421005249,
"kl": 0.5281503275036812,
"learning_rate": 9.999740974046281e-06,
"loss": -0.0093,
"step": 410,
"step_time": 6.708477361000405
},
{
"clip_ratio/high_max": 0.011054942850023508,
"clip_ratio/high_mean": 0.005527471425011754,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005527471425011754,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1967.0,
"completions/max_terminated_length": 1967.0,
"completions/mean_length": 1785.96875,
"completions/mean_terminated_length": 1785.96875,
"completions/min_length": 704.0,
"completions/min_terminated_length": 704.0,
"entropy": 0.18522152118384838,
"epoch": 0.00822,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.4419904947280884,
"kl": 0.6027393564581871,
"learning_rate": 9.999739587044982e-06,
"loss": -0.0636,
"num_tokens": 15420534.0,
"reward": 2.8079020977020264,
"reward_std": 6.569200038909912,
"rewards/rollout_reward_func/mean": 2.8079020977020264,
"rewards/rollout_reward_func/std": 14.606973648071289,
"sampling/importance_sampling_ratio/max": 1.9993548393249512,
"sampling/importance_sampling_ratio/mean": 0.8649469614028931,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 27.178010940551758,
"sampling/sampling_logp_difference/mean": 0.08464864641427994,
"step": 411,
"step_time": 35.07087018799393
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.010542306350544095,
"clip_ratio/low_min": 0.003289473708719015,
"clip_ratio/region_mean": 0.014014528598636389,
"entropy": 0.18445036374032497,
"epoch": 0.00824,
"grad_norm": 1.382118582725525,
"kl": 0.6093035973608494,
"learning_rate": 9.999738196340246e-06,
"loss": -0.0667,
"step": 412,
"step_time": 6.159181504002845
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2001.0,
"completions/max_terminated_length": 2001.0,
"completions/mean_length": 1912.59375,
"completions/mean_terminated_length": 1912.59375,
"completions/min_length": 1538.0,
"completions/min_terminated_length": 1538.0,
"entropy": 0.15822596568614244,
"epoch": 0.00826,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.5164858102798462,
"kl": 0.3464791551232338,
"learning_rate": 9.999736801932072e-06,
"loss": -0.092,
"num_tokens": 15502267.0,
"reward": 10.420038223266602,
"reward_std": 5.580462455749512,
"rewards/rollout_reward_func/mean": 10.420038223266602,
"rewards/rollout_reward_func/std": 16.989425659179688,
"sampling/importance_sampling_ratio/max": 2.6757559776306152,
"sampling/importance_sampling_ratio/mean": 0.9447764158248901,
"sampling/importance_sampling_ratio/min": 1.2121374737272816e-10,
"sampling/sampling_logp_difference/max": 22.602230072021484,
"sampling/sampling_logp_difference/mean": 0.06792205572128296,
"step": 413,
"step_time": 37.483222671000476
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.005116959218867123,
"clip_ratio/low_min": 0.003289473708719015,
"clip_ratio/region_mean": 0.00685307034291327,
"entropy": 0.15620272234082222,
"epoch": 0.00828,
"grad_norm": 1.027227759361267,
"kl": 0.34561512246727943,
"learning_rate": 9.999735403820467e-06,
"loss": -0.0951,
"step": 414,
"step_time": 6.721208181003021
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.003689236007630825,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005425347131676972,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2002.0,
"completions/max_terminated_length": 2002.0,
"completions/mean_length": 1852.9375,
"completions/mean_terminated_length": 1852.9375,
"completions/min_length": 1307.0,
"completions/min_terminated_length": 1307.0,
"entropy": 0.19122237898409367,
"epoch": 0.0083,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.5304763317108154,
"kl": 0.4231012538075447,
"learning_rate": 9.999734002005428e-06,
"loss": -0.0365,
"num_tokens": 15581901.0,
"reward": 6.599876403808594,
"reward_std": 4.858182907104492,
"rewards/rollout_reward_func/mean": 6.599876403808594,
"rewards/rollout_reward_func/std": 10.637002944946289,
"sampling/importance_sampling_ratio/max": 1.754501223564148,
"sampling/importance_sampling_ratio/mean": 1.1050997972488403,
"sampling/importance_sampling_ratio/min": 0.25575336813926697,
"sampling/sampling_logp_difference/max": 0.7238872051239014,
"sampling/sampling_logp_difference/mean": 0.03033018298447132,
"step": 415,
"step_time": 36.794177785999636
},
{
"clip_ratio/high_max": 0.013134058099240065,
"clip_ratio/high_mean": 0.008303140290081501,
"clip_ratio/low_mean": 0.008680555620230734,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.016983695910312235,
"entropy": 0.18879046849906445,
"epoch": 0.00832,
"grad_norm": 1.5424190759658813,
"kl": 0.4105800986289978,
"learning_rate": 9.99973259648696e-06,
"loss": -0.0417,
"step": 416,
"step_time": 6.229540733997055
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005208333372138441,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2016.0,
"completions/max_terminated_length": 2016.0,
"completions/mean_length": 1897.0,
"completions/mean_terminated_length": 1897.0,
"completions/min_length": 1626.0,
"completions/min_terminated_length": 1626.0,
"entropy": 0.12648662831634283,
"epoch": 0.00834,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.2874664068222046,
"kl": 0.5799643807113171,
"learning_rate": 9.999731187265061e-06,
"loss": -0.0132,
"num_tokens": 15663076.0,
"reward": 9.055243492126465,
"reward_std": 5.7032575607299805,
"rewards/rollout_reward_func/mean": 9.055243492126465,
"rewards/rollout_reward_func/std": 21.282480239868164,
"sampling/importance_sampling_ratio/max": 1.8616880178451538,
"sampling/importance_sampling_ratio/mean": 1.0218424797058105,
"sampling/importance_sampling_ratio/min": 0.2051515281200409,
"sampling/sampling_logp_difference/max": 0.9839389324188232,
"sampling/sampling_logp_difference/mean": 0.025066856294870377,
"step": 417,
"step_time": 36.59973449099925
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.005310457549057901,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008782679797150195,
"entropy": 0.12579865287989378,
"epoch": 0.00836,
"grad_norm": 1.2702187299728394,
"kl": 0.5678221099078655,
"learning_rate": 9.999729774339734e-06,
"loss": -0.0158,
"step": 418,
"step_time": 6.2664286570015975
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008680555620230734,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1971.0,
"completions/max_terminated_length": 1971.0,
"completions/mean_length": 1882.1875,
"completions/mean_terminated_length": 1882.1875,
"completions/min_length": 1722.0,
"completions/min_terminated_length": 1722.0,
"entropy": 0.1713708434253931,
"epoch": 0.00838,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8853299617767334,
"kl": 0.40060317888855934,
"learning_rate": 9.999728357710979e-06,
"loss": -0.1656,
"num_tokens": 15743610.0,
"reward": -1.3480520248413086,
"reward_std": 6.245339393615723,
"rewards/rollout_reward_func/mean": -1.3480520248413086,
"rewards/rollout_reward_func/std": 14.265408515930176,
"sampling/importance_sampling_ratio/max": 2.072874069213867,
"sampling/importance_sampling_ratio/mean": 0.9604393243789673,
"sampling/importance_sampling_ratio/min": 0.2704717814922333,
"sampling/sampling_logp_difference/max": 1.3232176303863525,
"sampling/sampling_logp_difference/mean": 0.03345388546586037,
"step": 419,
"step_time": 38.095200251995266
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.008680555620230734,
"clip_ratio/low_min": 0.0034722222480922937,
"clip_ratio/region_mean": 0.010416666860692203,
"entropy": 0.16807264648377895,
"epoch": 0.0084,
"grad_norm": 1.7478289604187012,
"kl": 0.4048056434839964,
"learning_rate": 9.999726937378799e-06,
"loss": -0.1689,
"step": 420,
"step_time": 6.216570853004669
},
{
"clip_ratio/high_max": 0.004166666883975267,
"clip_ratio/high_mean": 0.0020833334419876337,
"clip_ratio/low_mean": 0.004315476398915052,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006398809840902686,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1977.0,
"completions/max_terminated_length": 1977.0,
"completions/mean_length": 1700.3125,
"completions/mean_terminated_length": 1700.3125,
"completions/min_length": 996.0,
"completions/min_terminated_length": 996.0,
"entropy": 0.13674810901284218,
"epoch": 0.00842,
"frac_reward_zero_std": 0.125,
"grad_norm": 2.3826522827148438,
"kl": 0.8743790201842785,
"learning_rate": 9.999725513343196e-06,
"loss": 0.1654,
"num_tokens": 15818763.0,
"reward": 5.071457862854004,
"reward_std": 5.412266254425049,
"rewards/rollout_reward_func/mean": 5.071457862854004,
"rewards/rollout_reward_func/std": 18.597326278686523,
"sampling/importance_sampling_ratio/max": 2.0363876819610596,
"sampling/importance_sampling_ratio/mean": 1.0418556928634644,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.9595718383789062,
"sampling/sampling_logp_difference/mean": 0.029438920319080353,
"step": 421,
"step_time": 32.92347357299877
},
{
"clip_ratio/high_max": 0.02738095331005752,
"clip_ratio/high_mean": 0.015426587779074907,
"clip_ratio/low_mean": 0.007787698763422668,
"clip_ratio/low_min": 0.0034722222480922937,
"clip_ratio/region_mean": 0.023214286658912897,
"entropy": 0.13682854734361172,
"epoch": 0.00844,
"grad_norm": 1.5287761688232422,
"kl": 0.8074955753982067,
"learning_rate": 9.99972408560417e-06,
"loss": 0.1613,
"step": 422,
"step_time": 6.16248918799829
},
{
"clip_ratio/high_max": 0.0036764706019312143,
"clip_ratio/high_mean": 0.0018382353009656072,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0018382353009656072,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1986.0,
"completions/max_terminated_length": 1986.0,
"completions/mean_length": 1831.90625,
"completions/mean_terminated_length": 1831.90625,
"completions/min_length": 1436.0,
"completions/min_terminated_length": 1436.0,
"entropy": 0.14473330415785313,
"epoch": 0.00846,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.183998703956604,
"kl": 0.39037713408470154,
"learning_rate": 9.999722654161723e-06,
"loss": -0.0322,
"num_tokens": 15897649.0,
"reward": 12.873044967651367,
"reward_std": 3.1303045749664307,
"rewards/rollout_reward_func/mean": 12.873044967651367,
"rewards/rollout_reward_func/std": 16.81613540649414,
"sampling/importance_sampling_ratio/max": 2.278865337371826,
"sampling/importance_sampling_ratio/mean": 0.936629593372345,
"sampling/importance_sampling_ratio/min": 1.0781457107297832e-12,
"sampling/sampling_logp_difference/max": 27.504920959472656,
"sampling/sampling_logp_difference/mean": 0.08040793240070343,
"step": 423,
"step_time": 34.50863980300164
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.003689236124046147,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003689236124046147,
"entropy": 0.14420728944242,
"epoch": 0.00848,
"grad_norm": 1.123458743095398,
"kl": 0.3863917402923107,
"learning_rate": 9.999721219015855e-06,
"loss": -0.0343,
"step": 424,
"step_time": 6.203808428001139
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006944444612599909,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1986.0,
"completions/max_terminated_length": 1986.0,
"completions/mean_length": 1907.59375,
"completions/mean_terminated_length": 1907.59375,
"completions/min_length": 1846.0,
"completions/min_terminated_length": 1846.0,
"entropy": 0.1717237215489149,
"epoch": 0.0085,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.2574822902679443,
"kl": 0.45831191912293434,
"learning_rate": 9.999719780166568e-06,
"loss": -0.0116,
"num_tokens": 15979198.0,
"reward": 12.526996612548828,
"reward_std": 7.190093994140625,
"rewards/rollout_reward_func/mean": 12.526996612548828,
"rewards/rollout_reward_func/std": 11.75661563873291,
"sampling/importance_sampling_ratio/max": 2.549557685852051,
"sampling/importance_sampling_ratio/mean": 1.156769037246704,
"sampling/importance_sampling_ratio/min": 0.5638684034347534,
"sampling/sampling_logp_difference/max": 0.8117094039916992,
"sampling/sampling_logp_difference/mean": 0.031035717576742172,
"step": 425,
"step_time": 37.6721021070025
},
{
"clip_ratio/high_max": 0.013888888992369175,
"clip_ratio/high_mean": 0.0069444444961845875,
"clip_ratio/low_mean": 0.005208333372138441,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012152777868323028,
"entropy": 0.17174869030714035,
"epoch": 0.00852,
"grad_norm": 1.9043995141983032,
"kl": 0.4626782052218914,
"learning_rate": 9.999718337613866e-06,
"loss": -0.0129,
"step": 426,
"step_time": 6.2495746619970305
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0069444444961845875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1981.0,
"completions/max_terminated_length": 1981.0,
"completions/mean_length": 1863.90625,
"completions/mean_terminated_length": 1863.90625,
"completions/min_length": 1682.0,
"completions/min_terminated_length": 1682.0,
"entropy": 0.15208253264427185,
"epoch": 0.00854,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7153781652450562,
"kl": 0.6745771653950214,
"learning_rate": 9.999716891357747e-06,
"loss": 0.0219,
"num_tokens": 16059187.0,
"reward": 5.894711017608643,
"reward_std": 5.455031394958496,
"rewards/rollout_reward_func/mean": 5.894711017608643,
"rewards/rollout_reward_func/std": 12.694339752197266,
"sampling/importance_sampling_ratio/max": 2.351686716079712,
"sampling/importance_sampling_ratio/mean": 1.042860746383667,
"sampling/importance_sampling_ratio/min": 5.802730258103184e-13,
"sampling/sampling_logp_difference/max": 27.137590408325195,
"sampling/sampling_logp_difference/mean": 0.0840056762099266,
"step": 427,
"step_time": 37.95969708299526
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008680555620230734,
"entropy": 0.1548395473510027,
"epoch": 0.00856,
"grad_norm": 1.2103983163833618,
"kl": 0.6520356982946396,
"learning_rate": 9.999715441398214e-06,
"loss": 0.0201,
"step": 428,
"step_time": 6.195645353001964
},
{
"clip_ratio/high_max": 0.006761695956811309,
"clip_ratio/high_mean": 0.0033808479784056544,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005116959102451801,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1967.0,
"completions/max_terminated_length": 1967.0,
"completions/mean_length": 1780.53125,
"completions/mean_terminated_length": 1780.53125,
"completions/min_length": 1025.0,
"completions/min_terminated_length": 1025.0,
"entropy": 0.13885898049920797,
"epoch": 0.00858,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.0375118255615234,
"kl": 0.5098150111734867,
"learning_rate": 9.99971398773527e-06,
"loss": -0.0296,
"num_tokens": 16136334.0,
"reward": 12.420064926147461,
"reward_std": 8.215323448181152,
"rewards/rollout_reward_func/mean": 12.420064926147461,
"rewards/rollout_reward_func/std": 17.566680908203125,
"sampling/importance_sampling_ratio/max": 2.0209267139434814,
"sampling/importance_sampling_ratio/mean": 1.0481691360473633,
"sampling/importance_sampling_ratio/min": 9.511323152688878e-12,
"sampling/sampling_logp_difference/max": 24.24110984802246,
"sampling/sampling_logp_difference/mean": 0.07432256639003754,
"step": 429,
"step_time": 34.64603473199713
},
{
"clip_ratio/high_max": 0.003289473708719015,
"clip_ratio/high_mean": 0.0016447368543595076,
"clip_ratio/low_mean": 0.01215277798473835,
"clip_ratio/low_min": 0.0034722222480922937,
"clip_ratio/region_mean": 0.013797514839097857,
"entropy": 0.1355869797989726,
"epoch": 0.0086,
"grad_norm": 1.5481940507888794,
"kl": 0.5054982472211123,
"learning_rate": 9.999712530368912e-06,
"loss": -0.0355,
"step": 430,
"step_time": 6.177134515002763
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005208333372138441,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 1976.0,
"completions/max_terminated_length": 1976.0,
"completions/mean_length": 1879.0,
"completions/mean_terminated_length": 1880.4193115234375,
"completions/min_length": 1756.0,
"completions/min_terminated_length": 1756.0,
"entropy": 0.15621676482260227,
"epoch": 0.00862,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5531624555587769,
"kl": 0.5274980738759041,
"learning_rate": 9.999711069299145e-06,
"loss": -0.0229,
"num_tokens": 16217117.0,
"reward": 10.379293441772461,
"reward_std": 6.365335464477539,
"rewards/rollout_reward_func/mean": 10.379293441772461,
"rewards/rollout_reward_func/std": 9.409771919250488,
"sampling/importance_sampling_ratio/max": 1.8403072357177734,
"sampling/importance_sampling_ratio/mean": 0.9792887568473816,
"sampling/importance_sampling_ratio/min": 0.1641586869955063,
"sampling/sampling_logp_difference/max": 1.2717455625534058,
"sampling/sampling_logp_difference/mean": 0.0406506210565567,
"step": 431,
"step_time": 36.80640686400329
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.006458333344198763,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009930555592291057,
"entropy": 0.1552266152575612,
"epoch": 0.00864,
"grad_norm": 1.1488696336746216,
"kl": 0.5495500713586807,
"learning_rate": 9.999709604525971e-06,
"loss": -0.027,
"step": 432,
"step_time": 6.695867144002477
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005208333372138441,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1980.0,
"completions/max_terminated_length": 1980.0,
"completions/mean_length": 1859.1875,
"completions/mean_terminated_length": 1859.1875,
"completions/min_length": 1568.0,
"completions/min_terminated_length": 1568.0,
"entropy": 0.1606634296476841,
"epoch": 0.00866,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.1394176483154297,
"kl": 0.5905432924628258,
"learning_rate": 9.999708136049389e-06,
"loss": -0.0213,
"num_tokens": 16297219.0,
"reward": 10.730379104614258,
"reward_std": 10.182910919189453,
"rewards/rollout_reward_func/mean": 10.730379104614258,
"rewards/rollout_reward_func/std": 14.116110801696777,
"sampling/importance_sampling_ratio/max": 1.9054006338119507,
"sampling/importance_sampling_ratio/mean": 1.0243314504623413,
"sampling/importance_sampling_ratio/min": 0.28989264369010925,
"sampling/sampling_logp_difference/max": 1.0515797138214111,
"sampling/sampling_logp_difference/mean": 0.03440077230334282,
"step": 433,
"step_time": 37.53456605399697
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.008782679797150195,
"clip_ratio/low_min": 0.0034722222480922937,
"clip_ratio/region_mean": 0.012254902045242488,
"entropy": 0.1584420707076788,
"epoch": 0.00868,
"grad_norm": 1.6995211839675903,
"kl": 0.6047380901873112,
"learning_rate": 9.9997066638694e-06,
"loss": -0.0252,
"step": 434,
"step_time": 6.179048164000051
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005425347248092294,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2027.0,
"completions/max_terminated_length": 2027.0,
"completions/mean_length": 1828.78125,
"completions/mean_terminated_length": 1828.78125,
"completions/min_length": 1343.0,
"completions/min_terminated_length": 1343.0,
"entropy": 0.12900556158274412,
"epoch": 0.0087,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.3848490715026855,
"kl": 0.5210922621190548,
"learning_rate": 9.99970518798601e-06,
"loss": -0.0456,
"num_tokens": 16375969.0,
"reward": 9.6665620803833,
"reward_std": 6.750264644622803,
"rewards/rollout_reward_func/mean": 9.6665620803833,
"rewards/rollout_reward_func/std": 19.660367965698242,
"sampling/importance_sampling_ratio/max": 2.6867897510528564,
"sampling/importance_sampling_ratio/mean": 1.0070596933364868,
"sampling/importance_sampling_ratio/min": 0.13064204156398773,
"sampling/sampling_logp_difference/max": 1.4016146659851074,
"sampling/sampling_logp_difference/mean": 0.03491336107254028,
"step": 435,
"step_time": 35.39866553399952
},
{
"clip_ratio/high_max": 0.018229166977107525,
"clip_ratio/high_mean": 0.009114583488553762,
"clip_ratio/low_mean": 0.007161458255723119,
"clip_ratio/low_min": 0.0034722222480922937,
"clip_ratio/region_mean": 0.01627604174427688,
"entropy": 0.12771923653781414,
"epoch": 0.00872,
"grad_norm": 1.2393711805343628,
"kl": 0.5349139347672462,
"learning_rate": 9.999703708399216e-06,
"loss": -0.0498,
"step": 436,
"step_time": 6.720962448001956
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0069444444961845875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666744276881,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2016.0,
"completions/max_terminated_length": 2016.0,
"completions/mean_length": 1850.8125,
"completions/mean_terminated_length": 1850.8125,
"completions/min_length": 995.0,
"completions/min_terminated_length": 995.0,
"entropy": 0.14382985513657331,
"epoch": 0.00874,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.190502405166626,
"kl": 0.5796581096947193,
"learning_rate": 9.99970222510902e-06,
"loss": -0.0328,
"num_tokens": 16456143.0,
"reward": 9.858323097229004,
"reward_std": 9.86341381072998,
"rewards/rollout_reward_func/mean": 9.858323097229004,
"rewards/rollout_reward_func/std": 15.427154541015625,
"sampling/importance_sampling_ratio/max": 1.8721684217453003,
"sampling/importance_sampling_ratio/mean": 1.100501537322998,
"sampling/importance_sampling_ratio/min": 0.25606006383895874,
"sampling/sampling_logp_difference/max": 0.986447811126709,
"sampling/sampling_logp_difference/mean": 0.031652357429265976,
"step": 437,
"step_time": 35.963873100005
},
{
"clip_ratio/high_max": 0.007638889132067561,
"clip_ratio/high_mean": 0.005555555806495249,
"clip_ratio/low_mean": 0.008680555620230734,
"clip_ratio/low_min": 0.0034722222480922937,
"clip_ratio/region_mean": 0.014236111426725984,
"entropy": 0.13910409063100815,
"epoch": 0.00876,
"grad_norm": 1.3543274402618408,
"kl": 0.6213464625179768,
"learning_rate": 9.999700738115424e-06,
"loss": -0.0356,
"step": 438,
"step_time": 6.740783157998521
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1965.0,
"completions/max_terminated_length": 1965.0,
"completions/mean_length": 1871.875,
"completions/mean_terminated_length": 1871.875,
"completions/min_length": 1788.0,
"completions/min_terminated_length": 1788.0,
"entropy": 0.14204278402030468,
"epoch": 0.00878,
"frac_reward_zero_std": 0.125,
"grad_norm": 2.5433712005615234,
"kl": 0.40115803107619286,
"learning_rate": 9.999699247418431e-06,
"loss": 0.0396,
"num_tokens": 16536454.0,
"reward": 3.6463284492492676,
"reward_std": 7.665958404541016,
"rewards/rollout_reward_func/mean": 3.6463284492492676,
"rewards/rollout_reward_func/std": 10.984175682067871,
"sampling/importance_sampling_ratio/max": 1.6518703699111938,
"sampling/importance_sampling_ratio/mean": 1.0737829208374023,
"sampling/importance_sampling_ratio/min": 0.4997839033603668,
"sampling/sampling_logp_difference/max": 0.924556314945221,
"sampling/sampling_logp_difference/mean": 0.02629711665213108,
"step": 439,
"step_time": 38.01396525000018
},
{
"clip_ratio/high_max": 0.013888888992369175,
"clip_ratio/high_mean": 0.008680555620230734,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012152777868323028,
"entropy": 0.14073345065116882,
"epoch": 0.0088,
"grad_norm": 1.6631256341934204,
"kl": 0.4030180908739567,
"learning_rate": 9.999697753018042e-06,
"loss": 0.0307,
"step": 440,
"step_time": 6.178545857001154
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1881.0,
"completions/max_terminated_length": 1881.0,
"completions/mean_length": 1798.34375,
"completions/mean_terminated_length": 1798.34375,
"completions/min_length": 1601.0,
"completions/min_terminated_length": 1601.0,
"entropy": 0.10215338412672281,
"epoch": 0.00882,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.50302255153656,
"kl": 0.37106287851929665,
"learning_rate": 9.999696254914256e-06,
"loss": -0.0679,
"num_tokens": 16614920.0,
"reward": 6.441880226135254,
"reward_std": 6.458446502685547,
"rewards/rollout_reward_func/mean": 6.441880226135254,
"rewards/rollout_reward_func/std": 15.004520416259766,
"sampling/importance_sampling_ratio/max": 2.006425380706787,
"sampling/importance_sampling_ratio/mean": 1.0931893587112427,
"sampling/importance_sampling_ratio/min": 0.43766847252845764,
"sampling/sampling_logp_difference/max": 0.5878493785858154,
"sampling/sampling_logp_difference/mean": 0.014898103661835194,
"step": 441,
"step_time": 36.059925193998424
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0069444444961845875,
"entropy": 0.10211750213056803,
"epoch": 0.00884,
"grad_norm": 1.5077581405639648,
"kl": 0.3891483061015606,
"learning_rate": 9.999694753107077e-06,
"loss": -0.0722,
"step": 442,
"step_time": 6.373549443997035
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1980.0,
"completions/max_terminated_length": 1980.0,
"completions/mean_length": 1874.3125,
"completions/mean_terminated_length": 1874.3125,
"completions/min_length": 1676.0,
"completions/min_terminated_length": 1676.0,
"entropy": 0.12804421968758106,
"epoch": 0.00886,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.4372204542160034,
"kl": 0.612975912168622,
"learning_rate": 9.999693247596505e-06,
"loss": -0.0372,
"num_tokens": 16695143.0,
"reward": 9.780478477478027,
"reward_std": 10.539976119995117,
"rewards/rollout_reward_func/mean": 9.780478477478027,
"rewards/rollout_reward_func/std": 18.792755126953125,
"sampling/importance_sampling_ratio/max": 1.8610286712646484,
"sampling/importance_sampling_ratio/mean": 0.9242000579833984,
"sampling/importance_sampling_ratio/min": 7.589735326295589e-18,
"sampling/sampling_logp_difference/max": 23.898208618164062,
"sampling/sampling_logp_difference/mean": 0.09211704879999161,
"step": 443,
"step_time": 40.27333003700005
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008680555620230734,
"entropy": 0.13046453520655632,
"epoch": 0.00888,
"grad_norm": 1.1817734241485596,
"kl": 0.6209859363734722,
"learning_rate": 9.999691738382544e-06,
"loss": -0.0414,
"step": 444,
"step_time": 6.1905242890006775
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005208333372138441,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2033.0,
"completions/max_terminated_length": 2033.0,
"completions/mean_length": 1881.84375,
"completions/mean_terminated_length": 1881.84375,
"completions/min_length": 1777.0,
"completions/min_terminated_length": 1777.0,
"entropy": 0.10738935228437185,
"epoch": 0.0089,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.587774634361267,
"kl": 0.6463839821517467,
"learning_rate": 9.999690225465193e-06,
"loss": -0.1007,
"num_tokens": 16776207.0,
"reward": 3.2240819931030273,
"reward_std": 3.1243185997009277,
"rewards/rollout_reward_func/mean": 3.2240819931030273,
"rewards/rollout_reward_func/std": 8.129782676696777,
"sampling/importance_sampling_ratio/max": 2.0063812732696533,
"sampling/importance_sampling_ratio/mean": 0.9566425681114197,
"sampling/importance_sampling_ratio/min": 0.057423632591962814,
"sampling/sampling_logp_difference/max": 1.6200556755065918,
"sampling/sampling_logp_difference/mean": 0.027168117463588715,
"step": 445,
"step_time": 39.72193665600025
},
{
"clip_ratio/high_max": 0.013888888992369175,
"clip_ratio/high_mean": 0.008680555620230734,
"clip_ratio/low_mean": 0.0069444444961845875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625000232830644,
"entropy": 0.10969764646142721,
"epoch": 0.00892,
"grad_norm": 1.0914807319641113,
"kl": 0.7086209692060947,
"learning_rate": 9.999688708844452e-06,
"loss": -0.1024,
"step": 446,
"step_time": 6.31956128499769
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.003289473708719015,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003289473708719015,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2002.0,
"completions/max_terminated_length": 2002.0,
"completions/mean_length": 1893.3125,
"completions/mean_terminated_length": 1893.3125,
"completions/min_length": 1636.0,
"completions/min_terminated_length": 1636.0,
"entropy": 0.13962982781231403,
"epoch": 0.00894,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.1110786199569702,
"kl": 0.31114745885133743,
"learning_rate": 9.999687188520328e-06,
"loss": -0.0955,
"num_tokens": 16857449.0,
"reward": 9.09254264831543,
"reward_std": 6.603111267089844,
"rewards/rollout_reward_func/mean": 9.09254264831543,
"rewards/rollout_reward_func/std": 12.929021835327148,
"sampling/importance_sampling_ratio/max": 1.9707322120666504,
"sampling/importance_sampling_ratio/mean": 0.9114285111427307,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 25.772886276245117,
"sampling/sampling_logp_difference/mean": 0.07303920388221741,
"step": 447,
"step_time": 38.44917293900107
},
{
"clip_ratio/high_max": 0.020833333488553762,
"clip_ratio/high_mean": 0.010416666744276881,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012152777868323028,
"entropy": 0.14250141568481922,
"epoch": 0.00896,
"grad_norm": 0.9470910429954529,
"kl": 0.30660218372941017,
"learning_rate": 9.999685664492816e-06,
"loss": -0.0989,
"step": 448,
"step_time": 6.258842459999869
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.003574346425011754,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005310457549057901,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1969.0,
"completions/max_terminated_length": 1969.0,
"completions/mean_length": 1866.1875,
"completions/mean_terminated_length": 1866.1875,
"completions/min_length": 1566.0,
"completions/min_terminated_length": 1566.0,
"entropy": 0.1336675025522709,
"epoch": 0.00898,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4246325492858887,
"kl": 0.35916537791490555,
"learning_rate": 9.999684136761924e-06,
"loss": 0.043,
"num_tokens": 16937767.0,
"reward": 2.6347501277923584,
"reward_std": 11.172698974609375,
"rewards/rollout_reward_func/mean": 2.6347501277923584,
"rewards/rollout_reward_func/std": 17.202756881713867,
"sampling/importance_sampling_ratio/max": 1.9445165395736694,
"sampling/importance_sampling_ratio/mean": 1.0331871509552002,
"sampling/importance_sampling_ratio/min": 0.4090438485145569,
"sampling/sampling_logp_difference/max": 0.6792287826538086,
"sampling/sampling_logp_difference/mean": 0.019970860332250595,
"step": 449,
"step_time": 37.926194604002376
},
{
"clip_ratio/high_max": 0.013888888992369175,
"clip_ratio/high_mean": 0.008680555620230734,
"clip_ratio/low_mean": 0.008986928150989115,
"clip_ratio/low_min": 0.0034722222480922937,
"clip_ratio/region_mean": 0.01766748377121985,
"entropy": 0.13738374412059784,
"epoch": 0.009,
"grad_norm": 1.1024707555770874,
"kl": 0.357426542788744,
"learning_rate": 9.999682605327648e-06,
"loss": 0.0382,
"step": 450,
"step_time": 6.199961440999687
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1979.0,
"completions/max_terminated_length": 1979.0,
"completions/mean_length": 1895.5,
"completions/mean_terminated_length": 1895.5,
"completions/min_length": 1636.0,
"completions/min_terminated_length": 1636.0,
"entropy": 0.15979362651705742,
"epoch": 0.00902,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.3939236402511597,
"kl": 0.31783993169665337,
"learning_rate": 9.99968107018999e-06,
"loss": -0.028,
"num_tokens": 17019020.0,
"reward": 19.81011199951172,
"reward_std": 7.101508140563965,
"rewards/rollout_reward_func/mean": 19.81011199951172,
"rewards/rollout_reward_func/std": 21.871509552001953,
"sampling/importance_sampling_ratio/max": 2.6384923458099365,
"sampling/importance_sampling_ratio/mean": 1.0206067562103271,
"sampling/importance_sampling_ratio/min": 2.8838329294011977e-11,
"sampling/sampling_logp_difference/max": 24.231542587280273,
"sampling/sampling_logp_difference/mean": 0.06482739746570587,
"step": 451,
"step_time": 37.558782669999346
},
{
"clip_ratio/high_max": 0.003289473708719015,
"clip_ratio/high_mean": 0.0016447368543595076,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0033808479784056544,
"entropy": 0.15879021026194096,
"epoch": 0.00904,
"grad_norm": 1.243542194366455,
"kl": 0.31884971633553505,
"learning_rate": 9.999679531348956e-06,
"loss": -0.032,
"step": 452,
"step_time": 6.199976066998715
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005208333372138441,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1983.0,
"completions/max_terminated_length": 1983.0,
"completions/mean_length": 1731.46875,
"completions/mean_terminated_length": 1731.46875,
"completions/min_length": 679.0,
"completions/min_terminated_length": 679.0,
"entropy": 0.12962097860872746,
"epoch": 0.00906,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.4479097127914429,
"kl": 0.382347758859396,
"learning_rate": 9.999677988804544e-06,
"loss": -0.056,
"num_tokens": 17094633.0,
"reward": 7.968657970428467,
"reward_std": 5.176619529724121,
"rewards/rollout_reward_func/mean": 7.968657970428467,
"rewards/rollout_reward_func/std": 13.325113296508789,
"sampling/importance_sampling_ratio/max": 1.7371830940246582,
"sampling/importance_sampling_ratio/mean": 0.9786045551300049,
"sampling/importance_sampling_ratio/min": 0.3828772306442261,
"sampling/sampling_logp_difference/max": 0.9057672023773193,
"sampling/sampling_logp_difference/mean": 0.021090377122163773,
"step": 453,
"step_time": 35.78325894400041
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.01215277798473835,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01736111135687679,
"entropy": 0.1288425624370575,
"epoch": 0.00908,
"grad_norm": 0.9137100577354431,
"kl": 0.39299850165843964,
"learning_rate": 9.999676442556757e-06,
"loss": -0.0591,
"step": 454,
"step_time": 6.22740626700579
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.0017361111240461469,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006944444612599909,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2023.0,
"completions/max_terminated_length": 2023.0,
"completions/mean_length": 1862.90625,
"completions/mean_terminated_length": 1862.90625,
"completions/min_length": 1430.0,
"completions/min_terminated_length": 1430.0,
"entropy": 0.10491521190851927,
"epoch": 0.0091,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.0899358987808228,
"kl": 0.327092919498682,
"learning_rate": 9.999674892605596e-06,
"loss": -0.1829,
"num_tokens": 17174752.0,
"reward": 11.411552429199219,
"reward_std": 6.536225318908691,
"rewards/rollout_reward_func/mean": 11.411552429199219,
"rewards/rollout_reward_func/std": 17.084299087524414,
"sampling/importance_sampling_ratio/max": 1.9590842723846436,
"sampling/importance_sampling_ratio/mean": 0.8986088037490845,
"sampling/importance_sampling_ratio/min": 0.19634754955768585,
"sampling/sampling_logp_difference/max": 1.189605712890625,
"sampling/sampling_logp_difference/mean": 0.02513366937637329,
"step": 455,
"step_time": 36.65933866699925
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.0069444444961845875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012152777868323028,
"entropy": 0.10256014112383127,
"epoch": 0.00912,
"grad_norm": 1.0379598140716553,
"kl": 0.3651005197316408,
"learning_rate": 9.99967333895106e-06,
"loss": -0.1862,
"step": 456,
"step_time": 6.2405210320011975
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0047222222201526165,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008194444584660232,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2046.0,
"completions/max_terminated_length": 2046.0,
"completions/mean_length": 1919.5,
"completions/mean_terminated_length": 1919.5,
"completions/min_length": 1692.0,
"completions/min_terminated_length": 1692.0,
"entropy": 0.1095434408634901,
"epoch": 0.00914,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.1724107265472412,
"kl": 0.329721899703145,
"learning_rate": 9.999671781593154e-06,
"loss": 0.0069,
"num_tokens": 17256864.0,
"reward": 11.119063377380371,
"reward_std": 8.48747730255127,
"rewards/rollout_reward_func/mean": 11.119063377380371,
"rewards/rollout_reward_func/std": 14.583742141723633,
"sampling/importance_sampling_ratio/max": 1.7749106884002686,
"sampling/importance_sampling_ratio/mean": 0.9545077085494995,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 22.036113739013672,
"sampling/sampling_logp_difference/mean": 0.08479472994804382,
"step": 457,
"step_time": 38.51757043299949
},
{
"clip_ratio/high_max": 0.009444444440305233,
"clip_ratio/high_mean": 0.0047222222201526165,
"clip_ratio/low_mean": 0.01340277784038335,
"clip_ratio/low_min": 0.0069444444961845875,
"clip_ratio/region_mean": 0.018125000409781933,
"entropy": 0.10692847240716219,
"epoch": 0.00916,
"grad_norm": 1.1692982912063599,
"kl": 0.3310940358787775,
"learning_rate": 9.999670220531878e-06,
"loss": 0.0021,
"step": 458,
"step_time": 6.407080308001241
},
{
"clip_ratio/high_max": 0.010928362840786576,
"clip_ratio/high_mean": 0.005464181420393288,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005464181420393288,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1974.0,
"completions/max_terminated_length": 1974.0,
"completions/mean_length": 1843.53125,
"completions/mean_terminated_length": 1843.53125,
"completions/min_length": 1070.0,
"completions/min_terminated_length": 1070.0,
"entropy": 0.10005591344088316,
"epoch": 0.00918,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.392657995223999,
"kl": 0.6404759250581264,
"learning_rate": 9.999668655767235e-06,
"loss": 0.0567,
"num_tokens": 17336639.0,
"reward": 10.774712562561035,
"reward_std": 5.9175920486450195,
"rewards/rollout_reward_func/mean": 10.774712562561035,
"rewards/rollout_reward_func/std": 13.042983055114746,
"sampling/importance_sampling_ratio/max": 1.5456091165542603,
"sampling/importance_sampling_ratio/mean": 0.9551990032196045,
"sampling/importance_sampling_ratio/min": 4.766808692560631e-13,
"sampling/sampling_logp_difference/max": 27.484663009643555,
"sampling/sampling_logp_difference/mean": 0.06486958265304565,
"step": 459,
"step_time": 37.665269682000144
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0033808479784056544,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005116959102451801,
"entropy": 0.09817711263895035,
"epoch": 0.0092,
"grad_norm": 1.214490532875061,
"kl": 0.5807801727205515,
"learning_rate": 9.999667087299225e-06,
"loss": 0.0526,
"step": 460,
"step_time": 6.204574634999517
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1991.0,
"completions/max_terminated_length": 1991.0,
"completions/mean_length": 1865.46875,
"completions/mean_terminated_length": 1865.46875,
"completions/min_length": 1549.0,
"completions/min_terminated_length": 1549.0,
"entropy": 0.12156101502478123,
"epoch": 0.00922,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.4056850671768188,
"kl": 0.5392452217638493,
"learning_rate": 9.999665515127852e-06,
"loss": -0.043,
"num_tokens": 17416764.0,
"reward": 11.62852668762207,
"reward_std": 6.9695281982421875,
"rewards/rollout_reward_func/mean": 11.62852668762207,
"rewards/rollout_reward_func/std": 13.752988815307617,
"sampling/importance_sampling_ratio/max": 2.6276750564575195,
"sampling/importance_sampling_ratio/mean": 0.9863981604576111,
"sampling/importance_sampling_ratio/min": 7.647822804733584e-13,
"sampling/sampling_logp_difference/max": 26.564056396484375,
"sampling/sampling_logp_difference/mean": 0.07334530353546143,
"step": 461,
"step_time": 39.81283560100019
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.010416666860692203,
"clip_ratio/low_min": 0.0034722222480922937,
"clip_ratio/region_mean": 0.015625000116415322,
"entropy": 0.12390031665563583,
"epoch": 0.00924,
"grad_norm": 1.2167932987213135,
"kl": 0.5474216639995575,
"learning_rate": 9.999663939253113e-06,
"loss": -0.0466,
"step": 462,
"step_time": 6.207449816998633
},
{
"clip_ratio/high_max": 0.0069444444961845875,
"clip_ratio/high_mean": 0.0034722222480922937,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1980.0,
"completions/max_terminated_length": 1980.0,
"completions/mean_length": 1851.40625,
"completions/mean_terminated_length": 1851.40625,
"completions/min_length": 1767.0,
"completions/min_terminated_length": 1767.0,
"entropy": 0.1078771585598588,
"epoch": 0.00926,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.4674164056777954,
"kl": 0.5507515668869019,
"learning_rate": 9.999662359675012e-06,
"loss": 0.0633,
"num_tokens": 17496530.0,
"reward": 14.070526123046875,
"reward_std": 9.017099380493164,
"rewards/rollout_reward_func/mean": 14.070526123046875,
"rewards/rollout_reward_func/std": 21.767820358276367,
"sampling/importance_sampling_ratio/max": 1.8225663900375366,
"sampling/importance_sampling_ratio/mean": 0.9397376775741577,
"sampling/importance_sampling_ratio/min": 0.2997850179672241,
"sampling/sampling_logp_difference/max": 0.9843077659606934,
"sampling/sampling_logp_difference/mean": 0.029502304270863533,
"step": 463,
"step_time": 35.89330491099827
},
{
"clip_ratio/high_max": 0.01736111124046147,
"clip_ratio/high_mean": 0.008680555620230734,
"clip_ratio/low_mean": 0.008680555620230734,
"clip_ratio/low_min": 0.0069444444961845875,
"clip_ratio/region_mean": 0.01736111124046147,
"entropy": 0.10700647812336683,
"epoch": 0.00928,
"grad_norm": 1.0905040502548218,
"kl": 0.5623909011483192,
"learning_rate": 9.999660776393551e-06,
"loss": 0.0596,
"step": 464,
"step_time": 6.629168002002189
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.005208333372138441,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005208333372138441,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1961.0,
"completions/max_terminated_length": 1961.0,
"completions/mean_length": 1759.375,
"completions/mean_terminated_length": 1759.375,
"completions/min_length": 1081.0,
"completions/min_terminated_length": 1081.0,
"entropy": 0.13902374915778637,
"epoch": 0.0093,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.728746771812439,
"kl": 0.6727790012955666,
"learning_rate": 9.999659189408732e-06,
"loss": -0.0095,
"num_tokens": 17573003.0,
"reward": 2.6619279384613037,
"reward_std": 5.373678207397461,
"rewards/rollout_reward_func/mean": 2.6619279384613037,
"rewards/rollout_reward_func/std": 12.139829635620117,
"sampling/importance_sampling_ratio/max": 2.036257028579712,
"sampling/importance_sampling_ratio/mean": 1.0116875171661377,
"sampling/importance_sampling_ratio/min": 0.31144610047340393,
"sampling/sampling_logp_difference/max": 1.2303881645202637,
"sampling/sampling_logp_difference/mean": 0.02412721887230873,
"step": 465,
"step_time": 37.21715446900271
},
{
"clip_ratio/high_max": 0.010850694496184587,
"clip_ratio/high_mean": 0.005425347248092294,
"clip_ratio/low_mean": 0.007291666814126074,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012717014062218368,
"entropy": 0.13923931494355202,
"epoch": 0.00932,
"grad_norm": 1.3814212083816528,
"kl": 0.5176205858588219,
"learning_rate": 9.999657598720554e-06,
"loss": -0.0149,
"step": 466,
"step_time": 6.155722220999451
},
{
"clip_ratio/high_max": 0.010850694496184587,
"clip_ratio/high_mean": 0.005425347248092294,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005425347248092294,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2036.0,
"completions/max_terminated_length": 2036.0,
"completions/mean_length": 1842.5,
"completions/mean_terminated_length": 1842.5,
"completions/min_length": 1138.0,
"completions/min_terminated_length": 1138.0,
"entropy": 0.10431700479239225,
"epoch": 0.00934,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.7164621353149414,
"kl": 0.38612409122288227,
"learning_rate": 9.999656004329023e-06,
"loss": -0.1128,
"num_tokens": 17652584.0,
"reward": 4.594000816345215,
"reward_std": 4.123822212219238,
"rewards/rollout_reward_func/mean": 4.594000816345215,
"rewards/rollout_reward_func/std": 17.150468826293945,
"sampling/importance_sampling_ratio/max": 1.9271286725997925,
"sampling/importance_sampling_ratio/mean": 1.0217385292053223,
"sampling/importance_sampling_ratio/min": 0.3627711236476898,
"sampling/sampling_logp_difference/max": 1.0998306274414062,
"sampling/sampling_logp_difference/mean": 0.016824834048748016,
"step": 467,
"step_time": 36.32474399700004
},
{
"clip_ratio/high_max": 0.010416666744276881,
"clip_ratio/high_mean": 0.005208333372138441,
"clip_ratio/low_mean": 0.0069444444961845875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01215277798473835,
"entropy": 0.10505765955895185,
"epoch": 0.00936,
"grad_norm": 1.5758742094039917,
"kl": 0.41707968339324,
"learning_rate": 9.999654406234138e-06,
"loss": -0.1151,
"step": 468,
"step_time": 6.322126661998482
}
],
"logging_steps": 1.0,
"max_steps": 100000,
"num_input_tokens_seen": 17652584,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}