FT-GRPO / checkpoint-50 /trainer_state.json
LLucass's picture
Training in progress, step 50, checkpoint
195f196 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.05714285714285714,
"eval_steps": 500,
"global_step": 50,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.671875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1734.0,
"completions/mean_length": 1702.03125,
"completions/mean_terminated_length": 993.6190795898438,
"completions/min_length": 483.0,
"completions/min_terminated_length": 483.0,
"epoch": 0.001142857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2837817668914795,
"learning_rate": 0.0,
"loss": -0.0,
"num_tokens": 118418.0,
"reward": -0.09800112247467041,
"reward_std": 0.3028089702129364,
"rewards/cosine_scaled_reward/mean": -0.09800112992525101,
"rewards/cosine_scaled_reward/std": 0.37953105568885803,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1894.0,
"completions/mean_length": 1738.90625,
"completions/mean_terminated_length": 949.0,
"completions/min_length": 435.0,
"completions/min_terminated_length": 435.0,
"epoch": 0.002285714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2421981245279312,
"learning_rate": 2e-08,
"loss": -0.0,
"num_tokens": 239748.0,
"reward": 0.020556632429361343,
"reward_std": 0.3545936942100525,
"rewards/cosine_scaled_reward/mean": 0.020556632429361343,
"rewards/cosine_scaled_reward/std": 0.4492928683757782,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.921875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 953.0,
"completions/mean_length": 1952.234375,
"completions/mean_terminated_length": 822.2000122070312,
"completions/min_length": 703.0,
"completions/min_terminated_length": 703.0,
"epoch": 0.0034285714285714284,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24851329624652863,
"learning_rate": 4e-08,
"loss": -0.0,
"num_tokens": 375163.0,
"reward": -0.22721199691295624,
"reward_std": 0.14563649892807007,
"rewards/cosine_scaled_reward/mean": -0.22721199691295624,
"rewards/cosine_scaled_reward/std": 0.1709199845790863,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.546875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1685.0,
"completions/mean_length": 1554.109375,
"completions/mean_terminated_length": 958.0344848632812,
"completions/min_length": 504.0,
"completions/min_terminated_length": 504.0,
"epoch": 0.004571428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.29272863268852234,
"learning_rate": 6e-08,
"loss": -0.0,
"num_tokens": 484434.0,
"reward": -0.17542189359664917,
"reward_std": 0.18219107389450073,
"rewards/cosine_scaled_reward/mean": -0.17542189359664917,
"rewards/cosine_scaled_reward/std": 0.27975013852119446,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.890625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1930.0,
"completions/mean_length": 1943.0625,
"completions/mean_terminated_length": 1088.571533203125,
"completions/min_length": 344.0,
"completions/min_terminated_length": 344.0,
"epoch": 0.005714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2773251533508301,
"learning_rate": 8e-08,
"loss": 0.0,
"num_tokens": 619606.0,
"reward": -0.2648562788963318,
"reward_std": 0.21638144552707672,
"rewards/cosine_scaled_reward/mean": -0.2648562788963318,
"rewards/cosine_scaled_reward/std": 0.23959198594093323,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.828125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1824.0,
"completions/mean_length": 1854.21875,
"completions/mean_terminated_length": 920.5454711914062,
"completions/min_length": 548.0,
"completions/min_terminated_length": 548.0,
"epoch": 0.006857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.27399909496307373,
"learning_rate": 1e-07,
"loss": -0.0,
"num_tokens": 749924.0,
"reward": -0.19292885065078735,
"reward_std": 0.2666770815849304,
"rewards/cosine_scaled_reward/mean": -0.19292885065078735,
"rewards/cosine_scaled_reward/std": 0.295730322599411,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.890625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1589.0,
"completions/mean_length": 1940.5625,
"completions/mean_terminated_length": 1065.71435546875,
"completions/min_length": 773.0,
"completions/min_terminated_length": 773.0,
"epoch": 0.008,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23362359404563904,
"learning_rate": 1.2e-07,
"loss": 0.0,
"num_tokens": 884528.0,
"reward": -0.18198424577713013,
"reward_std": 0.18540163338184357,
"rewards/cosine_scaled_reward/mean": -0.18198424577713013,
"rewards/cosine_scaled_reward/std": 0.32407456636428833,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.671875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2048.0,
"completions/mean_length": 1708.5625,
"completions/mean_terminated_length": 1013.5238037109375,
"completions/min_length": 317.0,
"completions/min_terminated_length": 317.0,
"epoch": 0.009142857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24677562713623047,
"learning_rate": 1.4e-07,
"loss": -0.0,
"num_tokens": 1004292.0,
"reward": -0.09573853015899658,
"reward_std": 0.22485454380512238,
"rewards/cosine_scaled_reward/mean": -0.09573852270841599,
"rewards/cosine_scaled_reward/std": 0.449250191450119,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.9375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1221.0,
"completions/mean_length": 1979.359375,
"completions/mean_terminated_length": 949.75,
"completions/min_length": 569.0,
"completions/min_terminated_length": 569.0,
"epoch": 0.010285714285714285,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.26966309547424316,
"learning_rate": 1.6e-07,
"loss": 0.0,
"num_tokens": 1142427.0,
"reward": -0.19992578029632568,
"reward_std": 0.20190927386283875,
"rewards/cosine_scaled_reward/mean": -0.19992581009864807,
"rewards/cosine_scaled_reward/std": 0.23785534501075745,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1918.0,
"completions/mean_length": 1652.59375,
"completions/mean_terminated_length": 897.727294921875,
"completions/min_length": 286.0,
"completions/min_terminated_length": 286.0,
"epoch": 0.011428571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3011312484741211,
"learning_rate": 1.8e-07,
"loss": 0.0,
"num_tokens": 1259025.0,
"reward": -0.11706389486789703,
"reward_std": 0.2934548258781433,
"rewards/cosine_scaled_reward/mean": -0.11706390231847763,
"rewards/cosine_scaled_reward/std": 0.3601698577404022,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.90625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1333.0,
"completions/mean_length": 1946.6875,
"completions/mean_terminated_length": 967.3333740234375,
"completions/min_length": 599.0,
"completions/min_terminated_length": 599.0,
"epoch": 0.012571428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2451399564743042,
"learning_rate": 2e-07,
"loss": -0.0,
"num_tokens": 1395285.0,
"reward": -0.2866281270980835,
"reward_std": 0.12184012681245804,
"rewards/cosine_scaled_reward/mean": -0.2866281270980835,
"rewards/cosine_scaled_reward/std": 0.15141677856445312,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.546875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2032.0,
"completions/mean_length": 1659.28125,
"completions/mean_terminated_length": 1190.137939453125,
"completions/min_length": 535.0,
"completions/min_terminated_length": 535.0,
"epoch": 0.013714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2733561396598816,
"learning_rate": 2.1999999999999998e-07,
"loss": 0.0,
"num_tokens": 1512423.0,
"reward": -0.13816070556640625,
"reward_std": 0.2968980073928833,
"rewards/cosine_scaled_reward/mean": -0.13816070556640625,
"rewards/cosine_scaled_reward/std": 0.3597467839717865,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.765625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1770.0,
"completions/mean_length": 1807.796875,
"completions/mean_terminated_length": 1023.1333618164062,
"completions/min_length": 697.0,
"completions/min_terminated_length": 697.0,
"epoch": 0.014857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.25238803029060364,
"learning_rate": 2.4e-07,
"loss": 0.0,
"num_tokens": 1639162.0,
"reward": -0.13488636910915375,
"reward_std": 0.2661236524581909,
"rewards/cosine_scaled_reward/mean": -0.13488635420799255,
"rewards/cosine_scaled_reward/std": 0.3444243371486664,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1866.0,
"completions/mean_length": 1846.921875,
"completions/mean_terminated_length": 1243.6875,
"completions/min_length": 698.0,
"completions/min_terminated_length": 698.0,
"epoch": 0.016,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2201598882675171,
"learning_rate": 2.6e-07,
"loss": -0.0,
"num_tokens": 1767973.0,
"reward": -0.20591925084590912,
"reward_std": 0.21505361795425415,
"rewards/cosine_scaled_reward/mean": -0.20591923594474792,
"rewards/cosine_scaled_reward/std": 0.323749840259552,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1713.0,
"completions/mean_length": 1710.421875,
"completions/mean_terminated_length": 847.7222290039062,
"completions/min_length": 450.0,
"completions/min_terminated_length": 450.0,
"epoch": 0.017142857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2665213644504547,
"learning_rate": 2.8e-07,
"loss": 0.0,
"num_tokens": 1888360.0,
"reward": -0.0778750479221344,
"reward_std": 0.17502948641777039,
"rewards/cosine_scaled_reward/mean": -0.0778750628232956,
"rewards/cosine_scaled_reward/std": 0.47343766689300537,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.984375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 962.0,
"completions/mean_length": 2031.03125,
"completions/mean_terminated_length": 962.0,
"completions/min_length": 962.0,
"completions/min_terminated_length": 962.0,
"epoch": 0.018285714285714287,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23009927570819855,
"learning_rate": 3e-07,
"loss": -0.0,
"num_tokens": 2028786.0,
"reward": -0.2619968056678772,
"reward_std": 0.16954168677330017,
"rewards/cosine_scaled_reward/mean": -0.2619968056678772,
"rewards/cosine_scaled_reward/std": 0.18357795476913452,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.59375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1918.0,
"completions/mean_length": 1533.15625,
"completions/mean_terminated_length": 780.6923217773438,
"completions/min_length": 380.0,
"completions/min_terminated_length": 380.0,
"epoch": 0.019428571428571427,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3392995297908783,
"learning_rate": 3.2e-07,
"loss": -0.0,
"num_tokens": 2137428.0,
"reward": -0.11706461012363434,
"reward_std": 0.3096129894256592,
"rewards/cosine_scaled_reward/mean": -0.11706460267305374,
"rewards/cosine_scaled_reward/std": 0.3810974657535553,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1626.0,
"completions/mean_length": 1774.46875,
"completions/mean_terminated_length": 1018.2352905273438,
"completions/min_length": 516.0,
"completions/min_terminated_length": 516.0,
"epoch": 0.02057142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23254038393497467,
"learning_rate": 3.4000000000000003e-07,
"loss": 0.0,
"num_tokens": 2261370.0,
"reward": -0.18709540367126465,
"reward_std": 0.2795025110244751,
"rewards/cosine_scaled_reward/mean": -0.18709540367126465,
"rewards/cosine_scaled_reward/std": 0.3359416127204895,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1859.0,
"completions/mean_length": 1719.0,
"completions/mean_terminated_length": 995.2000122070312,
"completions/min_length": 577.0,
"completions/min_terminated_length": 577.0,
"epoch": 0.021714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.262045681476593,
"learning_rate": 3.6e-07,
"loss": -0.0,
"num_tokens": 2382642.0,
"reward": -0.02329203486442566,
"reward_std": 0.34684932231903076,
"rewards/cosine_scaled_reward/mean": -0.02329203486442566,
"rewards/cosine_scaled_reward/std": 0.47637447714805603,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1988.0,
"completions/mean_length": 1630.90625,
"completions/mean_terminated_length": 935.75,
"completions/min_length": 425.0,
"completions/min_terminated_length": 425.0,
"epoch": 0.022857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.250532329082489,
"learning_rate": 3.7999999999999996e-07,
"loss": 0.0,
"num_tokens": 2498372.0,
"reward": -0.06319350004196167,
"reward_std": 0.2394939512014389,
"rewards/cosine_scaled_reward/mean": -0.06319350004196167,
"rewards/cosine_scaled_reward/std": 0.3889789879322052,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1818.0,
"completions/mean_length": 1735.96875,
"completions/mean_terminated_length": 1140.272705078125,
"completions/min_length": 428.0,
"completions/min_terminated_length": 428.0,
"epoch": 0.024,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2773231565952301,
"learning_rate": 4e-07,
"loss": 0.0,
"num_tokens": 2620282.0,
"reward": -0.20884393155574799,
"reward_std": 0.20233216881752014,
"rewards/cosine_scaled_reward/mean": -0.20884393155574799,
"rewards/cosine_scaled_reward/std": 0.28432920575141907,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1790.0,
"completions/mean_length": 1342.953125,
"completions/mean_terminated_length": 919.9249877929688,
"completions/min_length": 286.0,
"completions/min_terminated_length": 286.0,
"epoch": 0.025142857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.34627005457878113,
"learning_rate": 4.1999999999999995e-07,
"loss": 0.0,
"num_tokens": 2715247.0,
"reward": -0.09092864394187927,
"reward_std": 0.21042926609516144,
"rewards/cosine_scaled_reward/mean": -0.09092865139245987,
"rewards/cosine_scaled_reward/std": 0.43559205532073975,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.578125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2038.0,
"completions/mean_length": 1661.9375,
"completions/mean_terminated_length": 1132.888916015625,
"completions/min_length": 455.0,
"completions/min_terminated_length": 455.0,
"epoch": 0.026285714285714287,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2705242335796356,
"learning_rate": 4.3999999999999997e-07,
"loss": 0.0,
"num_tokens": 2832403.0,
"reward": -0.13339249789714813,
"reward_std": 0.2433384656906128,
"rewards/cosine_scaled_reward/mean": -0.13339248299598694,
"rewards/cosine_scaled_reward/std": 0.3815627098083496,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2020.0,
"completions/mean_length": 1802.296875,
"completions/mean_terminated_length": 1065.1875,
"completions/min_length": 572.0,
"completions/min_terminated_length": 572.0,
"epoch": 0.027428571428571427,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24961258471012115,
"learning_rate": 4.6e-07,
"loss": 0.0,
"num_tokens": 2958678.0,
"reward": -0.18733163177967072,
"reward_std": 0.2773033380508423,
"rewards/cosine_scaled_reward/mean": -0.1873316466808319,
"rewards/cosine_scaled_reward/std": 0.37051624059677124,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1848.0,
"completions/mean_length": 1731.53125,
"completions/mean_terminated_length": 982.0,
"completions/min_length": 406.0,
"completions/min_terminated_length": 406.0,
"epoch": 0.02857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2662124037742615,
"learning_rate": 4.8e-07,
"loss": 0.0,
"num_tokens": 3079792.0,
"reward": -0.12407588213682175,
"reward_std": 0.25581949949264526,
"rewards/cosine_scaled_reward/mean": -0.12407589703798294,
"rewards/cosine_scaled_reward/std": 0.39043793082237244,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.828125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2017.0,
"completions/mean_length": 1965.46875,
"completions/mean_terminated_length": 1567.8182373046875,
"completions/min_length": 1006.0,
"completions/min_terminated_length": 1006.0,
"epoch": 0.029714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23202598094940186,
"learning_rate": 5e-07,
"loss": 0.0,
"num_tokens": 3216214.0,
"reward": -0.0963105633854866,
"reward_std": 0.30887559056282043,
"rewards/cosine_scaled_reward/mean": -0.0963105633854866,
"rewards/cosine_scaled_reward/std": 0.39396020770072937,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.828125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2023.0,
"completions/mean_length": 1886.96875,
"completions/mean_terminated_length": 1111.0909423828125,
"completions/min_length": 498.0,
"completions/min_terminated_length": 498.0,
"epoch": 0.030857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2878379225730896,
"learning_rate": 5.2e-07,
"loss": -0.0,
"num_tokens": 3347268.0,
"reward": -0.1645491123199463,
"reward_std": 0.28629785776138306,
"rewards/cosine_scaled_reward/mean": -0.1645491123199463,
"rewards/cosine_scaled_reward/std": 0.35050687193870544,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1995.0,
"completions/mean_length": 1843.640625,
"completions/mean_terminated_length": 1230.5625,
"completions/min_length": 444.0,
"completions/min_terminated_length": 444.0,
"epoch": 0.032,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24996496737003326,
"learning_rate": 5.4e-07,
"loss": 0.0,
"num_tokens": 3475597.0,
"reward": -0.06605555862188339,
"reward_std": 0.2643629312515259,
"rewards/cosine_scaled_reward/mean": -0.06605555862188339,
"rewards/cosine_scaled_reward/std": 0.438128799200058,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.9375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2005.0,
"completions/mean_length": 2020.5,
"completions/mean_terminated_length": 1608.0,
"completions/min_length": 516.0,
"completions/min_terminated_length": 516.0,
"epoch": 0.03314285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23316837847232819,
"learning_rate": 5.6e-07,
"loss": -0.0,
"num_tokens": 3615381.0,
"reward": -0.2015206664800644,
"reward_std": 0.15312039852142334,
"rewards/cosine_scaled_reward/mean": -0.2015206664800644,
"rewards/cosine_scaled_reward/std": 0.1648881882429123,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.796875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1839.0,
"completions/mean_length": 1826.046875,
"completions/mean_terminated_length": 955.3077392578125,
"completions/min_length": 364.0,
"completions/min_terminated_length": 364.0,
"epoch": 0.03428571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2410832792520523,
"learning_rate": 5.8e-07,
"loss": -0.0,
"num_tokens": 3742784.0,
"reward": -0.17509159445762634,
"reward_std": 0.18994277715682983,
"rewards/cosine_scaled_reward/mean": -0.17509159445762634,
"rewards/cosine_scaled_reward/std": 0.22516494989395142,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.765625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1678.0,
"completions/mean_length": 1781.4375,
"completions/mean_terminated_length": 910.6666870117188,
"completions/min_length": 313.0,
"completions/min_terminated_length": 313.0,
"epoch": 0.03542857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2693414092063904,
"learning_rate": 6e-07,
"loss": 0.0,
"num_tokens": 3867292.0,
"reward": -0.24513831734657288,
"reward_std": 0.28315529227256775,
"rewards/cosine_scaled_reward/mean": -0.24513831734657288,
"rewards/cosine_scaled_reward/std": 0.3480584919452667,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.859375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1975.0,
"completions/mean_length": 1969.28125,
"completions/mean_terminated_length": 1488.2222900390625,
"completions/min_length": 1088.0,
"completions/min_terminated_length": 1088.0,
"epoch": 0.036571428571428574,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24202018976211548,
"learning_rate": 6.2e-07,
"loss": 0.0,
"num_tokens": 4003678.0,
"reward": -0.18968716263771057,
"reward_std": 0.28299200534820557,
"rewards/cosine_scaled_reward/mean": -0.18968716263771057,
"rewards/cosine_scaled_reward/std": 0.3119950294494629,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.037714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22288212180137634,
"learning_rate": 6.4e-07,
"loss": 0.0,
"num_tokens": 4145966.0,
"reward": -0.2955162525177002,
"reward_std": 0.17793573439121246,
"rewards/cosine_scaled_reward/mean": -0.2955162525177002,
"rewards/cosine_scaled_reward/std": 0.22786569595336914,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.546875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1809.0,
"completions/mean_length": 1589.640625,
"completions/mean_terminated_length": 1036.4482421875,
"completions/min_length": 515.0,
"completions/min_terminated_length": 515.0,
"epoch": 0.038857142857142854,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.31030499935150146,
"learning_rate": 6.6e-07,
"loss": 0.0,
"num_tokens": 4257255.0,
"reward": 0.008002171292901039,
"reward_std": 0.3413254916667938,
"rewards/cosine_scaled_reward/mean": 0.008002176880836487,
"rewards/cosine_scaled_reward/std": 0.4431404769420624,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.796875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1987.0,
"completions/mean_length": 1785.921875,
"completions/mean_terminated_length": 757.769287109375,
"completions/min_length": 385.0,
"completions/min_terminated_length": 385.0,
"epoch": 0.04,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3145958483219147,
"learning_rate": 6.800000000000001e-07,
"loss": -0.0,
"num_tokens": 4383050.0,
"reward": -0.16386553645133972,
"reward_std": 0.2818174958229065,
"rewards/cosine_scaled_reward/mean": -0.16386555135250092,
"rewards/cosine_scaled_reward/std": 0.3242056965827942,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1195.0,
"completions/mean_length": 2000.421875,
"completions/mean_terminated_length": 1033.0,
"completions/min_length": 863.0,
"completions/min_terminated_length": 863.0,
"epoch": 0.04114285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.25796815752983093,
"learning_rate": 7e-07,
"loss": 0.0,
"num_tokens": 4522189.0,
"reward": -0.2470606118440628,
"reward_std": 0.15509279072284698,
"rewards/cosine_scaled_reward/mean": -0.2470606118440628,
"rewards/cosine_scaled_reward/std": 0.16412879526615143,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.890625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2043.0,
"completions/mean_length": 1964.46875,
"completions/mean_terminated_length": 1284.2857666015625,
"completions/min_length": 931.0,
"completions/min_terminated_length": 931.0,
"epoch": 0.04228571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22452199459075928,
"learning_rate": 7.2e-07,
"loss": 0.0,
"num_tokens": 4658939.0,
"reward": -0.24706938862800598,
"reward_std": 0.18499845266342163,
"rewards/cosine_scaled_reward/mean": -0.24706941843032837,
"rewards/cosine_scaled_reward/std": 0.21092188358306885,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.859375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1840.0,
"completions/mean_length": 1925.234375,
"completions/mean_terminated_length": 1175.0,
"completions/min_length": 916.0,
"completions/min_terminated_length": 916.0,
"epoch": 0.04342857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23703666031360626,
"learning_rate": 7.4e-07,
"loss": -0.0,
"num_tokens": 4793866.0,
"reward": -0.11504355818033218,
"reward_std": 0.20660358667373657,
"rewards/cosine_scaled_reward/mean": -0.11504356563091278,
"rewards/cosine_scaled_reward/std": 0.3190351724624634,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.78125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1412.0,
"completions/mean_length": 1740.546875,
"completions/mean_terminated_length": 642.5,
"completions/min_length": 339.0,
"completions/min_terminated_length": 339.0,
"epoch": 0.044571428571428574,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23829001188278198,
"learning_rate": 7.599999999999999e-07,
"loss": 0.0,
"num_tokens": 4916045.0,
"reward": -0.12095541507005692,
"reward_std": 0.1958026885986328,
"rewards/cosine_scaled_reward/mean": -0.12095542997121811,
"rewards/cosine_scaled_reward/std": 0.340241402387619,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1918.0,
"completions/mean_length": 1713.203125,
"completions/mean_terminated_length": 920.26318359375,
"completions/min_length": 451.0,
"completions/min_terminated_length": 451.0,
"epoch": 0.045714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24145744740962982,
"learning_rate": 7.799999999999999e-07,
"loss": -0.0,
"num_tokens": 5035762.0,
"reward": -0.10936243832111359,
"reward_std": 0.14468500018119812,
"rewards/cosine_scaled_reward/mean": -0.10936242341995239,
"rewards/cosine_scaled_reward/std": 0.4288744330406189,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.796875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1801.0,
"completions/mean_length": 1909.71875,
"completions/mean_terminated_length": 1367.2308349609375,
"completions/min_length": 1138.0,
"completions/min_terminated_length": 1138.0,
"epoch": 0.046857142857142854,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22317881882190704,
"learning_rate": 8e-07,
"loss": 0.0,
"num_tokens": 5169136.0,
"reward": -0.2058967649936676,
"reward_std": 0.2325170338153839,
"rewards/cosine_scaled_reward/mean": -0.20589673519134521,
"rewards/cosine_scaled_reward/std": 0.28897321224212646,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.78125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1752.0,
"completions/mean_length": 1727.71875,
"completions/mean_terminated_length": 583.857177734375,
"completions/min_length": 159.0,
"completions/min_terminated_length": 159.0,
"epoch": 0.048,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.44688937067985535,
"learning_rate": 8.199999999999999e-07,
"loss": 0.0,
"num_tokens": 5290070.0,
"reward": -0.2254919707775116,
"reward_std": 0.1687203049659729,
"rewards/cosine_scaled_reward/mean": -0.2254919707775116,
"rewards/cosine_scaled_reward/std": 0.18203677237033844,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.84375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1082.0,
"completions/mean_length": 1855.328125,
"completions/mean_terminated_length": 814.9000244140625,
"completions/min_length": 588.0,
"completions/min_terminated_length": 588.0,
"epoch": 0.04914285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2430828958749771,
"learning_rate": 8.399999999999999e-07,
"loss": 0.0,
"num_tokens": 5420427.0,
"reward": -0.09104865789413452,
"reward_std": 0.18217626214027405,
"rewards/cosine_scaled_reward/mean": -0.09104865789413452,
"rewards/cosine_scaled_reward/std": 0.3521345257759094,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1675.0,
"completions/mean_length": 1727.9375,
"completions/mean_terminated_length": 767.75,
"completions/min_length": 407.0,
"completions/min_terminated_length": 407.0,
"epoch": 0.05028571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.32065215706825256,
"learning_rate": 8.599999999999999e-07,
"loss": 0.0,
"num_tokens": 5541711.0,
"reward": -0.17701950669288635,
"reward_std": 0.2957555055618286,
"rewards/cosine_scaled_reward/mean": -0.17701953649520874,
"rewards/cosine_scaled_reward/std": 0.38460060954093933,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2032.0,
"completions/mean_length": 2013.9375,
"completions/mean_terminated_length": 1321.3333740234375,
"completions/min_length": 740.0,
"completions/min_terminated_length": 740.0,
"epoch": 0.05142857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22363637387752533,
"learning_rate": 8.799999999999999e-07,
"loss": 0.0,
"num_tokens": 5682259.0,
"reward": -0.20341511070728302,
"reward_std": 0.23104795813560486,
"rewards/cosine_scaled_reward/mean": -0.20341511070728302,
"rewards/cosine_scaled_reward/std": 0.3092363774776459,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1224.0,
"completions/mean_length": 1909.0,
"completions/mean_terminated_length": 936.0,
"completions/min_length": 525.0,
"completions/min_terminated_length": 525.0,
"epoch": 0.052571428571428575,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.26306217908859253,
"learning_rate": 9e-07,
"loss": 0.0,
"num_tokens": 5815603.0,
"reward": -0.26145532727241516,
"reward_std": 0.17108051478862762,
"rewards/cosine_scaled_reward/mean": -0.2614552974700928,
"rewards/cosine_scaled_reward/std": 0.18312901258468628,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1668.0,
"completions/mean_length": 1757.1875,
"completions/mean_terminated_length": 884.75,
"completions/min_length": 477.0,
"completions/min_terminated_length": 477.0,
"epoch": 0.053714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2856813371181488,
"learning_rate": 9.2e-07,
"loss": 0.0,
"num_tokens": 5938463.0,
"reward": -0.20879247784614563,
"reward_std": 0.23861759901046753,
"rewards/cosine_scaled_reward/mean": -0.20879246294498444,
"rewards/cosine_scaled_reward/std": 0.39607998728752136,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1708.0,
"completions/mean_length": 1756.5,
"completions/mean_terminated_length": 1011.5555419921875,
"completions/min_length": 487.0,
"completions/min_terminated_length": 487.0,
"epoch": 0.054857142857142854,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.27563413977622986,
"learning_rate": 9.399999999999999e-07,
"loss": -0.0,
"num_tokens": 6061423.0,
"reward": -0.16147920489311218,
"reward_std": 0.24055320024490356,
"rewards/cosine_scaled_reward/mean": -0.16147920489311218,
"rewards/cosine_scaled_reward/std": 0.3948959410190582,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.578125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1458.0,
"completions/mean_length": 1538.078125,
"completions/mean_terminated_length": 839.2963256835938,
"completions/min_length": 284.0,
"completions/min_terminated_length": 284.0,
"epoch": 0.056,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.27617642283439636,
"learning_rate": 9.6e-07,
"loss": -0.0,
"num_tokens": 6169924.0,
"reward": -0.18436825275421143,
"reward_std": 0.27141550183296204,
"rewards/cosine_scaled_reward/mean": -0.18436823785305023,
"rewards/cosine_scaled_reward/std": 0.3920196294784546,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.765625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1938.0,
"completions/mean_length": 1749.0625,
"completions/mean_terminated_length": 772.5333862304688,
"completions/min_length": 235.0,
"completions/min_terminated_length": 235.0,
"epoch": 0.05714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23394836485385895,
"learning_rate": 9.8e-07,
"loss": 0.0,
"num_tokens": 6292680.0,
"reward": -0.10770958662033081,
"reward_std": 0.22513547539710999,
"rewards/cosine_scaled_reward/mean": -0.10770957916975021,
"rewards/cosine_scaled_reward/std": 0.421062707901001,
"step": 50
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 6292680,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}