FF-GRPO / checkpoint-50 /trainer_state.json
LLucass's picture
Training in progress, step 50, checkpoint
2aa5581 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.05714285714285714,
"eval_steps": 500,
"global_step": 50,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.671875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1734.0,
"completions/mean_length": 1702.03125,
"completions/mean_terminated_length": 993.6190795898438,
"completions/min_length": 483.0,
"completions/min_terminated_length": 483.0,
"epoch": 0.001142857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.28377610445022583,
"learning_rate": 0.0,
"loss": -0.0,
"num_tokens": 118418.0,
"reward": -0.09800112247467041,
"reward_std": 0.3028089702129364,
"rewards/cosine_scaled_reward/mean": -0.09800112992525101,
"rewards/cosine_scaled_reward/std": 0.37953105568885803,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1894.0,
"completions/mean_length": 1738.90625,
"completions/mean_terminated_length": 949.0,
"completions/min_length": 435.0,
"completions/min_terminated_length": 435.0,
"epoch": 0.002285714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24221572279930115,
"learning_rate": 2e-08,
"loss": -0.0,
"num_tokens": 239748.0,
"reward": 0.020556632429361343,
"reward_std": 0.3545936942100525,
"rewards/cosine_scaled_reward/mean": 0.020556632429361343,
"rewards/cosine_scaled_reward/std": 0.4492928683757782,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.921875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1542.0,
"completions/mean_length": 1964.078125,
"completions/mean_terminated_length": 973.7999877929688,
"completions/min_length": 733.0,
"completions/min_terminated_length": 733.0,
"epoch": 0.0034285714285714284,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2472974807024002,
"learning_rate": 4e-08,
"loss": 0.0,
"num_tokens": 375921.0,
"reward": -0.20954538881778717,
"reward_std": 0.13813795149326324,
"rewards/cosine_scaled_reward/mean": -0.20954540371894836,
"rewards/cosine_scaled_reward/std": 0.16814909875392914,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.484375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2010.0,
"completions/mean_length": 1555.6875,
"completions/mean_terminated_length": 1093.212158203125,
"completions/min_length": 502.0,
"completions/min_terminated_length": 502.0,
"epoch": 0.004571428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2868657112121582,
"learning_rate": 6e-08,
"loss": -0.0,
"num_tokens": 485293.0,
"reward": -0.12192361056804657,
"reward_std": 0.31710442900657654,
"rewards/cosine_scaled_reward/mean": -0.12192361056804657,
"rewards/cosine_scaled_reward/std": 0.35428565740585327,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1758.0,
"completions/mean_length": 1958.5625,
"completions/mean_terminated_length": 1332.5,
"completions/min_length": 932.0,
"completions/min_terminated_length": 932.0,
"epoch": 0.005714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2460148185491562,
"learning_rate": 8e-08,
"loss": -0.0,
"num_tokens": 621457.0,
"reward": -0.21145480871200562,
"reward_std": 0.14890719950199127,
"rewards/cosine_scaled_reward/mean": -0.21145479381084442,
"rewards/cosine_scaled_reward/std": 0.20399661362171173,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1670.0,
"completions/mean_length": 1908.375,
"completions/mean_terminated_length": 931.0,
"completions/min_length": 593.0,
"completions/min_terminated_length": 593.0,
"epoch": 0.006857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.26549720764160156,
"learning_rate": 1e-07,
"loss": -0.0,
"num_tokens": 755241.0,
"reward": -0.2408866286277771,
"reward_std": 0.16572487354278564,
"rewards/cosine_scaled_reward/mean": -0.2408866286277771,
"rewards/cosine_scaled_reward/std": 0.17492830753326416,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.8125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1955.0,
"completions/mean_length": 1889.296875,
"completions/mean_terminated_length": 1201.5833740234375,
"completions/min_length": 396.0,
"completions/min_terminated_length": 396.0,
"epoch": 0.008,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23518230020999908,
"learning_rate": 1.2e-07,
"loss": 0.0,
"num_tokens": 886564.0,
"reward": -0.16087877750396729,
"reward_std": 0.24579641222953796,
"rewards/cosine_scaled_reward/mean": -0.16087877750396729,
"rewards/cosine_scaled_reward/std": 0.37339961528778076,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1920.0,
"completions/mean_length": 1751.578125,
"completions/mean_terminated_length": 994.0555419921875,
"completions/min_length": 330.0,
"completions/min_terminated_length": 330.0,
"epoch": 0.009142857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2354528158903122,
"learning_rate": 1.4e-07,
"loss": 0.0,
"num_tokens": 1009081.0,
"reward": -0.023812226951122284,
"reward_std": 0.2823081314563751,
"rewards/cosine_scaled_reward/mean": -0.02381223440170288,
"rewards/cosine_scaled_reward/std": 0.4484662115573883,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.9375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1889.0,
"completions/mean_length": 2000.59375,
"completions/mean_terminated_length": 1289.5,
"completions/min_length": 903.0,
"completions/min_terminated_length": 903.0,
"epoch": 0.010285714285714285,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24302220344543457,
"learning_rate": 1.6e-07,
"loss": 0.0,
"num_tokens": 1148575.0,
"reward": -0.2453702688217163,
"reward_std": 0.18811637163162231,
"rewards/cosine_scaled_reward/mean": -0.2453702688217163,
"rewards/cosine_scaled_reward/std": 0.22203005850315094,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1739.0,
"completions/mean_length": 1701.140625,
"completions/mean_terminated_length": 879.631591796875,
"completions/min_length": 484.0,
"completions/min_terminated_length": 484.0,
"epoch": 0.011428571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.25642141699790955,
"learning_rate": 1.8e-07,
"loss": -0.0,
"num_tokens": 1268280.0,
"reward": -0.15177705883979797,
"reward_std": 0.2125300019979477,
"rewards/cosine_scaled_reward/mean": -0.15177705883979797,
"rewards/cosine_scaled_reward/std": 0.3240113854408264,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.890625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1683.0,
"completions/mean_length": 1950.609375,
"completions/mean_terminated_length": 1157.571533203125,
"completions/min_length": 584.0,
"completions/min_terminated_length": 584.0,
"epoch": 0.012571428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24372951686382294,
"learning_rate": 2e-07,
"loss": 0.0,
"num_tokens": 1404791.0,
"reward": -0.23502977192401886,
"reward_std": 0.18896539509296417,
"rewards/cosine_scaled_reward/mean": -0.23502977192401886,
"rewards/cosine_scaled_reward/std": 0.24224351346492767,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.640625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1944.0,
"completions/mean_length": 1751.03125,
"completions/mean_terminated_length": 1221.6522216796875,
"completions/min_length": 489.0,
"completions/min_terminated_length": 489.0,
"epoch": 0.013714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.28422027826309204,
"learning_rate": 2.1999999999999998e-07,
"loss": -0.0,
"num_tokens": 1527801.0,
"reward": -0.14280016720294952,
"reward_std": 0.32843896746635437,
"rewards/cosine_scaled_reward/mean": -0.14280015230178833,
"rewards/cosine_scaled_reward/std": 0.41895967721939087,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1793.0,
"completions/mean_length": 1834.453125,
"completions/mean_terminated_length": 1193.8125,
"completions/min_length": 783.0,
"completions/min_terminated_length": 783.0,
"epoch": 0.014857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24033738672733307,
"learning_rate": 2.4e-07,
"loss": 0.0,
"num_tokens": 1656246.0,
"reward": -0.17057427763938904,
"reward_std": 0.24429668486118317,
"rewards/cosine_scaled_reward/mean": -0.17057427763938904,
"rewards/cosine_scaled_reward/std": 0.27816399931907654,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1474.0,
"completions/mean_length": 1800.65625,
"completions/mean_terminated_length": 1116.823486328125,
"completions/min_length": 495.0,
"completions/min_terminated_length": 495.0,
"epoch": 0.016,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2312558889389038,
"learning_rate": 2.6e-07,
"loss": 0.0,
"num_tokens": 1782096.0,
"reward": -0.11817245185375214,
"reward_std": 0.24491220712661743,
"rewards/cosine_scaled_reward/mean": -0.11817245930433273,
"rewards/cosine_scaled_reward/std": 0.3942086696624756,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1636.0,
"completions/mean_length": 1692.828125,
"completions/mean_terminated_length": 785.1666870117188,
"completions/min_length": 438.0,
"completions/min_terminated_length": 438.0,
"epoch": 0.017142857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2563658654689789,
"learning_rate": 2.8e-07,
"loss": -0.0,
"num_tokens": 1901357.0,
"reward": -0.027107469737529755,
"reward_std": 0.1853453516960144,
"rewards/cosine_scaled_reward/mean": -0.027107462286949158,
"rewards/cosine_scaled_reward/std": 0.4734213352203369,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.018285714285714287,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24149107933044434,
"learning_rate": 3e-07,
"loss": -0.0,
"num_tokens": 2042869.0,
"reward": -0.2542623281478882,
"reward_std": 0.14302438497543335,
"rewards/cosine_scaled_reward/mean": -0.2542623281478882,
"rewards/cosine_scaled_reward/std": 0.160969540476799,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.578125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1827.0,
"completions/mean_length": 1548.75,
"completions/mean_terminated_length": 864.5925903320312,
"completions/min_length": 357.0,
"completions/min_terminated_length": 357.0,
"epoch": 0.019428571428571427,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.31088724732398987,
"learning_rate": 3.2e-07,
"loss": 0.0,
"num_tokens": 2152509.0,
"reward": -0.12113451957702637,
"reward_std": 0.284165620803833,
"rewards/cosine_scaled_reward/mean": -0.12113452702760696,
"rewards/cosine_scaled_reward/std": 0.4259316623210907,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1899.0,
"completions/mean_length": 1793.03125,
"completions/mean_terminated_length": 1028.125,
"completions/min_length": 531.0,
"completions/min_terminated_length": 531.0,
"epoch": 0.02057142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2451843023300171,
"learning_rate": 3.4000000000000003e-07,
"loss": 0.0,
"num_tokens": 2277639.0,
"reward": -0.18317042291164398,
"reward_std": 0.20634235441684723,
"rewards/cosine_scaled_reward/mean": -0.18317043781280518,
"rewards/cosine_scaled_reward/std": 0.27781662344932556,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1633.0,
"completions/mean_length": 1735.984375,
"completions/mean_terminated_length": 997.0,
"completions/min_length": 462.0,
"completions/min_terminated_length": 462.0,
"epoch": 0.021714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24677637219429016,
"learning_rate": 3.6e-07,
"loss": 0.0,
"num_tokens": 2399998.0,
"reward": -0.04996331408619881,
"reward_std": 0.2841629385948181,
"rewards/cosine_scaled_reward/mean": -0.04996330291032791,
"rewards/cosine_scaled_reward/std": 0.4186851680278778,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.640625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1643.0,
"completions/mean_length": 1614.890625,
"completions/mean_terminated_length": 842.8261108398438,
"completions/min_length": 411.0,
"completions/min_terminated_length": 411.0,
"epoch": 0.022857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2543003559112549,
"learning_rate": 3.7999999999999996e-07,
"loss": -0.0,
"num_tokens": 2514703.0,
"reward": -0.09282197058200836,
"reward_std": 0.2568933367729187,
"rewards/cosine_scaled_reward/mean": -0.09282197058200836,
"rewards/cosine_scaled_reward/std": 0.4104878604412079,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1988.0,
"completions/mean_length": 1786.734375,
"completions/mean_terminated_length": 1119.0555419921875,
"completions/min_length": 348.0,
"completions/min_terminated_length": 348.0,
"epoch": 0.024,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3147278130054474,
"learning_rate": 4e-07,
"loss": 0.0,
"num_tokens": 2639862.0,
"reward": -0.16029146313667297,
"reward_std": 0.2322564721107483,
"rewards/cosine_scaled_reward/mean": -0.16029146313667297,
"rewards/cosine_scaled_reward/std": 0.36191171407699585,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1833.0,
"completions/mean_length": 1300.484375,
"completions/mean_terminated_length": 789.0263061523438,
"completions/min_length": 287.0,
"completions/min_terminated_length": 287.0,
"epoch": 0.025142857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.32522445917129517,
"learning_rate": 4.1999999999999995e-07,
"loss": 0.0,
"num_tokens": 2732109.0,
"reward": 0.0033364146947860718,
"reward_std": 0.18878400325775146,
"rewards/cosine_scaled_reward/mean": 0.0033364109694957733,
"rewards/cosine_scaled_reward/std": 0.45390966534614563,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.59375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1920.0,
"completions/mean_length": 1641.03125,
"completions/mean_terminated_length": 1046.2308349609375,
"completions/min_length": 422.0,
"completions/min_terminated_length": 422.0,
"epoch": 0.026285714285714287,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.28244850039482117,
"learning_rate": 4.3999999999999997e-07,
"loss": 0.0,
"num_tokens": 2847927.0,
"reward": -0.21077856421470642,
"reward_std": 0.24399788677692413,
"rewards/cosine_scaled_reward/mean": -0.21077856421470642,
"rewards/cosine_scaled_reward/std": 0.2925592362880707,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1965.0,
"completions/mean_length": 1789.59375,
"completions/mean_terminated_length": 1129.2222900390625,
"completions/min_length": 560.0,
"completions/min_terminated_length": 560.0,
"epoch": 0.027428571428571427,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24896308779716492,
"learning_rate": 4.6e-07,
"loss": -0.0,
"num_tokens": 2973389.0,
"reward": -0.1665852814912796,
"reward_std": 0.307574987411499,
"rewards/cosine_scaled_reward/mean": -0.1665852665901184,
"rewards/cosine_scaled_reward/std": 0.4072873294353485,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1851.0,
"completions/mean_length": 1696.40625,
"completions/mean_terminated_length": 1025.181884765625,
"completions/min_length": 434.0,
"completions/min_terminated_length": 434.0,
"epoch": 0.02857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.262716144323349,
"learning_rate": 4.8e-07,
"loss": 0.0,
"num_tokens": 3092255.0,
"reward": -0.14361324906349182,
"reward_std": 0.3466429114341736,
"rewards/cosine_scaled_reward/mean": -0.14361326396465302,
"rewards/cosine_scaled_reward/std": 0.3933021128177643,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1975.0,
"completions/mean_length": 1973.046875,
"completions/mean_terminated_length": 1448.375,
"completions/min_length": 1035.0,
"completions/min_terminated_length": 1035.0,
"epoch": 0.029714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2365841567516327,
"learning_rate": 5e-07,
"loss": -0.0,
"num_tokens": 3229162.0,
"reward": -0.050574399530887604,
"reward_std": 0.22459164261817932,
"rewards/cosine_scaled_reward/mean": -0.050574399530887604,
"rewards/cosine_scaled_reward/std": 0.37290775775909424,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.796875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1982.0,
"completions/mean_length": 1878.53125,
"completions/mean_terminated_length": 1213.6923828125,
"completions/min_length": 498.0,
"completions/min_terminated_length": 498.0,
"epoch": 0.030857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2821083068847656,
"learning_rate": 5.2e-07,
"loss": 0.0,
"num_tokens": 3359676.0,
"reward": -0.13096781075000763,
"reward_std": 0.26249831914901733,
"rewards/cosine_scaled_reward/mean": -0.13096781075000763,
"rewards/cosine_scaled_reward/std": 0.3478032350540161,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.78125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1933.0,
"completions/mean_length": 1827.453125,
"completions/mean_terminated_length": 1039.7857666015625,
"completions/min_length": 419.0,
"completions/min_terminated_length": 419.0,
"epoch": 0.032,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2539210915565491,
"learning_rate": 5.4e-07,
"loss": 0.0,
"num_tokens": 3486969.0,
"reward": -0.11822876334190369,
"reward_std": 0.2370690554380417,
"rewards/cosine_scaled_reward/mean": -0.11822875589132309,
"rewards/cosine_scaled_reward/std": 0.4236762225627899,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.9375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2047.0,
"completions/mean_length": 2020.5,
"completions/mean_terminated_length": 1608.0,
"completions/min_length": 887.0,
"completions/min_terminated_length": 887.0,
"epoch": 0.03314285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23259545862674713,
"learning_rate": 5.6e-07,
"loss": -0.0,
"num_tokens": 3626753.0,
"reward": -0.20220182836055756,
"reward_std": 0.15910759568214417,
"rewards/cosine_scaled_reward/mean": -0.20220182836055756,
"rewards/cosine_scaled_reward/std": 0.20781411230564117,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.828125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1662.0,
"completions/mean_length": 1903.703125,
"completions/mean_terminated_length": 1208.45458984375,
"completions/min_length": 961.0,
"completions/min_terminated_length": 961.0,
"epoch": 0.03428571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24027252197265625,
"learning_rate": 5.8e-07,
"loss": 0.0,
"num_tokens": 3759126.0,
"reward": -0.19193249940872192,
"reward_std": 0.24584847688674927,
"rewards/cosine_scaled_reward/mean": -0.19193249940872192,
"rewards/cosine_scaled_reward/std": 0.28378522396087646,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.796875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1922.0,
"completions/mean_length": 1847.34375,
"completions/mean_terminated_length": 1060.1539306640625,
"completions/min_length": 311.0,
"completions/min_terminated_length": 311.0,
"epoch": 0.03542857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2703397274017334,
"learning_rate": 6e-07,
"loss": -0.0,
"num_tokens": 3887852.0,
"reward": -0.25379180908203125,
"reward_std": 0.24661941826343536,
"rewards/cosine_scaled_reward/mean": -0.25379180908203125,
"rewards/cosine_scaled_reward/std": 0.29188498854637146,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.828125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1995.0,
"completions/mean_length": 1950.3125,
"completions/mean_terminated_length": 1479.6363525390625,
"completions/min_length": 766.0,
"completions/min_terminated_length": 766.0,
"epoch": 0.036571428571428574,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21763876080513,
"learning_rate": 6.2e-07,
"loss": -0.0,
"num_tokens": 4023024.0,
"reward": -0.16017228364944458,
"reward_std": 0.2255343496799469,
"rewards/cosine_scaled_reward/mean": -0.16017228364944458,
"rewards/cosine_scaled_reward/std": 0.3709539771080017,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1957.0,
"completions/mean_length": 1996.28125,
"completions/mean_terminated_length": 1634.25,
"completions/min_length": 1237.0,
"completions/min_terminated_length": 1237.0,
"epoch": 0.037714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22758260369300842,
"learning_rate": 6.4e-07,
"loss": -0.0,
"num_tokens": 4162002.0,
"reward": -0.20318198204040527,
"reward_std": 0.18396919965744019,
"rewards/cosine_scaled_reward/mean": -0.20318198204040527,
"rewards/cosine_scaled_reward/std": 0.34913352131843567,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.578125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1850.0,
"completions/mean_length": 1703.265625,
"completions/mean_terminated_length": 1230.851806640625,
"completions/min_length": 651.0,
"completions/min_terminated_length": 651.0,
"epoch": 0.038857142857142854,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.31658875942230225,
"learning_rate": 6.6e-07,
"loss": -0.0,
"num_tokens": 4280563.0,
"reward": -0.05977274850010872,
"reward_std": 0.30437377095222473,
"rewards/cosine_scaled_reward/mean": -0.059772733598947525,
"rewards/cosine_scaled_reward/std": 0.4424094259738922,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.8125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1846.0,
"completions/mean_length": 1807.546875,
"completions/mean_terminated_length": 765.5833740234375,
"completions/min_length": 419.0,
"completions/min_terminated_length": 419.0,
"epoch": 0.04,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2792847156524658,
"learning_rate": 6.800000000000001e-07,
"loss": -0.0,
"num_tokens": 4407742.0,
"reward": -0.18658886849880219,
"reward_std": 0.2910658121109009,
"rewards/cosine_scaled_reward/mean": -0.18658888339996338,
"rewards/cosine_scaled_reward/std": 0.34802255034446716,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.921875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1912.0,
"completions/mean_length": 1995.65625,
"completions/mean_terminated_length": 1378.0,
"completions/min_length": 1090.0,
"completions/min_terminated_length": 1090.0,
"epoch": 0.04114285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23547738790512085,
"learning_rate": 7e-07,
"loss": 0.0,
"num_tokens": 4546576.0,
"reward": -0.23918019235134125,
"reward_std": 0.19598917663097382,
"rewards/cosine_scaled_reward/mean": -0.23918019235134125,
"rewards/cosine_scaled_reward/std": 0.2425125539302826,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.90625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2015.0,
"completions/mean_length": 1994.75,
"completions/mean_terminated_length": 1480.0,
"completions/min_length": 545.0,
"completions/min_terminated_length": 545.0,
"epoch": 0.04228571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22962674498558044,
"learning_rate": 7.2e-07,
"loss": -0.0,
"num_tokens": 4685264.0,
"reward": -0.25335729122161865,
"reward_std": 0.15323391556739807,
"rewards/cosine_scaled_reward/mean": -0.25335729122161865,
"rewards/cosine_scaled_reward/std": 0.17556406557559967,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.890625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1934.0,
"completions/mean_length": 1957.484375,
"completions/mean_terminated_length": 1220.4285888671875,
"completions/min_length": 965.0,
"completions/min_terminated_length": 965.0,
"epoch": 0.04342857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24781912565231323,
"learning_rate": 7.4e-07,
"loss": -0.0,
"num_tokens": 4822255.0,
"reward": -0.13536512851715088,
"reward_std": 0.19208545982837677,
"rewards/cosine_scaled_reward/mean": -0.13536511361598969,
"rewards/cosine_scaled_reward/std": 0.30052343010902405,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1824.0,
"completions/mean_length": 1744.421875,
"completions/mean_terminated_length": 833.6875,
"completions/min_length": 317.0,
"completions/min_terminated_length": 317.0,
"epoch": 0.044571428571428574,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2562144994735718,
"learning_rate": 7.599999999999999e-07,
"loss": -0.0,
"num_tokens": 4944682.0,
"reward": -0.041110455989837646,
"reward_std": 0.21381449699401855,
"rewards/cosine_scaled_reward/mean": -0.04111045226454735,
"rewards/cosine_scaled_reward/std": 0.35980772972106934,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2043.0,
"completions/mean_length": 1774.359375,
"completions/mean_terminated_length": 1017.8235473632812,
"completions/min_length": 445.0,
"completions/min_terminated_length": 445.0,
"epoch": 0.045714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.25478634238243103,
"learning_rate": 7.799999999999999e-07,
"loss": 0.0,
"num_tokens": 5068313.0,
"reward": -0.12165145576000214,
"reward_std": 0.17204006016254425,
"rewards/cosine_scaled_reward/mean": -0.12165144830942154,
"rewards/cosine_scaled_reward/std": 0.4099982678890228,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.640625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1991.0,
"completions/mean_length": 1814.375,
"completions/mean_terminated_length": 1397.9130859375,
"completions/min_length": 968.0,
"completions/min_terminated_length": 968.0,
"epoch": 0.046857142857142854,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21750310063362122,
"learning_rate": 8e-07,
"loss": 0.0,
"num_tokens": 5195585.0,
"reward": -0.25668060779571533,
"reward_std": 0.2832298278808594,
"rewards/cosine_scaled_reward/mean": -0.25668060779571533,
"rewards/cosine_scaled_reward/std": 0.3347759544849396,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.765625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1764.0,
"completions/mean_length": 1714.59375,
"completions/mean_terminated_length": 625.4666748046875,
"completions/min_length": 186.0,
"completions/min_terminated_length": 186.0,
"epoch": 0.048,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.34486907720565796,
"learning_rate": 8.199999999999999e-07,
"loss": -0.0,
"num_tokens": 5315679.0,
"reward": -0.2253742218017578,
"reward_std": 0.1778060495853424,
"rewards/cosine_scaled_reward/mean": -0.22537420690059662,
"rewards/cosine_scaled_reward/std": 0.19647939503192902,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.828125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1638.0,
"completions/mean_length": 1863.78125,
"completions/mean_terminated_length": 976.1818237304688,
"completions/min_length": 669.0,
"completions/min_terminated_length": 669.0,
"epoch": 0.04914285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23907455801963806,
"learning_rate": 8.399999999999999e-07,
"loss": 0.0,
"num_tokens": 5446577.0,
"reward": -0.1142776757478714,
"reward_std": 0.21804723143577576,
"rewards/cosine_scaled_reward/mean": -0.1142776757478714,
"rewards/cosine_scaled_reward/std": 0.3637608587741852,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1920.0,
"completions/mean_length": 1771.125,
"completions/mean_terminated_length": 940.5,
"completions/min_length": 344.0,
"completions/min_terminated_length": 344.0,
"epoch": 0.05028571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2888188362121582,
"learning_rate": 8.599999999999999e-07,
"loss": 0.0,
"num_tokens": 5570625.0,
"reward": -0.11845305562019348,
"reward_std": 0.2729855477809906,
"rewards/cosine_scaled_reward/mean": -0.11845306307077408,
"rewards/cosine_scaled_reward/std": 0.4279690086841583,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.96875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1296.0,
"completions/mean_length": 2020.859375,
"completions/mean_terminated_length": 1179.5,
"completions/min_length": 1063.0,
"completions/min_terminated_length": 1063.0,
"epoch": 0.05142857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2232045829296112,
"learning_rate": 8.799999999999999e-07,
"loss": 0.0,
"num_tokens": 5711616.0,
"reward": -0.1830526441335678,
"reward_std": 0.20074567198753357,
"rewards/cosine_scaled_reward/mean": -0.1830526441335678,
"rewards/cosine_scaled_reward/std": 0.3221423327922821,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.828125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1121.0,
"completions/mean_length": 1843.328125,
"completions/mean_terminated_length": 857.1818237304688,
"completions/min_length": 608.0,
"completions/min_terminated_length": 608.0,
"epoch": 0.052571428571428575,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2569328844547272,
"learning_rate": 9e-07,
"loss": 0.0,
"num_tokens": 5840757.0,
"reward": -0.21247822046279907,
"reward_std": 0.17188501358032227,
"rewards/cosine_scaled_reward/mean": -0.21247822046279907,
"rewards/cosine_scaled_reward/std": 0.183182492852211,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2047.0,
"completions/mean_length": 1772.984375,
"completions/mean_terminated_length": 1012.6470336914062,
"completions/min_length": 461.0,
"completions/min_terminated_length": 461.0,
"epoch": 0.053714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2800576090812683,
"learning_rate": 9.2e-07,
"loss": -0.0,
"num_tokens": 5964628.0,
"reward": -0.1755329668521881,
"reward_std": 0.19662824273109436,
"rewards/cosine_scaled_reward/mean": -0.1755329668521881,
"rewards/cosine_scaled_reward/std": 0.3987559974193573,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1949.0,
"completions/mean_length": 1787.046875,
"completions/mean_terminated_length": 1120.1666259765625,
"completions/min_length": 630.0,
"completions/min_terminated_length": 630.0,
"epoch": 0.054857142857142854,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2499135434627533,
"learning_rate": 9.399999999999999e-07,
"loss": -0.0,
"num_tokens": 6089543.0,
"reward": -0.07469595968723297,
"reward_std": 0.2802818715572357,
"rewards/cosine_scaled_reward/mean": -0.07469595968723297,
"rewards/cosine_scaled_reward/std": 0.39331451058387756,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.578125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1818.0,
"completions/mean_length": 1611.65625,
"completions/mean_terminated_length": 1013.7037353515625,
"completions/min_length": 298.0,
"completions/min_terminated_length": 298.0,
"epoch": 0.056,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2976716160774231,
"learning_rate": 9.6e-07,
"loss": -0.0,
"num_tokens": 6202753.0,
"reward": -0.14219576120376587,
"reward_std": 0.3252427875995636,
"rewards/cosine_scaled_reward/mean": -0.14219576120376587,
"rewards/cosine_scaled_reward/std": 0.41946855187416077,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.828125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1916.0,
"completions/mean_length": 1826.90625,
"completions/mean_terminated_length": 761.6364135742188,
"completions/min_length": 341.0,
"completions/min_terminated_length": 341.0,
"epoch": 0.05714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2344626933336258,
"learning_rate": 9.8e-07,
"loss": -0.0,
"num_tokens": 6330491.0,
"reward": -0.098542720079422,
"reward_std": 0.20483215153217316,
"rewards/cosine_scaled_reward/mean": -0.0985427126288414,
"rewards/cosine_scaled_reward/std": 0.396296888589859,
"step": 50
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 6330491,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}