adapter-211 / trainer_state.json
hongyu05's picture
Upload folder using huggingface_hub
ec85541 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.0008,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1790.0,
"completions/max_terminated_length": 1790.0,
"completions/mean_length": 182.1484375,
"completions/mean_terminated_length": 182.1484375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.28799160569906235,
"epoch": 8e-06,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.015653066337108612,
"kl": 9.092812547351059e-07,
"learning_rate": 0.0,
"loss": 0.0006,
"num_tokens": 572435.0,
"reward": 0.5309156775474548,
"reward_std": 0.4604809284210205,
"rewards/reward_func/mean": 0.5309156775474548,
"rewards/reward_func/std": 0.4604808986186981,
"sampling/importance_sampling_ratio/max": 1.9294580221176147,
"sampling/importance_sampling_ratio/mean": 1.009232997894287,
"sampling/importance_sampling_ratio/min": 0.34108418226242065,
"sampling/sampling_logp_difference/max": 0.4118213653564453,
"sampling/sampling_logp_difference/mean": 0.006867324002087116,
"step": 1,
"step_time": 92.00019569275901
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.2680463492870331,
"epoch": 1.6e-05,
"grad_norm": 0.030799396336078644,
"kl": 3.06405117722619e-07,
"learning_rate": 5e-06,
"loss": -0.0008,
"step": 2,
"step_time": 32.768778500845656
},
{
"clip_ratio/high_max": 0.0007134150364436209,
"clip_ratio/high_mean": 8.917687955545262e-05,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 8.917687955545262e-05,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5461.0,
"completions/max_terminated_length": 5461.0,
"completions/mean_length": 492.9140625,
"completions/mean_terminated_length": 492.9140625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.2738131880760193,
"epoch": 2.4e-05,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.04047725349664688,
"kl": 0.00043132787686772645,
"learning_rate": 1e-05,
"loss": 0.0128,
"num_tokens": 1023912.0,
"reward": 0.36626869440078735,
"reward_std": 0.465108722448349,
"rewards/reward_func/mean": 0.36626869440078735,
"rewards/reward_func/std": 0.465108722448349,
"sampling/importance_sampling_ratio/max": 2.1430017948150635,
"sampling/importance_sampling_ratio/mean": 0.9558022022247314,
"sampling/importance_sampling_ratio/min": 0.020946042612195015,
"sampling/sampling_logp_difference/max": 1.0668578147888184,
"sampling/sampling_logp_difference/mean": 0.012038183398544788,
"step": 3,
"step_time": 149.4881290521007
},
{
"clip_ratio/high_max": 0.000916204895474948,
"clip_ratio/high_mean": 0.0001145256119343685,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0001145256119343685,
"entropy": 0.3138197138905525,
"epoch": 3.2e-05,
"grad_norm": 0.022033225744962692,
"kl": 0.0005088059915578924,
"learning_rate": 1.5e-05,
"loss": -0.025,
"step": 4,
"step_time": 58.02533454587683
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 8909.0,
"completions/mean_length": 1400.8125,
"completions/mean_terminated_length": 401.933349609375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.3101617470383644,
"epoch": 4e-05,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.0025309158954769373,
"kl": 0.0005323870427673683,
"learning_rate": 2e-05,
"loss": -0.0,
"num_tokens": 1820264.0,
"reward": 0.4536225199699402,
"reward_std": 0.49777504801750183,
"rewards/reward_func/mean": 0.4536225199699402,
"rewards/reward_func/std": 0.49777501821517944,
"sampling/importance_sampling_ratio/max": 1.2027839422225952,
"sampling/importance_sampling_ratio/mean": 0.8852319717407227,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.777749061584473,
"sampling/sampling_logp_difference/mean": 0.01063137874007225,
"step": 5,
"step_time": 239.55608439911157
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0052083334885537624,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0052083334885537624,
"entropy": 0.3320774510502815,
"epoch": 4.8e-05,
"grad_norm": 0.002884262939915061,
"kl": 0.0017297266749665141,
"learning_rate": 2.5e-05,
"loss": 0.0,
"step": 6,
"step_time": 59.483061871957034
},
{
"clip_ratio/high_max": 0.0013319647405296564,
"clip_ratio/high_mean": 0.00016649559256620705,
"clip_ratio/low_mean": 0.00011880823876708746,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0002853038313332945,
"completions/clipped_ratio": 0.0859375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12890.0,
"completions/mean_length": 2285.6953125,
"completions/mean_terminated_length": 960.2137451171875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.28958937525749207,
"epoch": 5.6e-05,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.0031948399264365435,
"kl": 0.0007329104555537924,
"learning_rate": 3e-05,
"loss": -0.0075,
"num_tokens": 2596929.0,
"reward": 0.37591269612312317,
"reward_std": 0.46268102526664734,
"rewards/reward_func/mean": 0.37591269612312317,
"rewards/reward_func/std": 0.46268102526664734,
"sampling/importance_sampling_ratio/max": 1.3332258462905884,
"sampling/importance_sampling_ratio/mean": 0.7790708541870117,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.7908563613891602,
"sampling/sampling_logp_difference/mean": 0.011350465007126331,
"step": 7,
"step_time": 291.5669959378429
},
{
"clip_ratio/high_max": 0.06269620433158707,
"clip_ratio/high_mean": 0.007905740383648663,
"clip_ratio/low_mean": 3.790311166085303e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007943643497128505,
"entropy": 0.27083906158804893,
"epoch": 6.4e-05,
"grad_norm": 0.0021086863707751036,
"kl": 0.0014875386259518564,
"learning_rate": 3.5e-05,
"loss": -0.0005,
"step": 8,
"step_time": 86.35025516920723
},
{
"clip_ratio/high_max": 0.0006997402815613896,
"clip_ratio/high_mean": 0.00015431304200319573,
"clip_ratio/low_mean": 0.00010484526228538016,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0002591583006505971,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14202.0,
"completions/mean_length": 2823.390625,
"completions/mean_terminated_length": 1797.7984619140625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.14772238209843636,
"epoch": 7.2e-05,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.006001872010529041,
"kl": 0.000943778213695623,
"learning_rate": 4e-05,
"loss": 0.0108,
"num_tokens": 3207611.0,
"reward": 0.5036642551422119,
"reward_std": 0.49157199263572693,
"rewards/reward_func/mean": 0.5036642551422119,
"rewards/reward_func/std": 0.49157199263572693,
"sampling/importance_sampling_ratio/max": 1.5270230770111084,
"sampling/importance_sampling_ratio/mean": 0.7227185964584351,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.1080689430236816,
"sampling/sampling_logp_difference/mean": 0.004280484281480312,
"step": 9,
"step_time": 252.9167389899958
},
{
"clip_ratio/high_max": 0.001605564437340945,
"clip_ratio/high_mean": 0.00024234261945821345,
"clip_ratio/low_mean": 9.370225598104298e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00033604487907723524,
"entropy": 0.19918649271130562,
"epoch": 8e-05,
"grad_norm": 0.0017423235112801194,
"kl": 0.0017632123199291527,
"learning_rate": 4.5e-05,
"loss": 0.0011,
"step": 10,
"step_time": 71.32449517771602
},
{
"clip_ratio/high_max": 0.0016184170162887312,
"clip_ratio/high_mean": 0.0002023021270360914,
"clip_ratio/low_mean": 2.689328721316997e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00022919541515875608,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15886.0,
"completions/mean_length": 1933.03125,
"completions/mean_terminated_length": 708.3728637695312,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.283096544444561,
"epoch": 8.8e-05,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.005225648172199726,
"kl": 0.020552616333588958,
"learning_rate": 5e-05,
"loss": -0.0047,
"num_tokens": 3911607.0,
"reward": 0.3723485767841339,
"reward_std": 0.45915600657463074,
"rewards/reward_func/mean": 0.3723485767841339,
"rewards/reward_func/std": 0.45915600657463074,
"sampling/importance_sampling_ratio/max": 2.6595818996429443,
"sampling/importance_sampling_ratio/mean": 0.8330748081207275,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 4.817470550537109,
"sampling/sampling_logp_difference/mean": 0.018553579226136208,
"step": 11,
"step_time": 305.5446167134214
},
{
"clip_ratio/high_max": 0.11326734800240956,
"clip_ratio/high_mean": 0.014203241193172289,
"clip_ratio/low_mean": 0.02032394427806139,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03452718561311485,
"entropy": 0.31704550981521606,
"epoch": 9.6e-05,
"grad_norm": 0.0021242008078843355,
"kl": 0.041884748614393175,
"learning_rate": 5.500000000000001e-05,
"loss": -0.0024,
"step": 12,
"step_time": 100.18367045000196
},
{
"clip_ratio/high_max": 0.05000000074505806,
"clip_ratio/high_mean": 0.0062500000931322575,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0062500000931322575,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14215.0,
"completions/mean_length": 994.609375,
"completions/mean_terminated_length": 498.1773986816406,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.21508953720331192,
"epoch": 0.000104,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.008786521852016449,
"kl": 0.010858730238396674,
"learning_rate": 6e-05,
"loss": 0.0112,
"num_tokens": 4508533.0,
"reward": 0.49886971712112427,
"reward_std": 0.479078471660614,
"rewards/reward_func/mean": 0.49886971712112427,
"rewards/reward_func/std": 0.479078471660614,
"sampling/importance_sampling_ratio/max": 1.2790054082870483,
"sampling/importance_sampling_ratio/mean": 0.9344986081123352,
"sampling/importance_sampling_ratio/min": 1.8428319634167245e-11,
"sampling/sampling_logp_difference/max": 1.4877722263336182,
"sampling/sampling_logp_difference/mean": 0.008521802723407745,
"step": 13,
"step_time": 443.75575554184616
},
{
"clip_ratio/high_max": 0.20842555643321248,
"clip_ratio/high_mean": 0.031284590383620525,
"clip_ratio/low_mean": 0.03960632954840548,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.07089091930538416,
"entropy": 0.18631769344210625,
"epoch": 0.000112,
"grad_norm": 0.003839960554614663,
"kl": 0.3403822723776102,
"learning_rate": 6.500000000000001e-05,
"loss": -0.0202,
"step": 14,
"step_time": 85.95637208526023
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0001768385773175396,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0001768385773175396,
"completions/clipped_ratio": 0.0,
"completions/max_length": 5468.0,
"completions/max_terminated_length": 5468.0,
"completions/mean_length": 578.9375,
"completions/mean_terminated_length": 578.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.159299585968256,
"epoch": 0.00012,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.012311691418290138,
"kl": 0.01162221294362098,
"learning_rate": 7e-05,
"loss": 0.0128,
"num_tokens": 5117069.0,
"reward": 0.5088721513748169,
"reward_std": 0.4710608422756195,
"rewards/reward_func/mean": 0.5088721513748169,
"rewards/reward_func/std": 0.4710608124732971,
"sampling/importance_sampling_ratio/max": 2.0296452045440674,
"sampling/importance_sampling_ratio/mean": 0.9467288255691528,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 6.81820821762085,
"sampling/sampling_logp_difference/mean": 0.011304730549454689,
"step": 15,
"step_time": 139.54756120592356
},
{
"clip_ratio/high_max": 0.1666666716337204,
"clip_ratio/high_mean": 0.02633665595203638,
"clip_ratio/low_mean": 0.027520230985828675,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.053856888320297,
"entropy": 0.24024979025125504,
"epoch": 0.000128,
"grad_norm": 0.004696316551417112,
"kl": 0.04927169228903949,
"learning_rate": 7.500000000000001e-05,
"loss": 0.0008,
"step": 16,
"step_time": 51.39879226591438
},
{
"clip_ratio/high_max": 0.001534179231384769,
"clip_ratio/high_mean": 0.00019177240392309614,
"clip_ratio/low_mean": 5.4063617426436394e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00024583602498751134,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12325.0,
"completions/mean_length": 2109.71875,
"completions/mean_terminated_length": 900.0338745117188,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.259117666631937,
"epoch": 0.000136,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.0027392900083214045,
"kl": 0.022709450451657176,
"learning_rate": 8e-05,
"loss": -0.0015,
"num_tokens": 5846785.0,
"reward": 0.5021514892578125,
"reward_std": 0.4946684241294861,
"rewards/reward_func/mean": 0.5021514892578125,
"rewards/reward_func/std": 0.4946684241294861,
"sampling/importance_sampling_ratio/max": 1.3155046701431274,
"sampling/importance_sampling_ratio/mean": 0.8164673447608948,
"sampling/importance_sampling_ratio/min": 3.531794431563262e-12,
"sampling/sampling_logp_difference/max": 1.9771251678466797,
"sampling/sampling_logp_difference/mean": 0.010305984877049923,
"step": 17,
"step_time": 405.8358749570325
},
{
"clip_ratio/high_max": 0.1666666716337204,
"clip_ratio/high_mean": 0.021097486838698387,
"clip_ratio/low_mean": 0.011532710865139961,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03263019863516092,
"entropy": 0.1985614150762558,
"epoch": 0.000144,
"grad_norm": 0.005495882593095303,
"kl": 0.03126369323581457,
"learning_rate": 8.5e-05,
"loss": -0.0155,
"step": 18,
"step_time": 142.6410668361932
},
{
"clip_ratio/high_max": 0.0031972584838513285,
"clip_ratio/high_mean": 0.0005761296160926577,
"clip_ratio/low_mean": 0.0004054550954606384,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0009815847006393597,
"completions/clipped_ratio": 0.1796875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 16000.0,
"completions/mean_length": 4290.2109375,
"completions/mean_terminated_length": 1641.0953369140625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.09939680807292461,
"epoch": 0.000152,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.004325419198721647,
"kl": 0.025787187332753092,
"learning_rate": 9e-05,
"loss": 0.0035,
"num_tokens": 6617028.0,
"reward": 0.4427623152732849,
"reward_std": 0.4857458472251892,
"rewards/reward_func/mean": 0.4427623152732849,
"rewards/reward_func/std": 0.4857458472251892,
"sampling/importance_sampling_ratio/max": 1.2794967889785767,
"sampling/importance_sampling_ratio/mean": 0.6535032987594604,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 3.10308837890625,
"sampling/sampling_logp_difference/mean": 0.007126981392502785,
"step": 19,
"step_time": 253.69875198067166
},
{
"clip_ratio/high_max": 0.002380200894549489,
"clip_ratio/high_mean": 0.0004641784689738415,
"clip_ratio/low_mean": 0.006076709658373147,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006540888105519116,
"entropy": 0.10411721095442772,
"epoch": 0.00016,
"grad_norm": 0.0034780765417963266,
"kl": 0.08655104972422123,
"learning_rate": 9.5e-05,
"loss": -0.0049,
"step": 20,
"step_time": 54.08613259694539
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.0052374619990587234,
"clip_ratio/low_mean": 7.05718994140625e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005308033898472786,
"completions/clipped_ratio": 0.140625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14426.0,
"completions/mean_length": 2772.7421875,
"completions/mean_terminated_length": 545.4454345703125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.26159290969371796,
"epoch": 0.000168,
"frac_reward_zero_std": 0.6875,
"grad_norm": 0.0016806161729618907,
"kl": 0.13244479056447744,
"learning_rate": 0.0001,
"loss": -0.0032,
"num_tokens": 7608123.0,
"reward": 0.30994826555252075,
"reward_std": 0.4333060681819916,
"rewards/reward_func/mean": 0.30994826555252075,
"rewards/reward_func/std": 0.4333060681819916,
"sampling/importance_sampling_ratio/max": 2.071073532104492,
"sampling/importance_sampling_ratio/mean": 0.8328644633293152,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.6652194261550903,
"sampling/sampling_logp_difference/mean": 0.013313735835254192,
"step": 21,
"step_time": 408.48256488214247
},
{
"clip_ratio/high_max": 0.0009995178115786985,
"clip_ratio/high_mean": 0.0001249397264473373,
"clip_ratio/low_mean": 0.01684358110651374,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01696852078748634,
"entropy": 0.21135510131716728,
"epoch": 0.000176,
"grad_norm": 0.011167092248797417,
"kl": 0.07416732516139746,
"learning_rate": 0.0001,
"loss": -0.0403,
"step": 22,
"step_time": 148.2932638968341
},
{
"clip_ratio/high_max": 0.0006121824262663722,
"clip_ratio/high_mean": 7.652280328329653e-05,
"clip_ratio/low_mean": 0.0001096606720238924,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00018618347530718893,
"completions/clipped_ratio": 0.171875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 16332.0,
"completions/mean_length": 3878.53125,
"completions/mean_terminated_length": 1283.056640625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.17607736214995384,
"epoch": 0.000184,
"frac_reward_zero_std": 0.6875,
"grad_norm": 0.0007014994043856859,
"kl": 0.027259970782324672,
"learning_rate": 0.0001,
"loss": -0.0,
"num_tokens": 8549975.0,
"reward": 0.4119563102722168,
"reward_std": 0.4680332541465759,
"rewards/reward_func/mean": 0.4119563102722168,
"rewards/reward_func/std": 0.4680332839488983,
"sampling/importance_sampling_ratio/max": 1.453569769859314,
"sampling/importance_sampling_ratio/mean": 0.7403019666671753,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.4103574752807617,
"sampling/sampling_logp_difference/mean": 0.00972401350736618,
"step": 23,
"step_time": 281.3584946768824
},
{
"clip_ratio/high_max": 0.0006908245850354433,
"clip_ratio/high_mean": 8.635307312943041e-05,
"clip_ratio/low_mean": 0.00011251296746195294,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00019886604059138335,
"entropy": 0.21833522990345955,
"epoch": 0.000192,
"grad_norm": 0.0062239160761237144,
"kl": 0.06342287547886372,
"learning_rate": 0.0001,
"loss": 0.0208,
"step": 24,
"step_time": 83.19132332946174
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3324.0,
"completions/max_terminated_length": 3324.0,
"completions/mean_length": 350.3125,
"completions/mean_terminated_length": 350.3125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.21079584956169128,
"epoch": 0.0002,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.01816585287451744,
"kl": 0.047579593025147915,
"learning_rate": 0.0001,
"loss": -0.0,
"num_tokens": 9024447.0,
"reward": 0.5932583212852478,
"reward_std": 0.4719974398612976,
"rewards/reward_func/mean": 0.5932583212852478,
"rewards/reward_func/std": 0.4719974100589752,
"sampling/importance_sampling_ratio/max": 2.610856533050537,
"sampling/importance_sampling_ratio/mean": 0.9387929439544678,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.8608701229095459,
"sampling/sampling_logp_difference/mean": 0.012685808353126049,
"step": 25,
"step_time": 86.05315418587998
},
{
"clip_ratio/high_max": 0.2500000037252903,
"clip_ratio/high_mean": 0.06541509041562676,
"clip_ratio/low_mean": 0.0699066836386919,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.13532177731394768,
"entropy": 0.16260286793112755,
"epoch": 0.000208,
"grad_norm": 0.009579629637300968,
"kl": 1.1700078528374434,
"learning_rate": 0.0001,
"loss": -0.0004,
"step": 26,
"step_time": 34.18811777303927
},
{
"clip_ratio/high_max": 0.004500097595155239,
"clip_ratio/high_mean": 0.0007490973512176424,
"clip_ratio/low_mean": 2.6662485652195755e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0007757598377793329,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11635.0,
"completions/mean_length": 2257.7578125,
"completions/mean_terminated_length": 1060.61865234375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.1516659539192915,
"epoch": 0.000216,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.0051688519306480885,
"kl": 0.047932930290699005,
"learning_rate": 0.0001,
"loss": 0.0004,
"num_tokens": 9745576.0,
"reward": 0.46375519037246704,
"reward_std": 0.48423123359680176,
"rewards/reward_func/mean": 0.46375519037246704,
"rewards/reward_func/std": 0.48423123359680176,
"sampling/importance_sampling_ratio/max": 2.536548376083374,
"sampling/importance_sampling_ratio/mean": 0.8151211738586426,
"sampling/importance_sampling_ratio/min": 5.257729753793683e-06,
"sampling/sampling_logp_difference/max": 6.129981994628906,
"sampling/sampling_logp_difference/mean": 0.005175800062716007,
"step": 27,
"step_time": 436.48245814908296
},
{
"clip_ratio/high_max": 0.08695227152202278,
"clip_ratio/high_mean": 0.011061059150961228,
"clip_ratio/low_mean": 0.010714802792790579,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.021775862463982776,
"entropy": 0.13691149465739727,
"epoch": 0.000224,
"grad_norm": 0.024410562589764595,
"kl": 0.03237813455052674,
"learning_rate": 0.0001,
"loss": 0.0973,
"step": 28,
"step_time": 156.84757056180388
},
{
"clip_ratio/high_max": 0.0003620828501880169,
"clip_ratio/high_mean": 6.965760076127481e-05,
"clip_ratio/low_mean": 0.0001634064483369002,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00023306404909817502,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15615.0,
"completions/mean_length": 2270.578125,
"completions/mean_terminated_length": 1203.176513671875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.25759194791316986,
"epoch": 0.000232,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.012714912183582783,
"kl": 0.10287779942154884,
"learning_rate": 0.0001,
"loss": -0.0209,
"num_tokens": 10600314.0,
"reward": 0.4594896733760834,
"reward_std": 0.48041772842407227,
"rewards/reward_func/mean": 0.4594896733760834,
"rewards/reward_func/std": 0.4804176688194275,
"sampling/importance_sampling_ratio/max": 2.5663199424743652,
"sampling/importance_sampling_ratio/mean": 0.8002752065658569,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.7162349224090576,
"sampling/sampling_logp_difference/mean": 0.015552837401628494,
"step": 29,
"step_time": 263.5061617055908
},
{
"clip_ratio/high_max": 0.04263468802673742,
"clip_ratio/high_mean": 0.005374936816224363,
"clip_ratio/low_mean": 0.01608989532542182,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.021464832054334693,
"entropy": 0.2745072916150093,
"epoch": 0.00024,
"grad_norm": 0.0038172281347215176,
"kl": 0.09374767541885376,
"learning_rate": 0.0001,
"loss": -0.0059,
"step": 30,
"step_time": 65.82626755977981
},
{
"clip_ratio/high_max": 0.0027790770400315523,
"clip_ratio/high_mean": 0.00034738463000394404,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00034738463000394404,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2434.0,
"completions/max_terminated_length": 2434.0,
"completions/mean_length": 264.4453125,
"completions/mean_terminated_length": 264.4453125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.23240000382065773,
"epoch": 0.000248,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.018018538132309914,
"kl": 0.07072961144149303,
"learning_rate": 0.0001,
"loss": -0.0044,
"num_tokens": 11270435.0,
"reward": 0.5666041970252991,
"reward_std": 0.49271145462989807,
"rewards/reward_func/mean": 0.5666041970252991,
"rewards/reward_func/std": 0.4927114248275757,
"sampling/importance_sampling_ratio/max": 1.8200021982192993,
"sampling/importance_sampling_ratio/mean": 0.97586989402771,
"sampling/importance_sampling_ratio/min": 0.1302126795053482,
"sampling/sampling_logp_difference/max": 0.7188519239425659,
"sampling/sampling_logp_difference/mean": 0.010895353741943836,
"step": 31,
"step_time": 103.47118871309794
},
{
"clip_ratio/high_max": 0.22714198799803853,
"clip_ratio/high_mean": 0.06246224191272631,
"clip_ratio/low_mean": 0.05428445339202881,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.11674669571220875,
"entropy": 0.202346783131361,
"epoch": 0.000256,
"grad_norm": 0.03462144732475281,
"kl": 0.26291508600115776,
"learning_rate": 0.0001,
"loss": -0.0331,
"step": 32,
"step_time": 44.26117577217519
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 1344.0,
"completions/mean_length": 3158.328125,
"completions/mean_terminated_length": 106.25000762939453,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.3173985183238983,
"epoch": 0.000264,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.0015116184949874878,
"kl": 0.17226483300328255,
"learning_rate": 0.0001,
"loss": 0.0,
"num_tokens": 12009445.0,
"reward": 0.37367647886276245,
"reward_std": 0.4661514163017273,
"rewards/reward_func/mean": 0.37367647886276245,
"rewards/reward_func/std": 0.4661514163017273,
"sampling/importance_sampling_ratio/max": 1.266974687576294,
"sampling/importance_sampling_ratio/mean": 0.8097731471061707,
"sampling/importance_sampling_ratio/min": 5.553430160176731e-09,
"sampling/sampling_logp_difference/max": 13.181495666503906,
"sampling/sampling_logp_difference/mean": 0.015717996284365654,
"step": 33,
"step_time": 252.64742845576257
},
{
"clip_ratio/high_max": 0.2083333395421505,
"clip_ratio/high_mean": 0.026041667442768812,
"clip_ratio/low_mean": 0.015625000465661287,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04166666744276881,
"entropy": 0.3447694480419159,
"epoch": 0.000272,
"grad_norm": 0.0012050194200128317,
"kl": 0.09908118983730674,
"learning_rate": 0.0001,
"loss": -0.0,
"step": 34,
"step_time": 52.40133061888628
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10039.0,
"completions/max_terminated_length": 10039.0,
"completions/mean_length": 676.140625,
"completions/mean_terminated_length": 676.140625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.36223024874925613,
"epoch": 0.00028,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.006729471497237682,
"kl": 0.2994176782667637,
"learning_rate": 0.0001,
"loss": 0.0002,
"num_tokens": 12784223.0,
"reward": 0.35155534744262695,
"reward_std": 0.46008607745170593,
"rewards/reward_func/mean": 0.35155534744262695,
"rewards/reward_func/std": 0.46008607745170593,
"sampling/importance_sampling_ratio/max": 1.6923160552978516,
"sampling/importance_sampling_ratio/mean": 0.9230321645736694,
"sampling/importance_sampling_ratio/min": 4.0193415544627353e-13,
"sampling/sampling_logp_difference/max": 2.2721805572509766,
"sampling/sampling_logp_difference/mean": 0.019083332270383835,
"step": 35,
"step_time": 156.88474073121324
},
{
"clip_ratio/high_max": 0.21071429178118706,
"clip_ratio/high_mean": 0.04456845438107848,
"clip_ratio/low_mean": 0.042559525929391384,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.08712797984480858,
"entropy": 0.3039735332131386,
"epoch": 0.000288,
"grad_norm": 0.002629757858812809,
"kl": 1.421083964407444,
"learning_rate": 0.0001,
"loss": -0.0001,
"step": 36,
"step_time": 47.79177230759524
},
{
"clip_ratio/high_max": 0.00212460938928416,
"clip_ratio/high_mean": 0.0004709042623289861,
"clip_ratio/low_mean": 0.00013680529809789732,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0006077095604268834,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14803.0,
"completions/mean_length": 1618.3046875,
"completions/mean_terminated_length": 1383.9287109375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.205389566719532,
"epoch": 0.000296,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.034038130193948746,
"kl": 0.1238506119698286,
"learning_rate": 0.0001,
"loss": 0.0496,
"num_tokens": 13562182.0,
"reward": 0.5307304859161377,
"reward_std": 0.4933399260044098,
"rewards/reward_func/mean": 0.5307304859161377,
"rewards/reward_func/std": 0.4933399558067322,
"sampling/importance_sampling_ratio/max": 2.9194023609161377,
"sampling/importance_sampling_ratio/mean": 0.8693970441818237,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 6.139076232910156,
"sampling/sampling_logp_difference/mean": 0.00930335745215416,
"step": 37,
"step_time": 304.38122520502657
},
{
"clip_ratio/high_max": 0.12984464410692453,
"clip_ratio/high_mean": 0.017246030358364806,
"clip_ratio/low_mean": 0.04759028274565935,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.06483631394803524,
"entropy": 0.19659215956926346,
"epoch": 0.000304,
"grad_norm": 0.014286670833826065,
"kl": 0.09789336752146482,
"learning_rate": 0.0001,
"loss": -0.0889,
"step": 38,
"step_time": 108.92287370702252
},
{
"clip_ratio/high_max": 0.002319882420124486,
"clip_ratio/high_mean": 0.0004717662050097715,
"clip_ratio/low_mean": 4.9786372983362526e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0005215525743551552,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14867.0,
"completions/mean_length": 2032.5,
"completions/mean_terminated_length": 1569.54833984375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.19366027787327766,
"epoch": 0.000312,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.02529173344373703,
"kl": 0.12496417853981256,
"learning_rate": 0.0001,
"loss": -0.1314,
"num_tokens": 14363022.0,
"reward": 0.6012634038925171,
"reward_std": 0.46494749188423157,
"rewards/reward_func/mean": 0.6012634038925171,
"rewards/reward_func/std": 0.46494749188423157,
"sampling/importance_sampling_ratio/max": 2.67891263961792,
"sampling/importance_sampling_ratio/mean": 0.8825238943099976,
"sampling/importance_sampling_ratio/min": 9.072877865667905e-12,
"sampling/sampling_logp_difference/max": 2.937361478805542,
"sampling/sampling_logp_difference/mean": 0.007425494492053986,
"step": 39,
"step_time": 242.51036442094482
},
{
"clip_ratio/high_max": 0.004297120030969381,
"clip_ratio/high_mean": 0.0005371400038711727,
"clip_ratio/low_mean": 0.019620355626102537,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.020157495309831575,
"entropy": 0.2599617578089237,
"epoch": 0.00032,
"grad_norm": 0.003947969060391188,
"kl": 0.17537523806095123,
"learning_rate": 0.0001,
"loss": -0.0074,
"step": 40,
"step_time": 64.70451782317832
},
{
"clip_ratio/high_max": 0.05252361833117902,
"clip_ratio/high_mean": 0.00663674037787132,
"clip_ratio/low_mean": 0.0005198562575969845,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007156596751883626,
"completions/clipped_ratio": 0.125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12109.0,
"completions/mean_length": 2499.59375,
"completions/mean_terminated_length": 516.107177734375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.21654297411441803,
"epoch": 0.000328,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.0020191774237900972,
"kl": 0.050669580698013306,
"learning_rate": 0.0001,
"loss": -0.0037,
"num_tokens": 15148602.0,
"reward": 0.41135668754577637,
"reward_std": 0.4713096022605896,
"rewards/reward_func/mean": 0.41135668754577637,
"rewards/reward_func/std": 0.4713096022605896,
"sampling/importance_sampling_ratio/max": 1.3856815099716187,
"sampling/importance_sampling_ratio/mean": 0.8404459953308105,
"sampling/importance_sampling_ratio/min": 8.073855711603073e-14,
"sampling/sampling_logp_difference/max": 3.5181496143341064,
"sampling/sampling_logp_difference/mean": 0.010210744105279446,
"step": 41,
"step_time": 249.73666059714742
},
{
"clip_ratio/high_max": 0.051496061148100125,
"clip_ratio/high_mean": 0.00651522628334078,
"clip_ratio/low_mean": 0.010954441386274993,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.017469667189288884,
"entropy": 0.22261176258325577,
"epoch": 0.000336,
"grad_norm": 0.01327864546328783,
"kl": 0.08568831626325846,
"learning_rate": 0.0001,
"loss": -0.0071,
"step": 42,
"step_time": 65.51700961985625
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2104.0,
"completions/max_terminated_length": 2104.0,
"completions/mean_length": 337.921875,
"completions/mean_terminated_length": 337.921875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.18646935559809208,
"epoch": 0.000344,
"frac_reward_zero_std": 0.8125,
"grad_norm": 0.009093403816223145,
"kl": 0.04625028697773814,
"learning_rate": 0.0001,
"loss": -0.0001,
"num_tokens": 15697984.0,
"reward": 0.5859714150428772,
"reward_std": 0.48348718881607056,
"rewards/reward_func/mean": 0.5859714150428772,
"rewards/reward_func/std": 0.48348718881607056,
"sampling/importance_sampling_ratio/max": 2.9582595825195312,
"sampling/importance_sampling_ratio/mean": 0.9583175182342529,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.8077512979507446,
"sampling/sampling_logp_difference/mean": 0.01057741791009903,
"step": 43,
"step_time": 83.54059210349806
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.02708333358168602,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02708333358168602,
"entropy": 0.1239668894559145,
"epoch": 0.000352,
"grad_norm": 0.002130000153556466,
"kl": 0.7455503353849053,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 44,
"step_time": 39.00458105024882
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 7439.0,
"completions/max_terminated_length": 7439.0,
"completions/mean_length": 794.125,
"completions/mean_terminated_length": 794.125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.17747851833701134,
"epoch": 0.00036,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.009445400908589363,
"kl": 0.19956867769360542,
"learning_rate": 0.0001,
"loss": 0.0001,
"num_tokens": 16409168.0,
"reward": 0.3401130437850952,
"reward_std": 0.44747307896614075,
"rewards/reward_func/mean": 0.3401130437850952,
"rewards/reward_func/std": 0.44747307896614075,
"sampling/importance_sampling_ratio/max": 1.213844895362854,
"sampling/importance_sampling_ratio/mean": 0.8736224174499512,
"sampling/importance_sampling_ratio/min": 1.1355460628692526e-05,
"sampling/sampling_logp_difference/max": 1.4877896308898926,
"sampling/sampling_logp_difference/mean": 0.009566227905452251,
"step": 45,
"step_time": 112.80105081154034
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.005241899751126766,
"clip_ratio/low_mean": 0.05211690114811063,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.05735880043357611,
"entropy": 0.11628733202815056,
"epoch": 0.000368,
"grad_norm": 0.0010464430088177323,
"kl": 0.5155483353883028,
"learning_rate": 0.0001,
"loss": -0.0001,
"step": 46,
"step_time": 33.51861782022752
},
{
"clip_ratio/high_max": 0.0008075302612269297,
"clip_ratio/high_mean": 0.00010094128265336622,
"clip_ratio/low_mean": 0.00017487616059952416,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0002758174450718798,
"completions/clipped_ratio": 0.1171875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14041.0,
"completions/mean_length": 2758.3125,
"completions/mean_terminated_length": 949.5928955078125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.20697598904371262,
"epoch": 0.000376,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.008048626594245434,
"kl": 0.31821640580892563,
"learning_rate": 0.0001,
"loss": 0.0341,
"num_tokens": 17250760.0,
"reward": 0.4497794508934021,
"reward_std": 0.47367680072784424,
"rewards/reward_func/mean": 0.4497794508934021,
"rewards/reward_func/std": 0.4736768305301666,
"sampling/importance_sampling_ratio/max": 1.4071918725967407,
"sampling/importance_sampling_ratio/mean": 0.7693284749984741,
"sampling/importance_sampling_ratio/min": 6.366646576258517e-14,
"sampling/sampling_logp_difference/max": 2.0170488357543945,
"sampling/sampling_logp_difference/mean": 0.014924651943147182,
"step": 47,
"step_time": 265.7373479530215
},
{
"clip_ratio/high_max": 0.18790940550388768,
"clip_ratio/high_mean": 0.04044250077276956,
"clip_ratio/low_mean": 0.023039879743009806,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.06348238191276323,
"entropy": 0.20131932944059372,
"epoch": 0.000384,
"grad_norm": 0.002510676858946681,
"kl": 0.3020637482404709,
"learning_rate": 0.0001,
"loss": 0.0018,
"step": 48,
"step_time": 63.53406945313327
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 2.0148290786892176e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 2.0148290786892176e-05,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1549.0,
"completions/max_terminated_length": 1549.0,
"completions/mean_length": 194.90625,
"completions/mean_terminated_length": 194.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.16713403165340424,
"epoch": 0.000392,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.03661969304084778,
"kl": 0.27568595856428146,
"learning_rate": 0.0001,
"loss": 0.1756,
"num_tokens": 17609516.0,
"reward": 0.3761705458164215,
"reward_std": 0.4686991274356842,
"rewards/reward_func/mean": 0.3761705458164215,
"rewards/reward_func/std": 0.4686991274356842,
"sampling/importance_sampling_ratio/max": 2.2934863567352295,
"sampling/importance_sampling_ratio/mean": 1.0030004978179932,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.8843307495117188,
"sampling/sampling_logp_difference/mean": 0.011824723333120346,
"step": 49,
"step_time": 33.989881575806066
},
{
"clip_ratio/high_max": 0.18333333730697632,
"clip_ratio/high_mean": 0.03967476915568113,
"clip_ratio/low_mean": 0.06265822611749172,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.10233299620449543,
"entropy": 0.11747358739376068,
"epoch": 0.0004,
"grad_norm": 520.3370971679688,
"kl": 138366.12171524763,
"learning_rate": 0.0001,
"loss": 0.9764,
"step": 50,
"step_time": 11.585765323834494
},
{
"clip_ratio/high_max": 0.001699419430224225,
"clip_ratio/high_mean": 0.00021242742877802812,
"clip_ratio/low_mean": 8.518956747138873e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00029761699261143804,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14391.0,
"completions/mean_length": 1964.4296875,
"completions/mean_terminated_length": 1499.2822265625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.19312381371855736,
"epoch": 0.000408,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.004857824184000492,
"kl": 0.11069730296730995,
"learning_rate": 0.0001,
"loss": -0.0,
"num_tokens": 18231323.0,
"reward": 0.5406150221824646,
"reward_std": 0.474507600069046,
"rewards/reward_func/mean": 0.5406150221824646,
"rewards/reward_func/std": 0.474507600069046,
"sampling/importance_sampling_ratio/max": 1.754118800163269,
"sampling/importance_sampling_ratio/mean": 0.8371249437332153,
"sampling/importance_sampling_ratio/min": 2.1961432238731815e-12,
"sampling/sampling_logp_difference/max": 1.711458444595337,
"sampling/sampling_logp_difference/mean": 0.011138837784528732,
"step": 51,
"step_time": 234.7052824080456
},
{
"clip_ratio/high_max": 0.052048374724108726,
"clip_ratio/high_mean": 0.006506046840513591,
"clip_ratio/low_mean": 0.0005927347665419802,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007098781847162172,
"entropy": 0.18790747597813606,
"epoch": 0.000416,
"grad_norm": 0.02879762463271618,
"kl": 0.14645008742809296,
"learning_rate": 0.0001,
"loss": -0.0793,
"step": 52,
"step_time": 67.58970385813154
},
{
"clip_ratio/high_max": 0.0011903044069185853,
"clip_ratio/high_mean": 0.00014878805086482316,
"clip_ratio/low_mean": 0.00016782619059085846,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0003166142414556816,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14571.0,
"completions/mean_length": 1242.5234375,
"completions/mean_terminated_length": 1002.1826171875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.2562270388007164,
"epoch": 0.000424,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.003293583169579506,
"kl": 0.14673679322004318,
"learning_rate": 0.0001,
"loss": -0.0,
"num_tokens": 18904638.0,
"reward": 0.515934944152832,
"reward_std": 0.4869852364063263,
"rewards/reward_func/mean": 0.515934944152832,
"rewards/reward_func/std": 0.4869852364063263,
"sampling/importance_sampling_ratio/max": 2.125849485397339,
"sampling/importance_sampling_ratio/mean": 0.8796348571777344,
"sampling/importance_sampling_ratio/min": 1.6146490811053127e-09,
"sampling/sampling_logp_difference/max": 2.236321210861206,
"sampling/sampling_logp_difference/mean": 0.00907333567738533,
"step": 53,
"step_time": 403.6396517488174
},
{
"clip_ratio/high_max": 0.004773067426867783,
"clip_ratio/high_mean": 0.0005966334283584729,
"clip_ratio/low_mean": 9.633062768443779e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0006929640658199787,
"entropy": 0.16939815878868103,
"epoch": 0.000432,
"grad_norm": 0.005178892519325018,
"kl": 0.12158099561929703,
"learning_rate": 0.0001,
"loss": -0.0092,
"step": 54,
"step_time": 155.7508629639633
},
{
"clip_ratio/high_max": 0.00413007679162547,
"clip_ratio/high_mean": 0.0005162595989531837,
"clip_ratio/low_mean": 0.00044288246863288805,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.000959142082137987,
"completions/clipped_ratio": 0.1640625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15599.0,
"completions/mean_length": 4115.78125,
"completions/mean_terminated_length": 1708.0,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.14124679006636143,
"epoch": 0.00044,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.0008667530491948128,
"kl": 0.15955708548426628,
"learning_rate": 0.0001,
"loss": 0.0005,
"num_tokens": 19689170.0,
"reward": 0.4943884015083313,
"reward_std": 0.4778369963169098,
"rewards/reward_func/mean": 0.4943884015083313,
"rewards/reward_func/std": 0.4778369963169098,
"sampling/importance_sampling_ratio/max": 1.5920473337173462,
"sampling/importance_sampling_ratio/mean": 0.6500886082649231,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.026095390319824,
"sampling/sampling_logp_difference/mean": 0.006507984362542629,
"step": 55,
"step_time": 295.36478371801786
},
{
"clip_ratio/high_max": 0.006438895943574607,
"clip_ratio/high_mean": 0.0010595864005153999,
"clip_ratio/low_mean": 0.0001230314956046641,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0011826178670162335,
"entropy": 0.16170401498675346,
"epoch": 0.000448,
"grad_norm": 0.0010976437479257584,
"kl": 0.14007344283163548,
"learning_rate": 0.0001,
"loss": -0.0014,
"step": 56,
"step_time": 87.99333154270425
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0005311340792104602,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0005311340792104602,
"completions/clipped_ratio": 0.1171875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 10352.0,
"completions/mean_length": 2540.15625,
"completions/mean_terminated_length": 702.4778442382812,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.09304443560540676,
"epoch": 0.000456,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.0008479373645968735,
"kl": 0.06781758088618517,
"learning_rate": 0.0001,
"loss": 0.0,
"num_tokens": 20376822.0,
"reward": 0.5963060855865479,
"reward_std": 0.4900885820388794,
"rewards/reward_func/mean": 0.5963060855865479,
"rewards/reward_func/std": 0.4900885820388794,
"sampling/importance_sampling_ratio/max": 1.911750316619873,
"sampling/importance_sampling_ratio/mean": 0.787479043006897,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.4092633724212646,
"sampling/sampling_logp_difference/mean": 0.004730356857180595,
"step": 57,
"step_time": 275.88833857746795
},
{
"clip_ratio/high_max": 0.0007761355664115399,
"clip_ratio/high_mean": 9.701694580144249e-05,
"clip_ratio/low_mean": 0.0005300462580635212,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0006270632075029425,
"entropy": 0.07158766873180866,
"epoch": 0.000464,
"grad_norm": 0.0023808805271983147,
"kl": 0.058913652785122395,
"learning_rate": 0.0001,
"loss": -0.0026,
"step": 58,
"step_time": 78.15128924208693
},
{
"clip_ratio/high_max": 0.0045549869537353516,
"clip_ratio/high_mean": 0.0005933154025115073,
"clip_ratio/low_mean": 0.0014756222371943295,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0020689376979134977,
"completions/clipped_ratio": 0.1015625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14099.0,
"completions/mean_length": 2205.8359375,
"completions/mean_terminated_length": 603.0869140625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.12633545510470867,
"epoch": 0.000472,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.0027347116265445948,
"kl": 0.06752120889723301,
"learning_rate": 0.0001,
"loss": -0.0058,
"num_tokens": 20882249.0,
"reward": 0.37335968017578125,
"reward_std": 0.46674516797065735,
"rewards/reward_func/mean": 0.37335968017578125,
"rewards/reward_func/std": 0.46674516797065735,
"sampling/importance_sampling_ratio/max": 1.1944468021392822,
"sampling/importance_sampling_ratio/mean": 0.8154112100601196,
"sampling/importance_sampling_ratio/min": 3.1055763429627126e-12,
"sampling/sampling_logp_difference/max": 2.516141891479492,
"sampling/sampling_logp_difference/mean": 0.005523150786757469,
"step": 59,
"step_time": 243.8482750041876
},
{
"clip_ratio/high_max": 0.0070169707760214806,
"clip_ratio/high_mean": 0.0008790283463895321,
"clip_ratio/low_mean": 0.011722586234100163,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012601614958839491,
"entropy": 0.19912216998636723,
"epoch": 0.00048,
"grad_norm": 0.004964805673807859,
"kl": 0.3173718089237809,
"learning_rate": 0.0001,
"loss": -0.0111,
"step": 60,
"step_time": 67.48408747394569
},
{
"clip_ratio/high_max": 0.003293595160357654,
"clip_ratio/high_mean": 0.00041169939504470676,
"clip_ratio/low_mean": 0.001325315679423511,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001737015089020133,
"completions/clipped_ratio": 0.140625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15692.0,
"completions/mean_length": 3332.625,
"completions/mean_terminated_length": 1196.9454345703125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.20970237255096436,
"epoch": 0.000488,
"frac_reward_zero_std": 0.1875,
"grad_norm": 0.0021050143986940384,
"kl": 0.22093774378299713,
"learning_rate": 0.0001,
"loss": 0.0007,
"num_tokens": 21614089.0,
"reward": 0.39702850580215454,
"reward_std": 0.48266974091529846,
"rewards/reward_func/mean": 0.39702850580215454,
"rewards/reward_func/std": 0.48266977071762085,
"sampling/importance_sampling_ratio/max": 1.2085174322128296,
"sampling/importance_sampling_ratio/mean": 0.7436270713806152,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.35117506980896,
"sampling/sampling_logp_difference/mean": 0.009538266807794571,
"step": 61,
"step_time": 356.0667571427766
},
{
"clip_ratio/high_max": 0.0040483163320459425,
"clip_ratio/high_mean": 0.0008040911634452641,
"clip_ratio/low_mean": 0.000619270489551127,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0014233616821002215,
"entropy": 0.2903680093586445,
"epoch": 0.000496,
"grad_norm": 0.0013163138646632433,
"kl": 0.22006989642977715,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 62,
"step_time": 120.53421806404367
},
{
"clip_ratio/high_max": 0.04629576357547194,
"clip_ratio/high_mean": 0.006143865539343096,
"clip_ratio/low_mean": 0.00034697772935032845,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006490843268693425,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 5936.0,
"completions/mean_length": 3316.71875,
"completions/mean_terminated_length": 301.19232177734375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.11050131916999817,
"epoch": 0.000504,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.011926773004233837,
"kl": 0.7313324622809887,
"learning_rate": 0.0001,
"loss": 0.0115,
"num_tokens": 22353821.0,
"reward": 0.326221227645874,
"reward_std": 0.46681538224220276,
"rewards/reward_func/mean": 0.326221227645874,
"rewards/reward_func/std": 0.46681535243988037,
"sampling/importance_sampling_ratio/max": 1.7655757665634155,
"sampling/importance_sampling_ratio/mean": 0.7426654100418091,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.8568098545074463,
"sampling/sampling_logp_difference/mean": 0.006891004741191864,
"step": 63,
"step_time": 302.78563037491404
},
{
"clip_ratio/high_max": 0.04654776549432427,
"clip_ratio/high_mean": 0.0064325097628170624,
"clip_ratio/low_mean": 0.0002609335570014082,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.006693443312542513,
"entropy": 0.2221522331237793,
"epoch": 0.000512,
"grad_norm": 0.00616535684093833,
"kl": 0.31134266406297684,
"learning_rate": 0.0001,
"loss": -0.0099,
"step": 64,
"step_time": 98.32436606986448
},
{
"clip_ratio/high_max": 0.002486778888851404,
"clip_ratio/high_mean": 0.0003108473611064255,
"clip_ratio/low_mean": 0.004113847695407458,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004424695056513883,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12115.0,
"completions/mean_length": 1701.78125,
"completions/mean_terminated_length": 591.3613891601562,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.21395106986165047,
"epoch": 0.00052,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.0024383016861975193,
"kl": 0.6399696841835976,
"learning_rate": 0.0001,
"loss": -0.0004,
"num_tokens": 23079545.0,
"reward": 0.5035587549209595,
"reward_std": 0.49234020709991455,
"rewards/reward_func/mean": 0.5035587549209595,
"rewards/reward_func/std": 0.49234020709991455,
"sampling/importance_sampling_ratio/max": 2.8838155269622803,
"sampling/importance_sampling_ratio/mean": 0.9122731685638428,
"sampling/importance_sampling_ratio/min": 1.5084187154554285e-12,
"sampling/sampling_logp_difference/max": 1.4747650623321533,
"sampling/sampling_logp_difference/mean": 0.008541534654796124,
"step": 65,
"step_time": 420.5027240368072
},
{
"clip_ratio/high_max": 0.006610034382902086,
"clip_ratio/high_mean": 0.0008262542978627607,
"clip_ratio/low_mean": 0.025520833674818277,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.026347088234615512,
"entropy": 0.21546945348381996,
"epoch": 0.000528,
"grad_norm": 0.002588092116639018,
"kl": 0.3157913535833359,
"learning_rate": 0.0001,
"loss": -0.0018,
"step": 66,
"step_time": 150.25083671603352
},
{
"clip_ratio/high_max": 0.00305051077157259,
"clip_ratio/high_mean": 0.0004194560169707984,
"clip_ratio/low_mean": 0.0010861777554964647,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0015056337579153478,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 11704.0,
"completions/mean_length": 1241.578125,
"completions/mean_terminated_length": 753.1128540039062,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.34802427887916565,
"epoch": 0.000536,
"frac_reward_zero_std": 0.1875,
"grad_norm": 0.010660664178431034,
"kl": 0.3574690632522106,
"learning_rate": 0.0001,
"loss": 0.0188,
"num_tokens": 23671939.0,
"reward": 0.30863311886787415,
"reward_std": 0.4487622380256653,
"rewards/reward_func/mean": 0.30863311886787415,
"rewards/reward_func/std": 0.4487622380256653,
"sampling/importance_sampling_ratio/max": 1.2251367568969727,
"sampling/importance_sampling_ratio/mean": 0.8774986267089844,
"sampling/importance_sampling_ratio/min": 5.643987203594533e-15,
"sampling/sampling_logp_difference/max": 2.2548747062683105,
"sampling/sampling_logp_difference/mean": 0.01900642365217209,
"step": 67,
"step_time": 272.54457034613006
},
{
"clip_ratio/high_max": 0.04360994976013899,
"clip_ratio/high_mean": 0.0056244394509121776,
"clip_ratio/low_mean": 0.019384152255952358,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.025008591823279858,
"entropy": 0.30202219262719154,
"epoch": 0.000544,
"grad_norm": 0.0031096329912543297,
"kl": 0.19528233632445335,
"learning_rate": 0.0001,
"loss": -0.0002,
"step": 68,
"step_time": 87.20590779092163
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 7.0,
"completions/mean_length": 1026.96875,
"completions/mean_terminated_length": 3.1666667461395264,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.360334113240242,
"epoch": 0.000552,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.004637387115508318,
"kl": 0.21597419865429401,
"learning_rate": 0.0001,
"loss": 0.0001,
"num_tokens": 24378991.0,
"reward": 0.2120947688817978,
"reward_std": 0.3439648747444153,
"rewards/reward_func/mean": 0.2120947688817978,
"rewards/reward_func/std": 0.3439648747444153,
"sampling/importance_sampling_ratio/max": 1.2173486948013306,
"sampling/importance_sampling_ratio/mean": 0.9408060908317566,
"sampling/importance_sampling_ratio/min": 6.941108278424313e-12,
"sampling/sampling_logp_difference/max": 2.992739677429199,
"sampling/sampling_logp_difference/mean": 0.01907212659716606,
"step": 69,
"step_time": 413.641770795919
},
{
"clip_ratio/high_max": 0.22500000521540642,
"clip_ratio/high_mean": 0.033333334140479565,
"clip_ratio/low_mean": 0.01145833358168602,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04479166818782687,
"entropy": 0.43712426722049713,
"epoch": 0.00056,
"grad_norm": 0.0043171476572752,
"kl": 0.2573888264596462,
"learning_rate": 0.0001,
"loss": -0.0001,
"step": 70,
"step_time": 149.37348693376407
},
{
"clip_ratio/high_max": 0.011275041149929166,
"clip_ratio/high_mean": 0.0016421006293967366,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0016421006293967366,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15718.0,
"completions/mean_length": 1101.6171875,
"completions/mean_terminated_length": 981.283447265625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.24557911232113838,
"epoch": 0.000568,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.0067433081567287445,
"kl": 0.1323122438043356,
"learning_rate": 0.0001,
"loss": -0.0056,
"num_tokens": 24892926.0,
"reward": 0.5961877107620239,
"reward_std": 0.48414939641952515,
"rewards/reward_func/mean": 0.5961877107620239,
"rewards/reward_func/std": 0.48414939641952515,
"sampling/importance_sampling_ratio/max": 2.302067279815674,
"sampling/importance_sampling_ratio/mean": 0.8856508731842041,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.4530627727508545,
"sampling/sampling_logp_difference/mean": 0.014587011188268661,
"step": 71,
"step_time": 211.82522862195037
},
{
"clip_ratio/high_max": 0.05322292904020287,
"clip_ratio/high_mean": 0.007361717482126551,
"clip_ratio/low_mean": 0.0005052025571785634,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.007866920397646027,
"entropy": 0.24957521632313728,
"epoch": 0.000576,
"grad_norm": 0.03510262817144394,
"kl": 0.10960768908262253,
"learning_rate": 0.0001,
"loss": 0.0024,
"step": 72,
"step_time": 55.394755602115765
},
{
"clip_ratio/high_max": 0.04315747210057452,
"clip_ratio/high_mean": 0.005394684012571815,
"clip_ratio/low_mean": 0.01105505934174289,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01644974334340077,
"completions/clipped_ratio": 0.0390625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14039.0,
"completions/mean_length": 1335.3359375,
"completions/mean_terminated_length": 723.6016235351562,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.24440882354974747,
"epoch": 0.000584,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.003040608251467347,
"kl": 0.3741700351238251,
"learning_rate": 0.0001,
"loss": 0.0027,
"num_tokens": 25583089.0,
"reward": 0.5570250153541565,
"reward_std": 0.4779793322086334,
"rewards/reward_func/mean": 0.5570250153541565,
"rewards/reward_func/std": 0.4779793322086334,
"sampling/importance_sampling_ratio/max": 1.250884771347046,
"sampling/importance_sampling_ratio/mean": 0.888532280921936,
"sampling/importance_sampling_ratio/min": 2.9074090690528465e-08,
"sampling/sampling_logp_difference/max": 1.549929141998291,
"sampling/sampling_logp_difference/mean": 0.010362871922552586,
"step": 73,
"step_time": 457.1430780822411
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.01055803267081501,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01055803267081501,
"entropy": 0.20154350250959396,
"epoch": 0.000592,
"grad_norm": 0.002142983488738537,
"kl": 0.14010655879974365,
"learning_rate": 0.0001,
"loss": 0.0032,
"step": 74,
"step_time": 180.0332786256913
},
{
"clip_ratio/high_max": 0.003929472557501867,
"clip_ratio/high_mean": 0.000681739784340607,
"clip_ratio/low_mean": 0.000918249599635601,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0015999893948901445,
"completions/clipped_ratio": 0.171875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13287.0,
"completions/mean_length": 3697.3046875,
"completions/mean_terminated_length": 1064.217041015625,
"completions/min_length": 3.0,
"completions/min_terminated_length": 3.0,
"entropy": 0.22590620815753937,
"epoch": 0.0006,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.003586079925298691,
"kl": 0.14618558436632156,
"learning_rate": 0.0001,
"loss": -0.0088,
"num_tokens": 26577264.0,
"reward": 0.3558464050292969,
"reward_std": 0.4764367341995239,
"rewards/reward_func/mean": 0.3558464050292969,
"rewards/reward_func/std": 0.4764367640018463,
"sampling/importance_sampling_ratio/max": 1.2136788368225098,
"sampling/importance_sampling_ratio/mean": 0.7275122404098511,
"sampling/importance_sampling_ratio/min": 1.4362665263063827e-19,
"sampling/sampling_logp_difference/max": 1.6977732181549072,
"sampling/sampling_logp_difference/mean": 0.013559934683144093,
"step": 75,
"step_time": 279.72021683468483
},
{
"clip_ratio/high_max": 0.03983482558396645,
"clip_ratio/high_mean": 0.005230471204413334,
"clip_ratio/low_mean": 0.004741664102766663,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009972134721465409,
"entropy": 0.20037926360964775,
"epoch": 0.000608,
"grad_norm": 0.0038128597661852837,
"kl": 0.12694548070430756,
"learning_rate": 0.0001,
"loss": -0.0052,
"step": 76,
"step_time": 75.0094793732278
},
{
"clip_ratio/high_max": 0.0030184224306140095,
"clip_ratio/high_mean": 0.0003773028038267512,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0003773028038267512,
"completions/clipped_ratio": 0.140625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 13004.0,
"completions/mean_length": 2918.640625,
"completions/mean_terminated_length": 715.2181396484375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.2610139548778534,
"epoch": 0.000616,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.0017114380607381463,
"kl": 0.35155298560857773,
"learning_rate": 0.0001,
"loss": -0.0019,
"num_tokens": 27487122.0,
"reward": 0.37231287360191345,
"reward_std": 0.451972633600235,
"rewards/reward_func/mean": 0.37231287360191345,
"rewards/reward_func/std": 0.451972633600235,
"sampling/importance_sampling_ratio/max": 1.72040593624115,
"sampling/importance_sampling_ratio/mean": 0.8099797964096069,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.51240873336792,
"sampling/sampling_logp_difference/mean": 0.01623906008899212,
"step": 77,
"step_time": 270.05139491008595
},
{
"clip_ratio/high_max": 0.04338416282553226,
"clip_ratio/high_mean": 0.00988730626704637,
"clip_ratio/low_mean": 0.014765097017516382,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.024652403328218497,
"entropy": 0.31988539546728134,
"epoch": 0.000624,
"grad_norm": 0.01336484681814909,
"kl": 0.2897513546049595,
"learning_rate": 0.0001,
"loss": -0.0337,
"step": 78,
"step_time": 79.79506105207838
},
{
"clip_ratio/high_max": 0.05000000074505806,
"clip_ratio/high_mean": 0.0062500000931322575,
"clip_ratio/low_mean": 0.0052083334885537624,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01145833358168602,
"completions/clipped_ratio": 0.125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 1118.0,
"completions/mean_length": 2130.7109375,
"completions/mean_terminated_length": 94.52678680419922,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.2616175599396229,
"epoch": 0.000632,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.003293546847999096,
"kl": 0.34126188047230244,
"learning_rate": 0.0001,
"loss": -0.0058,
"num_tokens": 28247557.0,
"reward": 0.42832934856414795,
"reward_std": 0.45818737149238586,
"rewards/reward_func/mean": 0.42832934856414795,
"rewards/reward_func/std": 0.45818737149238586,
"sampling/importance_sampling_ratio/max": 2.1360301971435547,
"sampling/importance_sampling_ratio/mean": 0.9051436185836792,
"sampling/importance_sampling_ratio/min": 2.526717501893927e-09,
"sampling/sampling_logp_difference/max": 2.0171802043914795,
"sampling/sampling_logp_difference/mean": 0.017018210142850876,
"step": 79,
"step_time": 285.03405929682776
},
{
"clip_ratio/high_max": 0.22500000521540642,
"clip_ratio/high_mean": 0.03081387374550104,
"clip_ratio/low_mean": 0.01876397612886649,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04957784991711378,
"entropy": 0.2775324620306492,
"epoch": 0.00064,
"grad_norm": 0.003191626165062189,
"kl": 0.21329555287957191,
"learning_rate": 0.0001,
"loss": 0.0083,
"step": 80,
"step_time": 82.6638121791184
},
{
"clip_ratio/high_max": 0.0008650519303046167,
"clip_ratio/high_mean": 0.00010813149128807709,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00010813149128807709,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 1379.0,
"completions/mean_length": 1149.046875,
"completions/mean_terminated_length": 133.3833465576172,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.17147246748209,
"epoch": 0.000648,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.0028895766008645296,
"kl": 0.09206334501504898,
"learning_rate": 0.0001,
"loss": 0.0,
"num_tokens": 28968019.0,
"reward": 0.5178102254867554,
"reward_std": 0.47222229838371277,
"rewards/reward_func/mean": 0.5178102254867554,
"rewards/reward_func/std": 0.47222229838371277,
"sampling/importance_sampling_ratio/max": 1.396835446357727,
"sampling/importance_sampling_ratio/mean": 0.9524275064468384,
"sampling/importance_sampling_ratio/min": 1.0122628737008199e-05,
"sampling/sampling_logp_difference/max": 1.3480243682861328,
"sampling/sampling_logp_difference/mean": 0.026343410834670067,
"step": 81,
"step_time": 286.13709013699554
},
{
"clip_ratio/high_max": 0.09256594924954697,
"clip_ratio/high_mean": 0.021243362592940684,
"clip_ratio/low_mean": 0.01618036488071084,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.037423727568238974,
"entropy": 0.19774584844708443,
"epoch": 0.000656,
"grad_norm": 0.003909197635948658,
"kl": 0.20639685168862343,
"learning_rate": 0.0001,
"loss": -0.0001,
"step": 82,
"step_time": 81.07379846903495
},
{
"clip_ratio/high_max": 0.0021464216988533735,
"clip_ratio/high_mean": 0.0002683027123566717,
"clip_ratio/low_mean": 0.005357950474717654,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0056262532161781564,
"completions/clipped_ratio": 0.0,
"completions/max_length": 3288.0,
"completions/max_terminated_length": 3288.0,
"completions/mean_length": 196.8671875,
"completions/mean_terminated_length": 196.8671875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.3457997739315033,
"epoch": 0.000664,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.022016355767846107,
"kl": 0.4088924489915371,
"learning_rate": 0.0001,
"loss": -0.007,
"num_tokens": 29420586.0,
"reward": 0.37082305550575256,
"reward_std": 0.46140459179878235,
"rewards/reward_func/mean": 0.37082305550575256,
"rewards/reward_func/std": 0.46140459179878235,
"sampling/importance_sampling_ratio/max": 1.4103598594665527,
"sampling/importance_sampling_ratio/mean": 0.9493687748908997,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.0998687744140625,
"sampling/sampling_logp_difference/mean": 0.023627880960702896,
"step": 83,
"step_time": 57.941960010211915
},
{
"clip_ratio/high_max": 0.1273603499867022,
"clip_ratio/high_mean": 0.016455542587209493,
"clip_ratio/low_mean": 0.06458333507180214,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.08103887923061848,
"entropy": 0.282495453953743,
"epoch": 0.000672,
"grad_norm": 0.021933559328317642,
"kl": 0.2802862487733364,
"learning_rate": 0.0001,
"loss": -0.0041,
"step": 84,
"step_time": 18.895374842220917
},
{
"clip_ratio/high_max": 0.0016916769818635657,
"clip_ratio/high_mean": 0.00021145962273294572,
"clip_ratio/low_mean": 0.0004629129107343033,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0006743725316482596,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 8302.0,
"completions/mean_length": 1417.296875,
"completions/mean_terminated_length": 419.5166931152344,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.19242028519511223,
"epoch": 0.00068,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.004599843639880419,
"kl": 0.10945684090256691,
"learning_rate": 0.0001,
"loss": -0.0015,
"num_tokens": 30223584.0,
"reward": 0.38827669620513916,
"reward_std": 0.4383874535560608,
"rewards/reward_func/mean": 0.38827669620513916,
"rewards/reward_func/std": 0.4383874237537384,
"sampling/importance_sampling_ratio/max": 1.2852191925048828,
"sampling/importance_sampling_ratio/mean": 0.8869220018386841,
"sampling/importance_sampling_ratio/min": 6.8003160436092e-08,
"sampling/sampling_logp_difference/max": 3.0428004264831543,
"sampling/sampling_logp_difference/mean": 0.011553528718650341,
"step": 85,
"step_time": 401.25821340084076
},
{
"clip_ratio/high_max": 0.08453038916923106,
"clip_ratio/high_mean": 0.01064000147744082,
"clip_ratio/low_mean": 0.041717116328072734,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.052357119115185924,
"entropy": 0.28386014327406883,
"epoch": 0.000688,
"grad_norm": 0.008475115522742271,
"kl": 0.24625534750521183,
"learning_rate": 0.0001,
"loss": 0.0125,
"step": 86,
"step_time": 140.54234700393863
},
{
"clip_ratio/high_max": 0.025748229207238182,
"clip_ratio/high_mean": 0.0034044762833218556,
"clip_ratio/low_mean": 0.0008440050805802457,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00424848121474497,
"completions/clipped_ratio": 0.0859375,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14449.0,
"completions/mean_length": 2038.8046875,
"completions/mean_terminated_length": 690.1111450195312,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.17366936802864075,
"epoch": 0.000696,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.004945417400449514,
"kl": 0.21483153477311134,
"learning_rate": 0.0001,
"loss": -0.0155,
"num_tokens": 30996463.0,
"reward": 0.569595456123352,
"reward_std": 0.4689222276210785,
"rewards/reward_func/mean": 0.569595456123352,
"rewards/reward_func/std": 0.4689222574234009,
"sampling/importance_sampling_ratio/max": 1.5022563934326172,
"sampling/importance_sampling_ratio/mean": 0.8090132474899292,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.836258888244629,
"sampling/sampling_logp_difference/mean": 0.012665043585002422,
"step": 87,
"step_time": 420.3839778539259
},
{
"clip_ratio/high_max": 0.07500000111758709,
"clip_ratio/high_mean": 0.009656705195084214,
"clip_ratio/low_mean": 0.021538077868171968,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03119478444568813,
"entropy": 0.14676123298704624,
"epoch": 0.000704,
"grad_norm": 0.0015365808503702283,
"kl": 0.2069963738322258,
"learning_rate": 0.0001,
"loss": 0.0006,
"step": 88,
"step_time": 159.39437931077555
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0052083334885537624,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0052083334885537624,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2818.0,
"completions/max_terminated_length": 2818.0,
"completions/mean_length": 249.2734375,
"completions/mean_terminated_length": 249.2734375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.19034305587410927,
"epoch": 0.000712,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.013274043798446655,
"kl": 0.10685652680695057,
"learning_rate": 0.0001,
"loss": 0.0003,
"num_tokens": 31367970.0,
"reward": 0.5855048894882202,
"reward_std": 0.4757256805896759,
"rewards/reward_func/mean": 0.5855048894882202,
"rewards/reward_func/std": 0.4757256805896759,
"sampling/importance_sampling_ratio/max": 2.164741277694702,
"sampling/importance_sampling_ratio/mean": 0.9093552827835083,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 0.9638514518737793,
"sampling/sampling_logp_difference/mean": 0.010461562313139439,
"step": 89,
"step_time": 45.6294292754028
},
{
"clip_ratio/high_max": 0.1666666716337204,
"clip_ratio/high_mean": 0.031250000931322575,
"clip_ratio/low_mean": 0.02095832316626911,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.05220832256600261,
"entropy": 0.16832438856363297,
"epoch": 0.00072,
"grad_norm": 0.013677907176315784,
"kl": 0.21795203164219856,
"learning_rate": 0.0001,
"loss": -0.0003,
"step": 90,
"step_time": 14.39668608084321
},
{
"clip_ratio/high_max": 0.0016864245990291238,
"clip_ratio/high_mean": 0.00021080307487864047,
"clip_ratio/low_mean": 9.959549061022699e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00031039856548886746,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 14607.0,
"completions/mean_length": 2265.78125,
"completions/mean_terminated_length": 1198.016845703125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.3109714537858963,
"epoch": 0.000728,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.0026221030857414007,
"kl": 0.29385947436094284,
"learning_rate": 0.0001,
"loss": -0.0016,
"num_tokens": 31978134.0,
"reward": 0.41071587800979614,
"reward_std": 0.48560255765914917,
"rewards/reward_func/mean": 0.41071587800979614,
"rewards/reward_func/std": 0.48560255765914917,
"sampling/importance_sampling_ratio/max": 2.1744186878204346,
"sampling/importance_sampling_ratio/mean": 0.7339906692504883,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.330114483833313,
"sampling/sampling_logp_difference/mean": 0.01747949793934822,
"step": 91,
"step_time": 224.5670603781473
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.00536864111199975,
"clip_ratio/low_mean": 0.01056993727979716,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01593857839179691,
"entropy": 0.3097205422818661,
"epoch": 0.000736,
"grad_norm": 0.0013873938005417585,
"kl": 0.21971550025045872,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 92,
"step_time": 41.30870744702406
},
{
"clip_ratio/high_max": 0.0018048831261694431,
"clip_ratio/high_mean": 0.00023677908757235855,
"clip_ratio/low_mean": 0.021204495129495626,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.021441274860990234,
"completions/clipped_ratio": 0.1796875,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 8798.0,
"completions/mean_length": 3474.4140625,
"completions/mean_terminated_length": 646.6000366210938,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.16217100247740746,
"epoch": 0.000744,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.00539315864443779,
"kl": 0.22048377990722656,
"learning_rate": 0.0001,
"loss": 0.0072,
"num_tokens": 33019531.0,
"reward": 0.2454037368297577,
"reward_std": 0.39766862988471985,
"rewards/reward_func/mean": 0.2454037368297577,
"rewards/reward_func/std": 0.3976685702800751,
"sampling/importance_sampling_ratio/max": 2.8269150257110596,
"sampling/importance_sampling_ratio/mean": 0.7975430488586426,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.909695863723755,
"sampling/sampling_logp_difference/mean": 0.007487665396183729,
"step": 93,
"step_time": 474.8481697048992
},
{
"clip_ratio/high_max": 0.04168192390443437,
"clip_ratio/high_mean": 0.005210240488054296,
"clip_ratio/low_mean": 0.0055653811286902055,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01077562254795339,
"entropy": 0.15040505304932594,
"epoch": 0.000752,
"grad_norm": 0.008554578758776188,
"kl": 0.21248403005301952,
"learning_rate": 0.0001,
"loss": 0.0569,
"step": 94,
"step_time": 175.67941991216503
},
{
"clip_ratio/high_max": 0.004689611494541168,
"clip_ratio/high_mean": 0.0007652845233678818,
"clip_ratio/low_mean": 0.00020595593377947807,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0009712404571473598,
"completions/clipped_ratio": 0.0703125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 15830.0,
"completions/mean_length": 1703.2734375,
"completions/mean_terminated_length": 592.9664306640625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.11936141178011894,
"epoch": 0.00076,
"frac_reward_zero_std": 0.5625,
"grad_norm": 0.007037501782178879,
"kl": 0.15314552932977676,
"learning_rate": 0.0001,
"loss": 0.0121,
"num_tokens": 33616110.0,
"reward": 0.5862494707107544,
"reward_std": 0.4870387017726898,
"rewards/reward_func/mean": 0.5862494707107544,
"rewards/reward_func/std": 0.4870387017726898,
"sampling/importance_sampling_ratio/max": 1.921995997428894,
"sampling/importance_sampling_ratio/mean": 0.8520303964614868,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.492267608642578,
"sampling/sampling_logp_difference/mean": 0.0060460735112428665,
"step": 95,
"step_time": 279.93328415811993
},
{
"clip_ratio/high_max": 0.1648085294291377,
"clip_ratio/high_mean": 0.027523898315848783,
"clip_ratio/low_mean": 0.00035762626794166863,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.02788152452558279,
"entropy": 0.11354503780603409,
"epoch": 0.000768,
"grad_norm": 0.007192930206656456,
"kl": 0.07495404127985239,
"learning_rate": 0.0001,
"loss": -0.0076,
"step": 96,
"step_time": 81.44801545701921
},
{
"clip_ratio/high_max": 0.0718371415277943,
"clip_ratio/high_mean": 0.009176566891255789,
"clip_ratio/low_mean": 0.0004482567746890709,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009624823651392944,
"completions/clipped_ratio": 0.1640625,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 12879.0,
"completions/mean_length": 3361.4921875,
"completions/mean_terminated_length": 805.6728515625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.09433631971478462,
"epoch": 0.000776,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.006469857878983021,
"kl": 0.06316580064594746,
"learning_rate": 0.0001,
"loss": -0.0171,
"num_tokens": 34458349.0,
"reward": 0.3976612091064453,
"reward_std": 0.47885704040527344,
"rewards/reward_func/mean": 0.3976612091064453,
"rewards/reward_func/std": 0.47885704040527344,
"sampling/importance_sampling_ratio/max": 1.6287328004837036,
"sampling/importance_sampling_ratio/mean": 0.7641345262527466,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 2.092411518096924,
"sampling/sampling_logp_difference/mean": 0.0073972526006400585,
"step": 97,
"step_time": 312.0833521957975
},
{
"clip_ratio/high_max": 0.0019895988516509533,
"clip_ratio/high_mean": 0.000492683844640851,
"clip_ratio/low_mean": 0.004362157778814435,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004854841623455286,
"entropy": 0.11002197489142418,
"epoch": 0.000784,
"grad_norm": 0.0009743753471411765,
"kl": 0.06945361755788326,
"learning_rate": 0.0001,
"loss": 0.0,
"step": 98,
"step_time": 100.79940819228068
},
{
"clip_ratio/high_max": 0.00242789089679718,
"clip_ratio/high_mean": 0.00032243724854197353,
"clip_ratio/low_mean": 3.6235200241208076e-05,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0003586724487831816,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 9493.0,
"completions/mean_length": 656.71875,
"completions/mean_terminated_length": 532.8818969726562,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 0.12588375620543957,
"epoch": 0.000792,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.024520058184862137,
"kl": 0.1035240120254457,
"learning_rate": 0.0001,
"loss": 0.0842,
"num_tokens": 34860025.0,
"reward": 0.5634695887565613,
"reward_std": 0.4744718670845032,
"rewards/reward_func/mean": 0.5634695887565613,
"rewards/reward_func/std": 0.4744718670845032,
"sampling/importance_sampling_ratio/max": 1.7210280895233154,
"sampling/importance_sampling_ratio/mean": 0.9508634805679321,
"sampling/importance_sampling_ratio/min": 0.0,
"sampling/sampling_logp_difference/max": 1.2289901971817017,
"sampling/sampling_logp_difference/mean": 0.006527569144964218,
"step": 99,
"step_time": 239.3775063320063
},
{
"clip_ratio/high_max": 0.08591295027872548,
"clip_ratio/high_mean": 0.011129089194582775,
"clip_ratio/low_mean": 0.005252894119621487,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.016381983092287555,
"entropy": 0.11453884467482567,
"epoch": 0.0008,
"grad_norm": 0.01832975633442402,
"kl": 0.032136627938598394,
"learning_rate": 0.0001,
"loss": -0.0691,
"step": 100,
"step_time": 78.3447534351144
}
],
"logging_steps": 1,
"max_steps": 10000,
"num_input_tokens_seen": 34860025,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}