smoke_test_cm_2_lp / trainer_state.json
Gege24's picture
Upload task output 1
9b41f81 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.00118,
"eval_steps": 500,
"global_step": 59,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 272.0,
"completions/max_terminated_length": 272.0,
"completions/mean_length": 234.90625,
"completions/mean_terminated_length": 234.258056640625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 3.2142516672611237,
"epoch": 2e-05,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.26979702711105347,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.005,
"num_tokens": 25117.0,
"reward": -0.10680253803730011,
"reward_std": 0.15763120353221893,
"rewards/rollout_reward_func/mean": -0.10680253803730011,
"rewards/rollout_reward_func/std": 0.4190797507762909,
"sampling/importance_sampling_ratio/max": 0.39367836713790894,
"sampling/importance_sampling_ratio/mean": 0.3039231598377228,
"sampling/importance_sampling_ratio/min": 1.1115786026014152e-12,
"sampling/sampling_logp_difference/max": 8.616442680358887,
"sampling/sampling_logp_difference/mean": 0.5023343563079834,
"step": 1,
"step_time": 6.355231066998385
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 3.2142516672611237,
"epoch": 4e-05,
"grad_norm": 0.27107080817222595,
"kl": 0.0,
"learning_rate": 2.8571428571428575e-07,
"loss": 0.005,
"step": 2,
"step_time": 2.6362348689945065
},
{
"clip_ratio/high_max": 0.02130681835114956,
"clip_ratio/high_mean": 0.01065340917557478,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01065340917557478,
"completions/clipped_ratio": 0.0,
"completions/max_length": 276.0,
"completions/max_terminated_length": 276.0,
"completions/mean_length": 234.0,
"completions/mean_terminated_length": 234.0,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"entropy": 3.0704929530620575,
"epoch": 6e-05,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.22045230865478516,
"kl": 0.0027004847361240536,
"learning_rate": 5.714285714285715e-07,
"loss": -0.004,
"num_tokens": 50037.0,
"reward": -0.19287078082561493,
"reward_std": 0.10101090371608734,
"rewards/rollout_reward_func/mean": -0.19287078082561493,
"rewards/rollout_reward_func/std": 0.222161203622818,
"sampling/importance_sampling_ratio/max": 0.41802993416786194,
"sampling/importance_sampling_ratio/mean": 0.3280646502971649,
"sampling/importance_sampling_ratio/min": 3.99030703268274e-17,
"sampling/sampling_logp_difference/max": 15.911174774169922,
"sampling/sampling_logp_difference/mean": 0.6590549349784851,
"step": 3,
"step_time": 5.095425448002061
},
{
"clip_ratio/high_max": 0.03125,
"clip_ratio/high_mean": 0.015625,
"clip_ratio/low_mean": 0.0062500000931322575,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.021875000093132257,
"entropy": 3.064293920993805,
"epoch": 8e-05,
"grad_norm": 0.18429496884346008,
"kl": 0.0012484827602747828,
"learning_rate": 8.571428571428572e-07,
"loss": -0.0041,
"step": 4,
"step_time": 3.698633335996419
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 276.0,
"completions/max_terminated_length": 276.0,
"completions/mean_length": 228.375,
"completions/mean_terminated_length": 228.375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 3.0153795182704926,
"epoch": 0.0001,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.3265765905380249,
"kl": 0.0012420682760421187,
"learning_rate": 1.142857142857143e-06,
"loss": 0.0053,
"num_tokens": 75373.0,
"reward": -0.34361034631729126,
"reward_std": 0.057992346584796906,
"rewards/rollout_reward_func/mean": -0.34361034631729126,
"rewards/rollout_reward_func/std": 0.18802577257156372,
"sampling/importance_sampling_ratio/max": 0.5093293786048889,
"sampling/importance_sampling_ratio/mean": 0.3409231901168823,
"sampling/importance_sampling_ratio/min": 4.8556185738846125e-09,
"sampling/sampling_logp_difference/max": 9.86337947845459,
"sampling/sampling_logp_difference/mean": 0.4095209240913391,
"step": 5,
"step_time": 5.0735454369932995
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 3.0051720440387726,
"epoch": 0.00012,
"grad_norm": 0.31776297092437744,
"kl": 0.0012318175904511008,
"learning_rate": 1.4285714285714286e-06,
"loss": 0.0053,
"step": 6,
"step_time": 2.5831480210108566
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 272.0,
"completions/max_terminated_length": 272.0,
"completions/mean_length": 250.46875,
"completions/mean_terminated_length": 250.46875,
"completions/min_length": 229.0,
"completions/min_terminated_length": 229.0,
"entropy": 2.8582278192043304,
"epoch": 0.00014,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.2764955759048462,
"kl": 0.0006178142502903938,
"learning_rate": 1.7142857142857145e-06,
"loss": 0.0004,
"num_tokens": 101372.0,
"reward": -0.29117995500564575,
"reward_std": 0.09318102151155472,
"rewards/rollout_reward_func/mean": -0.29117995500564575,
"rewards/rollout_reward_func/std": 0.2679314613342285,
"sampling/importance_sampling_ratio/max": 0.49537980556488037,
"sampling/importance_sampling_ratio/mean": 0.349077433347702,
"sampling/importance_sampling_ratio/min": 0.25360107421875,
"sampling/sampling_logp_difference/max": 1.0881049633026123,
"sampling/sampling_logp_difference/mean": 0.28574055433273315,
"step": 7,
"step_time": 5.081170800003747
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 2.862494111061096,
"epoch": 0.00016,
"grad_norm": 0.24010437726974487,
"kl": 0.0006794510409235954,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0005,
"step": 8,
"step_time": 3.1074692439869978
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0028409091755747795,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0028409091755747795,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 283.0,
"completions/max_terminated_length": 283.0,
"completions/mean_length": 235.875,
"completions/mean_terminated_length": 234.41934204101562,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 3.4469158351421356,
"epoch": 0.00018,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.23515327274799347,
"kl": 0.0016030340811994392,
"learning_rate": 2.285714285714286e-06,
"loss": -0.0015,
"num_tokens": 127272.0,
"reward": -0.192843496799469,
"reward_std": 0.09330694377422333,
"rewards/rollout_reward_func/mean": -0.192843496799469,
"rewards/rollout_reward_func/std": 0.5610886812210083,
"sampling/importance_sampling_ratio/max": 0.41105180978775024,
"sampling/importance_sampling_ratio/mean": 0.30184194445610046,
"sampling/importance_sampling_ratio/min": 6.500172067269716e-10,
"sampling/sampling_logp_difference/max": 10.035122871398926,
"sampling/sampling_logp_difference/mean": 0.6302409172058105,
"step": 9,
"step_time": 5.048265172990796
},
{
"clip_ratio/high_max": 0.03125,
"clip_ratio/high_mean": 0.015625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"entropy": 3.4479992985725403,
"epoch": 0.0002,
"grad_norm": 0.23885409533977509,
"kl": 0.0010869057805393822,
"learning_rate": 2.571428571428571e-06,
"loss": -0.0018,
"step": 10,
"step_time": 2.5924187649870873
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 276.0,
"completions/max_terminated_length": 276.0,
"completions/mean_length": 245.9375,
"completions/mean_terminated_length": 245.9375,
"completions/min_length": 229.0,
"completions/min_terminated_length": 229.0,
"entropy": 2.81420236825943,
"epoch": 0.00022,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.2363705188035965,
"kl": 0.0017093666829168797,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.001,
"num_tokens": 152122.0,
"reward": -0.3222354054450989,
"reward_std": 0.06711984425783157,
"rewards/rollout_reward_func/mean": -0.3222354054450989,
"rewards/rollout_reward_func/std": 0.3185221254825592,
"sampling/importance_sampling_ratio/max": 0.46333321928977966,
"sampling/importance_sampling_ratio/mean": 0.3522881865501404,
"sampling/importance_sampling_ratio/min": 0.2758025825023651,
"sampling/sampling_logp_difference/max": 1.0277656316757202,
"sampling/sampling_logp_difference/mean": 0.27784621715545654,
"step": 11,
"step_time": 5.074914941003954
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 2.8088128864765167,
"epoch": 0.00024,
"grad_norm": 0.2671680748462677,
"kl": 0.0009637279435992241,
"learning_rate": 3.142857142857143e-06,
"loss": 0.0005,
"step": 12,
"step_time": 3.1117320080084028
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.012500000186264515,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012500000186264515,
"completions/clipped_ratio": 0.0,
"completions/max_length": 276.0,
"completions/max_terminated_length": 276.0,
"completions/mean_length": 220.90625,
"completions/mean_terminated_length": 220.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 3.451016843318939,
"epoch": 0.00026,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.2638246715068817,
"kl": 0.002861911394575145,
"learning_rate": 3.428571428571429e-06,
"loss": -0.002,
"num_tokens": 177459.0,
"reward": -0.3641868233680725,
"reward_std": 0.15367230772972107,
"rewards/rollout_reward_func/mean": -0.3641868233680725,
"rewards/rollout_reward_func/std": 0.29250121116638184,
"sampling/importance_sampling_ratio/max": 0.44515958428382874,
"sampling/importance_sampling_ratio/mean": 0.2989964485168457,
"sampling/importance_sampling_ratio/min": 2.1738792987946454e-09,
"sampling/sampling_logp_difference/max": 9.765713691711426,
"sampling/sampling_logp_difference/mean": 0.689436674118042,
"step": 13,
"step_time": 5.032078587006254
},
{
"clip_ratio/high_max": 0.028125000186264515,
"clip_ratio/high_mean": 0.014062500093132257,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.014062500093132257,
"entropy": 3.4329649209976196,
"epoch": 0.00028,
"grad_norm": 0.22904707491397858,
"kl": 0.003218118588847574,
"learning_rate": 3.7142857142857146e-06,
"loss": -0.0022,
"step": 14,
"step_time": 2.5871444669901393
},
{
"clip_ratio/high_max": 0.012500000186264515,
"clip_ratio/high_mean": 0.0062500000931322575,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0062500000931322575,
"completions/clipped_ratio": 0.0,
"completions/max_length": 276.0,
"completions/max_terminated_length": 276.0,
"completions/mean_length": 233.4375,
"completions/mean_terminated_length": 233.4375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 2.8146192133426666,
"epoch": 0.0003,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.19310137629508972,
"kl": 0.0009782408815226518,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0002,
"num_tokens": 202417.0,
"reward": -0.19271773099899292,
"reward_std": 0.155815988779068,
"rewards/rollout_reward_func/mean": -0.19271773099899292,
"rewards/rollout_reward_func/std": 0.4227723181247711,
"sampling/importance_sampling_ratio/max": 0.4287247955799103,
"sampling/importance_sampling_ratio/mean": 0.3414804935455322,
"sampling/importance_sampling_ratio/min": 0.20568381249904633,
"sampling/sampling_logp_difference/max": 1.0057648420333862,
"sampling/sampling_logp_difference/mean": 0.28122472763061523,
"step": 15,
"step_time": 4.874961112989695
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 2.8126984536647797,
"epoch": 0.00032,
"grad_norm": 0.2404056191444397,
"kl": 0.0004795501008629799,
"learning_rate": 4.2857142857142855e-06,
"loss": 0.0002,
"step": 16,
"step_time": 3.1072658550037886
},
{
"clip_ratio/high_max": 0.03125,
"clip_ratio/high_mean": 0.015625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 244.0,
"completions/max_terminated_length": 244.0,
"completions/mean_length": 239.375,
"completions/mean_terminated_length": 239.375,
"completions/min_length": 229.0,
"completions/min_terminated_length": 229.0,
"entropy": 2.7332915663719177,
"epoch": 0.00034,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.09953422099351883,
"kl": 0.0015153931453824043,
"learning_rate": 4.571428571428572e-06,
"loss": 0.0014,
"num_tokens": 228193.0,
"reward": -0.2743194103240967,
"reward_std": 0.06014951318502426,
"rewards/rollout_reward_func/mean": -0.2743194103240967,
"rewards/rollout_reward_func/std": 0.2062394618988037,
"sampling/importance_sampling_ratio/max": 0.4074307382106781,
"sampling/importance_sampling_ratio/mean": 0.362698495388031,
"sampling/importance_sampling_ratio/min": 0.211087167263031,
"sampling/sampling_logp_difference/max": 0.9929541349411011,
"sampling/sampling_logp_difference/mean": 0.26707231998443604,
"step": 17,
"step_time": 4.845946382018155
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 2.7276317477226257,
"epoch": 0.00036,
"grad_norm": 0.08120391517877579,
"kl": 0.00179352518171072,
"learning_rate": 4.857142857142858e-06,
"loss": 0.0014,
"step": 18,
"step_time": 2.566697195004963
},
{
"clip_ratio/high_max": 0.03125,
"clip_ratio/high_mean": 0.015625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 274.0,
"completions/max_terminated_length": 274.0,
"completions/mean_length": 230.125,
"completions/mean_terminated_length": 230.125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 2.9977245032787323,
"epoch": 0.00038,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.420897901058197,
"kl": 0.006651273382885847,
"learning_rate": 5.142857142857142e-06,
"loss": 0.0059,
"num_tokens": 253409.0,
"reward": -0.1733579933643341,
"reward_std": 0.19301274418830872,
"rewards/rollout_reward_func/mean": -0.1733579933643341,
"rewards/rollout_reward_func/std": 0.2591477334499359,
"sampling/importance_sampling_ratio/max": 0.5843319892883301,
"sampling/importance_sampling_ratio/mean": 0.3510555028915405,
"sampling/importance_sampling_ratio/min": 2.4533126641301806e-09,
"sampling/sampling_logp_difference/max": 9.161478996276855,
"sampling/sampling_logp_difference/mean": 0.43575042486190796,
"step": 19,
"step_time": 5.015811472003406
},
{
"clip_ratio/high_max": 0.046875,
"clip_ratio/high_mean": 0.0234375,
"clip_ratio/low_mean": 0.014062500093132257,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03749999962747097,
"entropy": 2.9959041476249695,
"epoch": 0.0004,
"grad_norm": 0.34493955969810486,
"kl": 0.008892004458175506,
"learning_rate": 5.428571428571429e-06,
"loss": 0.0054,
"step": 20,
"step_time": 3.104410635001841
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 276.0,
"completions/max_terminated_length": 276.0,
"completions/mean_length": 242.59375,
"completions/mean_terminated_length": 242.59375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 2.8769345581531525,
"epoch": 0.00042,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.2891327738761902,
"kl": 0.019077016040682793,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.0007,
"num_tokens": 279428.0,
"reward": -0.2033514380455017,
"reward_std": 0.0789310410618782,
"rewards/rollout_reward_func/mean": -0.2033514380455017,
"rewards/rollout_reward_func/std": 0.2912832200527191,
"sampling/importance_sampling_ratio/max": 0.5870522856712341,
"sampling/importance_sampling_ratio/mean": 0.3549953103065491,
"sampling/importance_sampling_ratio/min": 0.2617434561252594,
"sampling/sampling_logp_difference/max": 1.0146353244781494,
"sampling/sampling_logp_difference/mean": 0.28163671493530273,
"step": 21,
"step_time": 5.382710714999121
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 2.873391270637512,
"epoch": 0.00044,
"grad_norm": 0.2984742820262909,
"kl": 0.02395339752547443,
"learning_rate": 6e-06,
"loss": 0.001,
"step": 22,
"step_time": 2.595737472984183
},
{
"clip_ratio/high_max": 0.03125,
"clip_ratio/high_mean": 0.015625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 269.0,
"completions/max_terminated_length": 269.0,
"completions/mean_length": 226.96875,
"completions/mean_terminated_length": 226.96875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 3.036062628030777,
"epoch": 0.00046,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.27704352140426636,
"kl": 0.04521754803135991,
"learning_rate": 6.285714285714286e-06,
"loss": -0.0044,
"num_tokens": 304835.0,
"reward": 0.03992068022489548,
"reward_std": 0.18363338708877563,
"rewards/rollout_reward_func/mean": 0.03992068022489548,
"rewards/rollout_reward_func/std": 0.47476619482040405,
"sampling/importance_sampling_ratio/max": 0.4635872542858124,
"sampling/importance_sampling_ratio/mean": 0.3497931957244873,
"sampling/importance_sampling_ratio/min": 5.469144026548634e-10,
"sampling/sampling_logp_difference/max": 10.450542449951172,
"sampling/sampling_logp_difference/mean": 0.45533978939056396,
"step": 23,
"step_time": 4.866940429994429
},
{
"clip_ratio/high_max": 0.07500000018626451,
"clip_ratio/high_mean": 0.03750000009313226,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.04531250009313226,
"entropy": 3.0493311882019043,
"epoch": 0.00048,
"grad_norm": 0.20107966661453247,
"kl": 0.06984312972053885,
"learning_rate": 6.571428571428572e-06,
"loss": -0.005,
"step": 24,
"step_time": 2.5650386179913767
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 271.0,
"completions/max_terminated_length": 271.0,
"completions/mean_length": 219.78125,
"completions/mean_terminated_length": 219.78125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 2.8874370455741882,
"epoch": 0.0005,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.23385846614837646,
"kl": 0.054296525893732905,
"learning_rate": 6.857142857142858e-06,
"loss": -0.0032,
"num_tokens": 329752.0,
"reward": 0.12345941364765167,
"reward_std": 0.1870170384645462,
"rewards/rollout_reward_func/mean": 0.12345941364765167,
"rewards/rollout_reward_func/std": 0.4734259247779846,
"sampling/importance_sampling_ratio/max": 0.5099582672119141,
"sampling/importance_sampling_ratio/mean": 0.35814619064331055,
"sampling/importance_sampling_ratio/min": 0.2283174693584442,
"sampling/sampling_logp_difference/max": 0.9733547568321228,
"sampling/sampling_logp_difference/mean": 0.28791680932044983,
"step": 25,
"step_time": 5.722078731007059
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"entropy": 2.8810991644859314,
"epoch": 0.00052,
"grad_norm": 0.22762461006641388,
"kl": 0.07796057686209679,
"learning_rate": 7.1428571428571436e-06,
"loss": -0.0029,
"step": 26,
"step_time": 2.5734081249975134
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 276.0,
"completions/max_terminated_length": 276.0,
"completions/mean_length": 250.03125,
"completions/mean_terminated_length": 250.03125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 2.8771714568138123,
"epoch": 0.00054,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.45455417037010193,
"kl": 0.05301585793495178,
"learning_rate": 7.428571428571429e-06,
"loss": -0.0037,
"num_tokens": 355821.0,
"reward": -0.27320465445518494,
"reward_std": 0.1116408258676529,
"rewards/rollout_reward_func/mean": -0.27320465445518494,
"rewards/rollout_reward_func/std": 0.4916592240333557,
"sampling/importance_sampling_ratio/max": 0.4857718050479889,
"sampling/importance_sampling_ratio/mean": 0.3579443693161011,
"sampling/importance_sampling_ratio/min": 0.22359538078308105,
"sampling/sampling_logp_difference/max": 0.9943762421607971,
"sampling/sampling_logp_difference/mean": 0.2942765951156616,
"step": 27,
"step_time": 4.950633588006895
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 2.8775526881217957,
"epoch": 0.00056,
"grad_norm": 0.48685505986213684,
"kl": 0.06347140111029148,
"learning_rate": 7.714285714285716e-06,
"loss": -0.0037,
"step": 28,
"step_time": 2.586594164989947
},
{
"clip_ratio/high_max": 0.02500000037252903,
"clip_ratio/high_mean": 0.012500000186264515,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012500000186264515,
"completions/clipped_ratio": 0.0,
"completions/max_length": 270.0,
"completions/max_terminated_length": 270.0,
"completions/mean_length": 232.6875,
"completions/mean_terminated_length": 232.6875,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"entropy": 2.9303584694862366,
"epoch": 0.00058,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.13718773424625397,
"kl": 0.21013515145750716,
"learning_rate": 8.000000000000001e-06,
"loss": -0.0006,
"num_tokens": 380679.0,
"reward": -0.10938680171966553,
"reward_std": 0.03988201543688774,
"rewards/rollout_reward_func/mean": -0.10938680171966553,
"rewards/rollout_reward_func/std": 0.39720484614372253,
"sampling/importance_sampling_ratio/max": 0.46476417779922485,
"sampling/importance_sampling_ratio/mean": 0.32575637102127075,
"sampling/importance_sampling_ratio/min": 1.840887664528168e-09,
"sampling/sampling_logp_difference/max": 8.97761058807373,
"sampling/sampling_logp_difference/mean": 0.43960580229759216,
"step": 29,
"step_time": 5.402698652986146
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.021875000093132257,
"clip_ratio/low_min": 0.015625,
"clip_ratio/region_mean": 0.021875000093132257,
"entropy": 2.9387083649635315,
"epoch": 0.0006,
"grad_norm": 0.12522387504577637,
"kl": 0.28398286853916943,
"learning_rate": 8.285714285714287e-06,
"loss": -0.0007,
"step": 30,
"step_time": 2.5845938549973653
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 276.0,
"completions/max_terminated_length": 276.0,
"completions/mean_length": 240.71875,
"completions/mean_terminated_length": 240.71875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 2.932174474000931,
"epoch": 0.00062,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.2146458476781845,
"kl": 0.11049511469900608,
"learning_rate": 8.571428571428571e-06,
"loss": 0.0018,
"num_tokens": 405834.0,
"reward": -0.22345471382141113,
"reward_std": 0.14430388808250427,
"rewards/rollout_reward_func/mean": -0.22345471382141113,
"rewards/rollout_reward_func/std": 0.2893519699573517,
"sampling/importance_sampling_ratio/max": 0.6078521013259888,
"sampling/importance_sampling_ratio/mean": 0.36050623655319214,
"sampling/importance_sampling_ratio/min": 0.19197672605514526,
"sampling/sampling_logp_difference/max": 1.0430428981781006,
"sampling/sampling_logp_difference/mean": 0.30271923542022705,
"step": 31,
"step_time": 4.912783895008033
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 2.9295111298561096,
"epoch": 0.00064,
"grad_norm": 0.32109126448631287,
"kl": 0.11441947985440493,
"learning_rate": 8.857142857142858e-06,
"loss": 0.0021,
"step": 32,
"step_time": 2.578935690005892
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 276.0,
"completions/max_terminated_length": 276.0,
"completions/mean_length": 217.25,
"completions/mean_terminated_length": 217.25,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 2.9171493649482727,
"epoch": 0.00066,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.18634726107120514,
"kl": 0.29283354338258505,
"learning_rate": 9.142857142857144e-06,
"loss": -0.0046,
"num_tokens": 430838.0,
"reward": -0.021942690014839172,
"reward_std": 0.2572951316833496,
"rewards/rollout_reward_func/mean": -0.021942690014839172,
"rewards/rollout_reward_func/std": 0.3850228190422058,
"sampling/importance_sampling_ratio/max": 0.6168394684791565,
"sampling/importance_sampling_ratio/mean": 0.3683329224586487,
"sampling/importance_sampling_ratio/min": 0.1770981401205063,
"sampling/sampling_logp_difference/max": 1.0844477415084839,
"sampling/sampling_logp_difference/mean": 0.3178885579109192,
"step": 33,
"step_time": 5.883909591000702
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.018229166977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.018229166977107525,
"entropy": 2.9187699258327484,
"epoch": 0.00068,
"grad_norm": 0.17209632694721222,
"kl": 0.32030509738251567,
"learning_rate": 9.42857142857143e-06,
"loss": -0.0045,
"step": 34,
"step_time": 2.583553667005617
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 269.0,
"completions/max_terminated_length": 269.0,
"completions/mean_length": 251.15625,
"completions/mean_terminated_length": 251.15625,
"completions/min_length": 233.0,
"completions/min_terminated_length": 233.0,
"entropy": 2.9039885699748993,
"epoch": 0.0007,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21747003495693207,
"kl": 0.4501009024679661,
"learning_rate": 9.714285714285715e-06,
"loss": -0.01,
"num_tokens": 457291.0,
"reward": -0.008474733680486679,
"reward_std": 0.06938640028238297,
"rewards/rollout_reward_func/mean": -0.008474733680486679,
"rewards/rollout_reward_func/std": 0.4130137264728546,
"sampling/importance_sampling_ratio/max": 0.516679584980011,
"sampling/importance_sampling_ratio/mean": 0.33271369338035583,
"sampling/importance_sampling_ratio/min": 7.735414442322508e-07,
"sampling/sampling_logp_difference/max": 3.892051935195923,
"sampling/sampling_logp_difference/mean": 0.387271523475647,
"step": 35,
"step_time": 4.876037522997649
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.012620192486792803,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.012620192486792803,
"entropy": 2.9125703275203705,
"epoch": 0.00072,
"grad_norm": 0.22622092068195343,
"kl": 0.514522522687912,
"learning_rate": 1e-05,
"loss": -0.0102,
"step": 36,
"step_time": 2.586019046000729
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 276.0,
"completions/max_terminated_length": 276.0,
"completions/mean_length": 223.71875,
"completions/mean_terminated_length": 223.71875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 2.898330956697464,
"epoch": 0.00074,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.18963545560836792,
"kl": 0.17799657548312098,
"learning_rate": 9.9999999995372e-06,
"loss": -0.0066,
"num_tokens": 482530.0,
"reward": -0.16677546501159668,
"reward_std": 0.156550794839859,
"rewards/rollout_reward_func/mean": -0.16677546501159668,
"rewards/rollout_reward_func/std": 0.27479690313339233,
"sampling/importance_sampling_ratio/max": 0.6498793959617615,
"sampling/importance_sampling_ratio/mean": 0.3772757649421692,
"sampling/importance_sampling_ratio/min": 0.20579339563846588,
"sampling/sampling_logp_difference/max": 1.1838572025299072,
"sampling/sampling_logp_difference/mean": 0.31108224391937256,
"step": 37,
"step_time": 6.162664868999855
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 2.899034082889557,
"epoch": 0.00076,
"grad_norm": 0.173002228140831,
"kl": 0.2087516870815307,
"learning_rate": 9.999999998148802e-06,
"loss": -0.007,
"step": 38,
"step_time": 2.6345554510044167
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 276.0,
"completions/max_terminated_length": 276.0,
"completions/mean_length": 246.03125,
"completions/mean_terminated_length": 246.03125,
"completions/min_length": 229.0,
"completions/min_terminated_length": 229.0,
"entropy": 2.736233025789261,
"epoch": 0.00078,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.25805914402008057,
"kl": 0.31404404155910015,
"learning_rate": 9.999999995834804e-06,
"loss": -0.0102,
"num_tokens": 508515.0,
"reward": -0.0983460322022438,
"reward_std": 0.1256507933139801,
"rewards/rollout_reward_func/mean": -0.0983460322022438,
"rewards/rollout_reward_func/std": 0.5733147263526917,
"sampling/importance_sampling_ratio/max": 0.5405430793762207,
"sampling/importance_sampling_ratio/mean": 0.3518107533454895,
"sampling/importance_sampling_ratio/min": 0.08160559087991714,
"sampling/sampling_logp_difference/max": 1.1959668397903442,
"sampling/sampling_logp_difference/mean": 0.294846773147583,
"step": 39,
"step_time": 4.918988351011649
},
{
"clip_ratio/high_max": 0.059375000186264515,
"clip_ratio/high_mean": 0.029687500093132257,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03750000009313226,
"entropy": 2.7310811281204224,
"epoch": 0.0008,
"grad_norm": 0.16329945623874664,
"kl": 0.36445298697799444,
"learning_rate": 9.999999992595207e-06,
"loss": -0.0111,
"step": 40,
"step_time": 2.647009986983903
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 276.0,
"completions/max_terminated_length": 276.0,
"completions/mean_length": 236.96875,
"completions/mean_terminated_length": 236.96875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 2.7252674400806427,
"epoch": 0.00082,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.1424042135477066,
"kl": 0.3397897696122527,
"learning_rate": 9.999999988430008e-06,
"loss": 0.0013,
"num_tokens": 533826.0,
"reward": -0.16728736460208893,
"reward_std": 0.09324388951063156,
"rewards/rollout_reward_func/mean": -0.16728736460208893,
"rewards/rollout_reward_func/std": 0.3316197991371155,
"sampling/importance_sampling_ratio/max": 0.42009323835372925,
"sampling/importance_sampling_ratio/mean": 0.36413732171058655,
"sampling/importance_sampling_ratio/min": 0.2344428449869156,
"sampling/sampling_logp_difference/max": 1.2678487300872803,
"sampling/sampling_logp_difference/mean": 0.2750445306301117,
"step": 41,
"step_time": 6.286279361011111
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 2.705561101436615,
"epoch": 0.00084,
"grad_norm": 0.14423894882202148,
"kl": 0.31878931261599064,
"learning_rate": 9.999999983339212e-06,
"loss": 0.0012,
"step": 42,
"step_time": 2.644381924001209
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 276.0,
"completions/max_terminated_length": 276.0,
"completions/mean_length": 248.25,
"completions/mean_terminated_length": 248.25,
"completions/min_length": 229.0,
"completions/min_terminated_length": 229.0,
"entropy": 2.7233501076698303,
"epoch": 0.00086,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.27433744072914124,
"kl": 0.26571275014430285,
"learning_rate": 9.999999977322818e-06,
"loss": 0.001,
"num_tokens": 559034.0,
"reward": -0.16031917929649353,
"reward_std": 0.07910899817943573,
"rewards/rollout_reward_func/mean": -0.16031917929649353,
"rewards/rollout_reward_func/std": 0.3891371190547943,
"sampling/importance_sampling_ratio/max": 0.7106043696403503,
"sampling/importance_sampling_ratio/mean": 0.3692682087421417,
"sampling/importance_sampling_ratio/min": 0.22985637187957764,
"sampling/sampling_logp_difference/max": 0.6034185886383057,
"sampling/sampling_logp_difference/mean": 0.29134175181388855,
"step": 43,
"step_time": 5.0206790450029075
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"entropy": 2.7124963998794556,
"epoch": 0.00088,
"grad_norm": 0.22208449244499207,
"kl": 0.2730150371789932,
"learning_rate": 9.999999970380822e-06,
"loss": 0.0003,
"step": 44,
"step_time": 2.6403936139904545
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 276.0,
"completions/max_terminated_length": 276.0,
"completions/mean_length": 227.28125,
"completions/mean_terminated_length": 227.28125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 2.7599149346351624,
"epoch": 0.0009,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.17858587205410004,
"kl": 0.21788341365754604,
"learning_rate": 9.999999962513228e-06,
"loss": 0.0004,
"num_tokens": 583771.0,
"reward": -0.13604271411895752,
"reward_std": 0.10448753833770752,
"rewards/rollout_reward_func/mean": -0.13604271411895752,
"rewards/rollout_reward_func/std": 0.4671633243560791,
"sampling/importance_sampling_ratio/max": 0.6547749638557434,
"sampling/importance_sampling_ratio/mean": 0.3665401041507721,
"sampling/importance_sampling_ratio/min": 0.06266149878501892,
"sampling/sampling_logp_difference/max": 1.4546922445297241,
"sampling/sampling_logp_difference/mean": 0.31887930631637573,
"step": 45,
"step_time": 6.400341804008349
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 2.7397755682468414,
"epoch": 0.00092,
"grad_norm": 0.18169698119163513,
"kl": 0.24561153631657362,
"learning_rate": 9.999999953720035e-06,
"loss": -0.0003,
"step": 46,
"step_time": 3.840367698998307
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 276.0,
"completions/max_terminated_length": 276.0,
"completions/mean_length": 232.625,
"completions/mean_terminated_length": 232.625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 2.7486678063869476,
"epoch": 0.00094,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.1613720804452896,
"kl": 0.5124468728899956,
"learning_rate": 9.99999994400124e-06,
"loss": -0.0059,
"num_tokens": 609515.0,
"reward": -0.22357675433158875,
"reward_std": 0.11547203361988068,
"rewards/rollout_reward_func/mean": -0.22357675433158875,
"rewards/rollout_reward_func/std": 0.30443012714385986,
"sampling/importance_sampling_ratio/max": 0.7394812107086182,
"sampling/importance_sampling_ratio/mean": 0.39799731969833374,
"sampling/importance_sampling_ratio/min": 0.003087133402004838,
"sampling/sampling_logp_difference/max": 2.968440055847168,
"sampling/sampling_logp_difference/mean": 0.33213719725608826,
"step": 47,
"step_time": 5.084561435993237
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 2.717843770980835,
"epoch": 0.00096,
"grad_norm": 0.15270604193210602,
"kl": 0.5517388842999935,
"learning_rate": 9.999999933356848e-06,
"loss": -0.0065,
"step": 48,
"step_time": 2.594435404003889
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 267.0,
"completions/max_terminated_length": 267.0,
"completions/mean_length": 236.96875,
"completions/mean_terminated_length": 236.96875,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"entropy": 2.6678203344345093,
"epoch": 0.00098,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.35319241881370544,
"kl": 0.5060755051672459,
"learning_rate": 9.999999921786855e-06,
"loss": -0.0234,
"num_tokens": 635106.0,
"reward": -0.08378944545984268,
"reward_std": 0.06696479767560959,
"rewards/rollout_reward_func/mean": -0.08378944545984268,
"rewards/rollout_reward_func/std": 0.1741461306810379,
"sampling/importance_sampling_ratio/max": 0.7419227957725525,
"sampling/importance_sampling_ratio/mean": 0.3981371819972992,
"sampling/importance_sampling_ratio/min": 9.574564474590375e-10,
"sampling/sampling_logp_difference/max": 8.066584587097168,
"sampling/sampling_logp_difference/mean": 0.44935131072998047,
"step": 49,
"step_time": 4.888581562976469
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.04531250009313226,
"clip_ratio/low_min": 0.015625,
"clip_ratio/region_mean": 0.04531250009313226,
"entropy": 2.6393848061561584,
"epoch": 0.001,
"grad_norm": 0.2584957182407379,
"kl": 0.6151874884963036,
"learning_rate": 9.999999909291265e-06,
"loss": -0.0249,
"step": 50,
"step_time": 3.5894425570004387
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 276.0,
"completions/max_terminated_length": 276.0,
"completions/mean_length": 247.375,
"completions/mean_terminated_length": 247.375,
"completions/min_length": 229.0,
"completions/min_terminated_length": 229.0,
"entropy": 2.4370608031749725,
"epoch": 0.00102,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.11229121685028076,
"kl": 0.5419403500854969,
"learning_rate": 9.999999895870075e-06,
"loss": -0.015,
"num_tokens": 660746.0,
"reward": -0.18949081003665924,
"reward_std": 0.13007065653800964,
"rewards/rollout_reward_func/mean": -0.18949081003665924,
"rewards/rollout_reward_func/std": 0.47671881318092346,
"sampling/importance_sampling_ratio/max": 0.7944492101669312,
"sampling/importance_sampling_ratio/mean": 0.44122982025146484,
"sampling/importance_sampling_ratio/min": 0.17443421483039856,
"sampling/sampling_logp_difference/max": 0.8063274621963501,
"sampling/sampling_logp_difference/mean": 0.27315592765808105,
"step": 51,
"step_time": 4.902517995004018
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"entropy": 2.3851166665554047,
"epoch": 0.00104,
"grad_norm": 0.13190092146396637,
"kl": 0.5634779073297977,
"learning_rate": 9.999999881523285e-06,
"loss": -0.0154,
"step": 52,
"step_time": 2.5712496630003443
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 276.0,
"completions/max_terminated_length": 276.0,
"completions/mean_length": 250.34375,
"completions/mean_terminated_length": 250.34375,
"completions/min_length": 184.0,
"completions/min_terminated_length": 184.0,
"entropy": 2.39392626285553,
"epoch": 0.00106,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.22307628393173218,
"kl": 1.1005694568157196,
"learning_rate": 9.999999866250896e-06,
"loss": 0.0005,
"num_tokens": 686797.0,
"reward": -0.24457362294197083,
"reward_std": 0.06005535274744034,
"rewards/rollout_reward_func/mean": -0.24457362294197083,
"rewards/rollout_reward_func/std": 0.387178510427475,
"sampling/importance_sampling_ratio/max": 0.7177804112434387,
"sampling/importance_sampling_ratio/mean": 0.4388836622238159,
"sampling/importance_sampling_ratio/min": 0.0532250739634037,
"sampling/sampling_logp_difference/max": 2.0519614219665527,
"sampling/sampling_logp_difference/mean": 0.3103662431240082,
"step": 53,
"step_time": 4.848857997007144
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.015625,
"clip_ratio/region_mean": 0.0390625,
"entropy": 2.33899587392807,
"epoch": 0.00108,
"grad_norm": 0.2479284703731537,
"kl": 1.431688316166401,
"learning_rate": 9.999999850052909e-06,
"loss": 0.0006,
"step": 54,
"step_time": 3.562508454975614
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 276.0,
"completions/max_terminated_length": 276.0,
"completions/mean_length": 250.28125,
"completions/mean_terminated_length": 250.28125,
"completions/min_length": 241.0,
"completions/min_terminated_length": 241.0,
"entropy": 2.302690327167511,
"epoch": 0.0011,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.10195823013782501,
"kl": 0.3720350982621312,
"learning_rate": 9.99999983292932e-06,
"loss": -0.0047,
"num_tokens": 712758.0,
"reward": -0.21304789185523987,
"reward_std": 0.05584158003330231,
"rewards/rollout_reward_func/mean": -0.21304789185523987,
"rewards/rollout_reward_func/std": 0.31514599919319153,
"sampling/importance_sampling_ratio/max": 0.5960808396339417,
"sampling/importance_sampling_ratio/mean": 0.44993600249290466,
"sampling/importance_sampling_ratio/min": 3.7855832488276064e-05,
"sampling/sampling_logp_difference/max": 4.051816940307617,
"sampling/sampling_logp_difference/mean": 0.2676515281200409,
"step": 55,
"step_time": 5.935931921994779
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.004807692486792803,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.004807692486792803,
"entropy": 2.275217980146408,
"epoch": 0.00112,
"grad_norm": 0.10368601977825165,
"kl": 0.3742258697748184,
"learning_rate": 9.999999814880132e-06,
"loss": -0.0048,
"step": 56,
"step_time": 3.2161756420027814
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 267.0,
"completions/max_terminated_length": 267.0,
"completions/mean_length": 224.03125,
"completions/mean_terminated_length": 224.03125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 2.3107198774814606,
"epoch": 0.00114,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.30300799012184143,
"kl": 1.3965208567678928,
"learning_rate": 9.999999795905347e-06,
"loss": -0.0063,
"num_tokens": 736995.0,
"reward": -0.09847276657819748,
"reward_std": 0.052444059401750565,
"rewards/rollout_reward_func/mean": -0.09847276657819748,
"rewards/rollout_reward_func/std": 0.15305201709270477,
"sampling/importance_sampling_ratio/max": 0.6250765323638916,
"sampling/importance_sampling_ratio/mean": 0.45801815390586853,
"sampling/importance_sampling_ratio/min": 3.5164142708765667e-09,
"sampling/sampling_logp_difference/max": 9.024276733398438,
"sampling/sampling_logp_difference/mean": 0.38153740763664246,
"step": 57,
"step_time": 4.980028837999271
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 2.274234175682068,
"epoch": 0.00116,
"grad_norm": 0.21491846442222595,
"kl": 1.0686182007193565,
"learning_rate": 9.999999776004962e-06,
"loss": -0.0072,
"step": 58,
"step_time": 3.5114306210089126
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 267.0,
"completions/max_terminated_length": 267.0,
"completions/mean_length": 244.0625,
"completions/mean_terminated_length": 244.0625,
"completions/min_length": 229.0,
"completions/min_terminated_length": 229.0,
"entropy": 2.026425540447235,
"epoch": 0.00118,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.17460982501506805,
"kl": 1.455301407724619,
"learning_rate": 9.999999755178978e-06,
"loss": -0.003,
"num_tokens": 762249.0,
"reward": -0.19697391986846924,
"reward_std": 0.037940964102745056,
"rewards/rollout_reward_func/mean": -0.19697391986846924,
"rewards/rollout_reward_func/std": 0.1964397132396698,
"sampling/importance_sampling_ratio/max": 0.670058012008667,
"sampling/importance_sampling_ratio/mean": 0.47034597396850586,
"sampling/importance_sampling_ratio/min": 0.08649313449859619,
"sampling/sampling_logp_difference/max": 1.5561704635620117,
"sampling/sampling_logp_difference/mean": 0.22143109142780304,
"step": 59,
"step_time": 4.800487477994466
}
],
"logging_steps": 1.0,
"max_steps": 200000,
"num_input_tokens_seen": 762249,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}