{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.00118, "eval_steps": 500, "global_step": 59, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 234.90625, "completions/mean_terminated_length": 234.258056640625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.2142516672611237, "epoch": 2e-05, "frac_reward_zero_std": 0.125, "grad_norm": 0.26979702711105347, "kl": 0.0, "learning_rate": 0.0, "loss": 0.005, "num_tokens": 25117.0, "reward": -0.10680253803730011, "reward_std": 0.15763120353221893, "rewards/rollout_reward_func/mean": -0.10680253803730011, "rewards/rollout_reward_func/std": 0.4190797507762909, "sampling/importance_sampling_ratio/max": 0.39367836713790894, "sampling/importance_sampling_ratio/mean": 0.3039231598377228, "sampling/importance_sampling_ratio/min": 1.1115786026014152e-12, "sampling/sampling_logp_difference/max": 8.616442680358887, "sampling/sampling_logp_difference/mean": 0.5023343563079834, "step": 1, "step_time": 6.355231066998385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.2142516672611237, "epoch": 4e-05, "grad_norm": 0.27107080817222595, "kl": 0.0, "learning_rate": 2.8571428571428575e-07, "loss": 0.005, "step": 2, "step_time": 2.6362348689945065 }, { "clip_ratio/high_max": 0.02130681835114956, "clip_ratio/high_mean": 0.01065340917557478, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01065340917557478, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 234.0, "completions/mean_terminated_length": 234.0, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 3.0704929530620575, "epoch": 6e-05, "frac_reward_zero_std": 0.25, "grad_norm": 0.22045230865478516, "kl": 0.0027004847361240536, "learning_rate": 5.714285714285715e-07, "loss": -0.004, "num_tokens": 50037.0, "reward": -0.19287078082561493, "reward_std": 0.10101090371608734, "rewards/rollout_reward_func/mean": -0.19287078082561493, "rewards/rollout_reward_func/std": 0.222161203622818, "sampling/importance_sampling_ratio/max": 0.41802993416786194, "sampling/importance_sampling_ratio/mean": 0.3280646502971649, "sampling/importance_sampling_ratio/min": 3.99030703268274e-17, "sampling/sampling_logp_difference/max": 15.911174774169922, "sampling/sampling_logp_difference/mean": 0.6590549349784851, "step": 3, "step_time": 5.095425448002061 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021875000093132257, "entropy": 3.064293920993805, "epoch": 8e-05, "grad_norm": 0.18429496884346008, "kl": 0.0012484827602747828, "learning_rate": 8.571428571428572e-07, "loss": -0.0041, "step": 4, "step_time": 3.698633335996419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 228.375, "completions/mean_terminated_length": 228.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.0153795182704926, "epoch": 0.0001, "frac_reward_zero_std": 0.5, "grad_norm": 0.3265765905380249, "kl": 0.0012420682760421187, "learning_rate": 1.142857142857143e-06, "loss": 0.0053, "num_tokens": 75373.0, "reward": -0.34361034631729126, "reward_std": 0.057992346584796906, "rewards/rollout_reward_func/mean": -0.34361034631729126, "rewards/rollout_reward_func/std": 0.18802577257156372, "sampling/importance_sampling_ratio/max": 0.5093293786048889, "sampling/importance_sampling_ratio/mean": 0.3409231901168823, "sampling/importance_sampling_ratio/min": 4.8556185738846125e-09, "sampling/sampling_logp_difference/max": 9.86337947845459, "sampling/sampling_logp_difference/mean": 0.4095209240913391, "step": 5, "step_time": 5.0735454369932995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 3.0051720440387726, "epoch": 0.00012, "grad_norm": 0.31776297092437744, "kl": 0.0012318175904511008, "learning_rate": 1.4285714285714286e-06, "loss": 0.0053, "step": 6, "step_time": 2.5831480210108566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 250.46875, "completions/mean_terminated_length": 250.46875, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 2.8582278192043304, "epoch": 0.00014, "frac_reward_zero_std": 0.25, "grad_norm": 0.2764955759048462, "kl": 0.0006178142502903938, "learning_rate": 1.7142857142857145e-06, "loss": 0.0004, "num_tokens": 101372.0, "reward": -0.29117995500564575, "reward_std": 0.09318102151155472, "rewards/rollout_reward_func/mean": -0.29117995500564575, "rewards/rollout_reward_func/std": 0.2679314613342285, "sampling/importance_sampling_ratio/max": 0.49537980556488037, "sampling/importance_sampling_ratio/mean": 0.349077433347702, "sampling/importance_sampling_ratio/min": 0.25360107421875, "sampling/sampling_logp_difference/max": 1.0881049633026123, "sampling/sampling_logp_difference/mean": 0.28574055433273315, "step": 7, "step_time": 5.081170800003747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 2.862494111061096, "epoch": 0.00016, "grad_norm": 0.24010437726974487, "kl": 0.0006794510409235954, "learning_rate": 2.0000000000000003e-06, "loss": 0.0005, "step": 8, "step_time": 3.1074692439869978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.03125, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 235.875, "completions/mean_terminated_length": 234.41934204101562, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.4469158351421356, "epoch": 0.00018, "frac_reward_zero_std": 0.25, "grad_norm": 0.23515327274799347, "kl": 0.0016030340811994392, "learning_rate": 2.285714285714286e-06, "loss": -0.0015, "num_tokens": 127272.0, "reward": -0.192843496799469, "reward_std": 0.09330694377422333, "rewards/rollout_reward_func/mean": -0.192843496799469, "rewards/rollout_reward_func/std": 0.5610886812210083, "sampling/importance_sampling_ratio/max": 0.41105180978775024, "sampling/importance_sampling_ratio/mean": 0.30184194445610046, "sampling/importance_sampling_ratio/min": 6.500172067269716e-10, "sampling/sampling_logp_difference/max": 10.035122871398926, "sampling/sampling_logp_difference/mean": 0.6302409172058105, "step": 9, "step_time": 5.048265172990796 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 3.4479992985725403, "epoch": 0.0002, "grad_norm": 0.23885409533977509, "kl": 0.0010869057805393822, "learning_rate": 2.571428571428571e-06, "loss": -0.0018, "step": 10, "step_time": 2.5924187649870873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 245.9375, "completions/mean_terminated_length": 245.9375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 2.81420236825943, "epoch": 0.00022, "frac_reward_zero_std": 0.25, "grad_norm": 0.2363705188035965, "kl": 0.0017093666829168797, "learning_rate": 2.8571428571428573e-06, "loss": 0.001, "num_tokens": 152122.0, "reward": -0.3222354054450989, "reward_std": 0.06711984425783157, "rewards/rollout_reward_func/mean": -0.3222354054450989, "rewards/rollout_reward_func/std": 0.3185221254825592, "sampling/importance_sampling_ratio/max": 0.46333321928977966, "sampling/importance_sampling_ratio/mean": 0.3522881865501404, "sampling/importance_sampling_ratio/min": 0.2758025825023651, "sampling/sampling_logp_difference/max": 1.0277656316757202, "sampling/sampling_logp_difference/mean": 0.27784621715545654, "step": 11, "step_time": 5.074914941003954 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 2.8088128864765167, "epoch": 0.00024, "grad_norm": 0.2671680748462677, "kl": 0.0009637279435992241, "learning_rate": 3.142857142857143e-06, "loss": 0.0005, "step": 12, "step_time": 3.1117320080084028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 220.90625, "completions/mean_terminated_length": 220.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.451016843318939, "epoch": 0.00026, "frac_reward_zero_std": 0.125, "grad_norm": 0.2638246715068817, "kl": 0.002861911394575145, "learning_rate": 3.428571428571429e-06, "loss": -0.002, "num_tokens": 177459.0, "reward": -0.3641868233680725, "reward_std": 0.15367230772972107, "rewards/rollout_reward_func/mean": -0.3641868233680725, "rewards/rollout_reward_func/std": 0.29250121116638184, "sampling/importance_sampling_ratio/max": 0.44515958428382874, "sampling/importance_sampling_ratio/mean": 0.2989964485168457, "sampling/importance_sampling_ratio/min": 2.1738792987946454e-09, "sampling/sampling_logp_difference/max": 9.765713691711426, "sampling/sampling_logp_difference/mean": 0.689436674118042, "step": 13, "step_time": 5.032078587006254 }, { "clip_ratio/high_max": 0.028125000186264515, "clip_ratio/high_mean": 0.014062500093132257, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014062500093132257, "entropy": 3.4329649209976196, "epoch": 0.00028, "grad_norm": 0.22904707491397858, "kl": 0.003218118588847574, "learning_rate": 3.7142857142857146e-06, "loss": -0.0022, "step": 14, "step_time": 2.5871444669901393 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 233.4375, "completions/mean_terminated_length": 233.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.8146192133426666, "epoch": 0.0003, "frac_reward_zero_std": 0.375, "grad_norm": 0.19310137629508972, "kl": 0.0009782408815226518, "learning_rate": 4.000000000000001e-06, "loss": 0.0002, "num_tokens": 202417.0, "reward": -0.19271773099899292, "reward_std": 0.155815988779068, "rewards/rollout_reward_func/mean": -0.19271773099899292, "rewards/rollout_reward_func/std": 0.4227723181247711, "sampling/importance_sampling_ratio/max": 0.4287247955799103, "sampling/importance_sampling_ratio/mean": 0.3414804935455322, "sampling/importance_sampling_ratio/min": 0.20568381249904633, "sampling/sampling_logp_difference/max": 1.0057648420333862, "sampling/sampling_logp_difference/mean": 0.28122472763061523, "step": 15, "step_time": 4.874961112989695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8126984536647797, "epoch": 0.00032, "grad_norm": 0.2404056191444397, "kl": 0.0004795501008629799, "learning_rate": 4.2857142857142855e-06, "loss": 0.0002, "step": 16, "step_time": 3.1072658550037886 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 239.375, "completions/mean_terminated_length": 239.375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 2.7332915663719177, "epoch": 0.00034, "frac_reward_zero_std": 0.625, "grad_norm": 0.09953422099351883, "kl": 0.0015153931453824043, "learning_rate": 4.571428571428572e-06, "loss": 0.0014, "num_tokens": 228193.0, "reward": -0.2743194103240967, "reward_std": 0.06014951318502426, "rewards/rollout_reward_func/mean": -0.2743194103240967, "rewards/rollout_reward_func/std": 0.2062394618988037, "sampling/importance_sampling_ratio/max": 0.4074307382106781, "sampling/importance_sampling_ratio/mean": 0.362698495388031, "sampling/importance_sampling_ratio/min": 0.211087167263031, "sampling/sampling_logp_difference/max": 0.9929541349411011, "sampling/sampling_logp_difference/mean": 0.26707231998443604, "step": 17, "step_time": 4.845946382018155 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 2.7276317477226257, "epoch": 0.00036, "grad_norm": 0.08120391517877579, "kl": 0.00179352518171072, "learning_rate": 4.857142857142858e-06, "loss": 0.0014, "step": 18, "step_time": 2.566697195004963 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 230.125, "completions/mean_terminated_length": 230.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.9977245032787323, "epoch": 0.00038, "frac_reward_zero_std": 0.0, "grad_norm": 0.420897901058197, "kl": 0.006651273382885847, "learning_rate": 5.142857142857142e-06, "loss": 0.0059, "num_tokens": 253409.0, "reward": -0.1733579933643341, "reward_std": 0.19301274418830872, "rewards/rollout_reward_func/mean": -0.1733579933643341, "rewards/rollout_reward_func/std": 0.2591477334499359, "sampling/importance_sampling_ratio/max": 0.5843319892883301, "sampling/importance_sampling_ratio/mean": 0.3510555028915405, "sampling/importance_sampling_ratio/min": 2.4533126641301806e-09, "sampling/sampling_logp_difference/max": 9.161478996276855, "sampling/sampling_logp_difference/mean": 0.43575042486190796, "step": 19, "step_time": 5.015811472003406 }, { "clip_ratio/high_max": 0.046875, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.014062500093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03749999962747097, "entropy": 2.9959041476249695, "epoch": 0.0004, "grad_norm": 0.34493955969810486, "kl": 0.008892004458175506, "learning_rate": 5.428571428571429e-06, "loss": 0.0054, "step": 20, "step_time": 3.104410635001841 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 242.59375, "completions/mean_terminated_length": 242.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.8769345581531525, "epoch": 0.00042, "frac_reward_zero_std": 0.125, "grad_norm": 0.2891327738761902, "kl": 0.019077016040682793, "learning_rate": 5.7142857142857145e-06, "loss": 0.0007, "num_tokens": 279428.0, "reward": -0.2033514380455017, "reward_std": 0.0789310410618782, "rewards/rollout_reward_func/mean": -0.2033514380455017, "rewards/rollout_reward_func/std": 0.2912832200527191, "sampling/importance_sampling_ratio/max": 0.5870522856712341, "sampling/importance_sampling_ratio/mean": 0.3549953103065491, "sampling/importance_sampling_ratio/min": 0.2617434561252594, "sampling/sampling_logp_difference/max": 1.0146353244781494, "sampling/sampling_logp_difference/mean": 0.28163671493530273, "step": 21, "step_time": 5.382710714999121 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 2.873391270637512, "epoch": 0.00044, "grad_norm": 0.2984742820262909, "kl": 0.02395339752547443, "learning_rate": 6e-06, "loss": 0.001, "step": 22, "step_time": 2.595737472984183 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 226.96875, "completions/mean_terminated_length": 226.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.036062628030777, "epoch": 0.00046, "frac_reward_zero_std": 0.125, "grad_norm": 0.27704352140426636, "kl": 0.04521754803135991, "learning_rate": 6.285714285714286e-06, "loss": -0.0044, "num_tokens": 304835.0, "reward": 0.03992068022489548, "reward_std": 0.18363338708877563, "rewards/rollout_reward_func/mean": 0.03992068022489548, "rewards/rollout_reward_func/std": 0.47476619482040405, "sampling/importance_sampling_ratio/max": 0.4635872542858124, "sampling/importance_sampling_ratio/mean": 0.3497931957244873, "sampling/importance_sampling_ratio/min": 5.469144026548634e-10, "sampling/sampling_logp_difference/max": 10.450542449951172, "sampling/sampling_logp_difference/mean": 0.45533978939056396, "step": 23, "step_time": 4.866940429994429 }, { "clip_ratio/high_max": 0.07500000018626451, "clip_ratio/high_mean": 0.03750000009313226, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04531250009313226, "entropy": 3.0493311882019043, "epoch": 0.00048, "grad_norm": 0.20107966661453247, "kl": 0.06984312972053885, "learning_rate": 6.571428571428572e-06, "loss": -0.005, "step": 24, "step_time": 2.5650386179913767 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 219.78125, "completions/mean_terminated_length": 219.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.8874370455741882, "epoch": 0.0005, "frac_reward_zero_std": 0.375, "grad_norm": 0.23385846614837646, "kl": 0.054296525893732905, "learning_rate": 6.857142857142858e-06, "loss": -0.0032, "num_tokens": 329752.0, "reward": 0.12345941364765167, "reward_std": 0.1870170384645462, "rewards/rollout_reward_func/mean": 0.12345941364765167, "rewards/rollout_reward_func/std": 0.4734259247779846, "sampling/importance_sampling_ratio/max": 0.5099582672119141, "sampling/importance_sampling_ratio/mean": 0.35814619064331055, "sampling/importance_sampling_ratio/min": 0.2283174693584442, "sampling/sampling_logp_difference/max": 0.9733547568321228, "sampling/sampling_logp_difference/mean": 0.28791680932044983, "step": 25, "step_time": 5.722078731007059 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.8810991644859314, "epoch": 0.00052, "grad_norm": 0.22762461006641388, "kl": 0.07796057686209679, "learning_rate": 7.1428571428571436e-06, "loss": -0.0029, "step": 26, "step_time": 2.5734081249975134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 250.03125, "completions/mean_terminated_length": 250.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.8771714568138123, "epoch": 0.00054, "frac_reward_zero_std": 0.125, "grad_norm": 0.45455417037010193, "kl": 0.05301585793495178, "learning_rate": 7.428571428571429e-06, "loss": -0.0037, "num_tokens": 355821.0, "reward": -0.27320465445518494, "reward_std": 0.1116408258676529, "rewards/rollout_reward_func/mean": -0.27320465445518494, "rewards/rollout_reward_func/std": 0.4916592240333557, "sampling/importance_sampling_ratio/max": 0.4857718050479889, "sampling/importance_sampling_ratio/mean": 0.3579443693161011, "sampling/importance_sampling_ratio/min": 0.22359538078308105, "sampling/sampling_logp_difference/max": 0.9943762421607971, "sampling/sampling_logp_difference/mean": 0.2942765951156616, "step": 27, "step_time": 4.950633588006895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.8775526881217957, "epoch": 0.00056, "grad_norm": 0.48685505986213684, "kl": 0.06347140111029148, "learning_rate": 7.714285714285716e-06, "loss": -0.0037, "step": 28, "step_time": 2.586594164989947 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 232.6875, "completions/mean_terminated_length": 232.6875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 2.9303584694862366, "epoch": 0.00058, "frac_reward_zero_std": 0.5, "grad_norm": 0.13718773424625397, "kl": 0.21013515145750716, "learning_rate": 8.000000000000001e-06, "loss": -0.0006, "num_tokens": 380679.0, "reward": -0.10938680171966553, "reward_std": 0.03988201543688774, "rewards/rollout_reward_func/mean": -0.10938680171966553, "rewards/rollout_reward_func/std": 0.39720484614372253, "sampling/importance_sampling_ratio/max": 0.46476417779922485, "sampling/importance_sampling_ratio/mean": 0.32575637102127075, "sampling/importance_sampling_ratio/min": 1.840887664528168e-09, "sampling/sampling_logp_difference/max": 8.97761058807373, "sampling/sampling_logp_difference/mean": 0.43960580229759216, "step": 29, "step_time": 5.402698652986146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.021875000093132257, "clip_ratio/low_min": 0.015625, "clip_ratio/region_mean": 0.021875000093132257, "entropy": 2.9387083649635315, "epoch": 0.0006, "grad_norm": 0.12522387504577637, "kl": 0.28398286853916943, "learning_rate": 8.285714285714287e-06, "loss": -0.0007, "step": 30, "step_time": 2.5845938549973653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 240.71875, "completions/mean_terminated_length": 240.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.932174474000931, "epoch": 0.00062, "frac_reward_zero_std": 0.25, "grad_norm": 0.2146458476781845, "kl": 0.11049511469900608, "learning_rate": 8.571428571428571e-06, "loss": 0.0018, "num_tokens": 405834.0, "reward": -0.22345471382141113, "reward_std": 0.14430388808250427, "rewards/rollout_reward_func/mean": -0.22345471382141113, "rewards/rollout_reward_func/std": 0.2893519699573517, "sampling/importance_sampling_ratio/max": 0.6078521013259888, "sampling/importance_sampling_ratio/mean": 0.36050623655319214, "sampling/importance_sampling_ratio/min": 0.19197672605514526, "sampling/sampling_logp_difference/max": 1.0430428981781006, "sampling/sampling_logp_difference/mean": 0.30271923542022705, "step": 31, "step_time": 4.912783895008033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.9295111298561096, "epoch": 0.00064, "grad_norm": 0.32109126448631287, "kl": 0.11441947985440493, "learning_rate": 8.857142857142858e-06, "loss": 0.0021, "step": 32, "step_time": 2.578935690005892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 217.25, "completions/mean_terminated_length": 217.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.9171493649482727, "epoch": 0.00066, "frac_reward_zero_std": 0.125, "grad_norm": 0.18634726107120514, "kl": 0.29283354338258505, "learning_rate": 9.142857142857144e-06, "loss": -0.0046, "num_tokens": 430838.0, "reward": -0.021942690014839172, "reward_std": 0.2572951316833496, "rewards/rollout_reward_func/mean": -0.021942690014839172, "rewards/rollout_reward_func/std": 0.3850228190422058, "sampling/importance_sampling_ratio/max": 0.6168394684791565, "sampling/importance_sampling_ratio/mean": 0.3683329224586487, "sampling/importance_sampling_ratio/min": 0.1770981401205063, "sampling/sampling_logp_difference/max": 1.0844477415084839, "sampling/sampling_logp_difference/mean": 0.3178885579109192, "step": 33, "step_time": 5.883909591000702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.018229166977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018229166977107525, "entropy": 2.9187699258327484, "epoch": 0.00068, "grad_norm": 0.17209632694721222, "kl": 0.32030509738251567, "learning_rate": 9.42857142857143e-06, "loss": -0.0045, "step": 34, "step_time": 2.583553667005617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 251.15625, "completions/mean_terminated_length": 251.15625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 2.9039885699748993, "epoch": 0.0007, "frac_reward_zero_std": 0.0, "grad_norm": 0.21747003495693207, "kl": 0.4501009024679661, "learning_rate": 9.714285714285715e-06, "loss": -0.01, "num_tokens": 457291.0, "reward": -0.008474733680486679, "reward_std": 0.06938640028238297, "rewards/rollout_reward_func/mean": -0.008474733680486679, "rewards/rollout_reward_func/std": 0.4130137264728546, "sampling/importance_sampling_ratio/max": 0.516679584980011, "sampling/importance_sampling_ratio/mean": 0.33271369338035583, "sampling/importance_sampling_ratio/min": 7.735414442322508e-07, "sampling/sampling_logp_difference/max": 3.892051935195923, "sampling/sampling_logp_difference/mean": 0.387271523475647, "step": 35, "step_time": 4.876037522997649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012620192486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012620192486792803, "entropy": 2.9125703275203705, "epoch": 0.00072, "grad_norm": 0.22622092068195343, "kl": 0.514522522687912, "learning_rate": 1e-05, "loss": -0.0102, "step": 36, "step_time": 2.586019046000729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 223.71875, "completions/mean_terminated_length": 223.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.898330956697464, "epoch": 0.00074, "frac_reward_zero_std": 0.125, "grad_norm": 0.18963545560836792, "kl": 0.17799657548312098, "learning_rate": 9.9999999995372e-06, "loss": -0.0066, "num_tokens": 482530.0, "reward": -0.16677546501159668, "reward_std": 0.156550794839859, "rewards/rollout_reward_func/mean": -0.16677546501159668, "rewards/rollout_reward_func/std": 0.27479690313339233, "sampling/importance_sampling_ratio/max": 0.6498793959617615, "sampling/importance_sampling_ratio/mean": 0.3772757649421692, "sampling/importance_sampling_ratio/min": 0.20579339563846588, "sampling/sampling_logp_difference/max": 1.1838572025299072, "sampling/sampling_logp_difference/mean": 0.31108224391937256, "step": 37, "step_time": 6.162664868999855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 2.899034082889557, "epoch": 0.00076, "grad_norm": 0.173002228140831, "kl": 0.2087516870815307, "learning_rate": 9.999999998148802e-06, "loss": -0.007, "step": 38, "step_time": 2.6345554510044167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 246.03125, "completions/mean_terminated_length": 246.03125, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 2.736233025789261, "epoch": 0.00078, "frac_reward_zero_std": 0.125, "grad_norm": 0.25805914402008057, "kl": 0.31404404155910015, "learning_rate": 9.999999995834804e-06, "loss": -0.0102, "num_tokens": 508515.0, "reward": -0.0983460322022438, "reward_std": 0.1256507933139801, "rewards/rollout_reward_func/mean": -0.0983460322022438, "rewards/rollout_reward_func/std": 0.5733147263526917, "sampling/importance_sampling_ratio/max": 0.5405430793762207, "sampling/importance_sampling_ratio/mean": 0.3518107533454895, "sampling/importance_sampling_ratio/min": 0.08160559087991714, "sampling/sampling_logp_difference/max": 1.1959668397903442, "sampling/sampling_logp_difference/mean": 0.294846773147583, "step": 39, "step_time": 4.918988351011649 }, { "clip_ratio/high_max": 0.059375000186264515, "clip_ratio/high_mean": 0.029687500093132257, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03750000009313226, "entropy": 2.7310811281204224, "epoch": 0.0008, "grad_norm": 0.16329945623874664, "kl": 0.36445298697799444, "learning_rate": 9.999999992595207e-06, "loss": -0.0111, "step": 40, "step_time": 2.647009986983903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 236.96875, "completions/mean_terminated_length": 236.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7252674400806427, "epoch": 0.00082, "frac_reward_zero_std": 0.5, "grad_norm": 0.1424042135477066, "kl": 0.3397897696122527, "learning_rate": 9.999999988430008e-06, "loss": 0.0013, "num_tokens": 533826.0, "reward": -0.16728736460208893, "reward_std": 0.09324388951063156, "rewards/rollout_reward_func/mean": -0.16728736460208893, "rewards/rollout_reward_func/std": 0.3316197991371155, "sampling/importance_sampling_ratio/max": 0.42009323835372925, "sampling/importance_sampling_ratio/mean": 0.36413732171058655, "sampling/importance_sampling_ratio/min": 0.2344428449869156, "sampling/sampling_logp_difference/max": 1.2678487300872803, "sampling/sampling_logp_difference/mean": 0.2750445306301117, "step": 41, "step_time": 6.286279361011111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.705561101436615, "epoch": 0.00084, "grad_norm": 0.14423894882202148, "kl": 0.31878931261599064, "learning_rate": 9.999999983339212e-06, "loss": 0.0012, "step": 42, "step_time": 2.644381924001209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 248.25, "completions/mean_terminated_length": 248.25, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 2.7233501076698303, "epoch": 0.00086, "frac_reward_zero_std": 0.375, "grad_norm": 0.27433744072914124, "kl": 0.26571275014430285, "learning_rate": 9.999999977322818e-06, "loss": 0.001, "num_tokens": 559034.0, "reward": -0.16031917929649353, "reward_std": 0.07910899817943573, "rewards/rollout_reward_func/mean": -0.16031917929649353, "rewards/rollout_reward_func/std": 0.3891371190547943, "sampling/importance_sampling_ratio/max": 0.7106043696403503, "sampling/importance_sampling_ratio/mean": 0.3692682087421417, "sampling/importance_sampling_ratio/min": 0.22985637187957764, "sampling/sampling_logp_difference/max": 0.6034185886383057, "sampling/sampling_logp_difference/mean": 0.29134175181388855, "step": 43, "step_time": 5.0206790450029075 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.7124963998794556, "epoch": 0.00088, "grad_norm": 0.22208449244499207, "kl": 0.2730150371789932, "learning_rate": 9.999999970380822e-06, "loss": 0.0003, "step": 44, "step_time": 2.6403936139904545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 227.28125, "completions/mean_terminated_length": 227.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7599149346351624, "epoch": 0.0009, "frac_reward_zero_std": 0.375, "grad_norm": 0.17858587205410004, "kl": 0.21788341365754604, "learning_rate": 9.999999962513228e-06, "loss": 0.0004, "num_tokens": 583771.0, "reward": -0.13604271411895752, "reward_std": 0.10448753833770752, "rewards/rollout_reward_func/mean": -0.13604271411895752, "rewards/rollout_reward_func/std": 0.4671633243560791, "sampling/importance_sampling_ratio/max": 0.6547749638557434, "sampling/importance_sampling_ratio/mean": 0.3665401041507721, "sampling/importance_sampling_ratio/min": 0.06266149878501892, "sampling/sampling_logp_difference/max": 1.4546922445297241, "sampling/sampling_logp_difference/mean": 0.31887930631637573, "step": 45, "step_time": 6.400341804008349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 2.7397755682468414, "epoch": 0.00092, "grad_norm": 0.18169698119163513, "kl": 0.24561153631657362, "learning_rate": 9.999999953720035e-06, "loss": -0.0003, "step": 46, "step_time": 3.840367698998307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 232.625, "completions/mean_terminated_length": 232.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.7486678063869476, "epoch": 0.00094, "frac_reward_zero_std": 0.125, "grad_norm": 0.1613720804452896, "kl": 0.5124468728899956, "learning_rate": 9.99999994400124e-06, "loss": -0.0059, "num_tokens": 609515.0, "reward": -0.22357675433158875, "reward_std": 0.11547203361988068, "rewards/rollout_reward_func/mean": -0.22357675433158875, "rewards/rollout_reward_func/std": 0.30443012714385986, "sampling/importance_sampling_ratio/max": 0.7394812107086182, "sampling/importance_sampling_ratio/mean": 0.39799731969833374, "sampling/importance_sampling_ratio/min": 0.003087133402004838, "sampling/sampling_logp_difference/max": 2.968440055847168, "sampling/sampling_logp_difference/mean": 0.33213719725608826, "step": 47, "step_time": 5.084561435993237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.717843770980835, "epoch": 0.00096, "grad_norm": 0.15270604193210602, "kl": 0.5517388842999935, "learning_rate": 9.999999933356848e-06, "loss": -0.0065, "step": 48, "step_time": 2.594435404003889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 236.96875, "completions/mean_terminated_length": 236.96875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 2.6678203344345093, "epoch": 0.00098, "frac_reward_zero_std": 0.25, "grad_norm": 0.35319241881370544, "kl": 0.5060755051672459, "learning_rate": 9.999999921786855e-06, "loss": -0.0234, "num_tokens": 635106.0, "reward": -0.08378944545984268, "reward_std": 0.06696479767560959, "rewards/rollout_reward_func/mean": -0.08378944545984268, "rewards/rollout_reward_func/std": 0.1741461306810379, "sampling/importance_sampling_ratio/max": 0.7419227957725525, "sampling/importance_sampling_ratio/mean": 0.3981371819972992, "sampling/importance_sampling_ratio/min": 9.574564474590375e-10, "sampling/sampling_logp_difference/max": 8.066584587097168, "sampling/sampling_logp_difference/mean": 0.44935131072998047, "step": 49, "step_time": 4.888581562976469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.04531250009313226, "clip_ratio/low_min": 0.015625, "clip_ratio/region_mean": 0.04531250009313226, "entropy": 2.6393848061561584, "epoch": 0.001, "grad_norm": 0.2584957182407379, "kl": 0.6151874884963036, "learning_rate": 9.999999909291265e-06, "loss": -0.0249, "step": 50, "step_time": 3.5894425570004387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 247.375, "completions/mean_terminated_length": 247.375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 2.4370608031749725, "epoch": 0.00102, "frac_reward_zero_std": 0.375, "grad_norm": 0.11229121685028076, "kl": 0.5419403500854969, "learning_rate": 9.999999895870075e-06, "loss": -0.015, "num_tokens": 660746.0, "reward": -0.18949081003665924, "reward_std": 0.13007065653800964, "rewards/rollout_reward_func/mean": -0.18949081003665924, "rewards/rollout_reward_func/std": 0.47671881318092346, "sampling/importance_sampling_ratio/max": 0.7944492101669312, "sampling/importance_sampling_ratio/mean": 0.44122982025146484, "sampling/importance_sampling_ratio/min": 0.17443421483039856, "sampling/sampling_logp_difference/max": 0.8063274621963501, "sampling/sampling_logp_difference/mean": 0.27315592765808105, "step": 51, "step_time": 4.902517995004018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 2.3851166665554047, "epoch": 0.00104, "grad_norm": 0.13190092146396637, "kl": 0.5634779073297977, "learning_rate": 9.999999881523285e-06, "loss": -0.0154, "step": 52, "step_time": 2.5712496630003443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 250.34375, "completions/mean_terminated_length": 250.34375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 2.39392626285553, "epoch": 0.00106, "frac_reward_zero_std": 0.375, "grad_norm": 0.22307628393173218, "kl": 1.1005694568157196, "learning_rate": 9.999999866250896e-06, "loss": 0.0005, "num_tokens": 686797.0, "reward": -0.24457362294197083, "reward_std": 0.06005535274744034, "rewards/rollout_reward_func/mean": -0.24457362294197083, "rewards/rollout_reward_func/std": 0.387178510427475, "sampling/importance_sampling_ratio/max": 0.7177804112434387, "sampling/importance_sampling_ratio/mean": 0.4388836622238159, "sampling/importance_sampling_ratio/min": 0.0532250739634037, "sampling/sampling_logp_difference/max": 2.0519614219665527, "sampling/sampling_logp_difference/mean": 0.3103662431240082, "step": 53, "step_time": 4.848857997007144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.015625, "clip_ratio/region_mean": 0.0390625, "entropy": 2.33899587392807, "epoch": 0.00108, "grad_norm": 0.2479284703731537, "kl": 1.431688316166401, "learning_rate": 9.999999850052909e-06, "loss": 0.0006, "step": 54, "step_time": 3.562508454975614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 250.28125, "completions/mean_terminated_length": 250.28125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "entropy": 2.302690327167511, "epoch": 0.0011, "frac_reward_zero_std": 0.625, "grad_norm": 0.10195823013782501, "kl": 0.3720350982621312, "learning_rate": 9.99999983292932e-06, "loss": -0.0047, "num_tokens": 712758.0, "reward": -0.21304789185523987, "reward_std": 0.05584158003330231, "rewards/rollout_reward_func/mean": -0.21304789185523987, "rewards/rollout_reward_func/std": 0.31514599919319153, "sampling/importance_sampling_ratio/max": 0.5960808396339417, "sampling/importance_sampling_ratio/mean": 0.44993600249290466, "sampling/importance_sampling_ratio/min": 3.7855832488276064e-05, "sampling/sampling_logp_difference/max": 4.051816940307617, "sampling/sampling_logp_difference/mean": 0.2676515281200409, "step": 55, "step_time": 5.935931921994779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 2.275217980146408, "epoch": 0.00112, "grad_norm": 0.10368601977825165, "kl": 0.3742258697748184, "learning_rate": 9.999999814880132e-06, "loss": -0.0048, "step": 56, "step_time": 3.2161756420027814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 224.03125, "completions/mean_terminated_length": 224.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.3107198774814606, "epoch": 0.00114, "frac_reward_zero_std": 0.625, "grad_norm": 0.30300799012184143, "kl": 1.3965208567678928, "learning_rate": 9.999999795905347e-06, "loss": -0.0063, "num_tokens": 736995.0, "reward": -0.09847276657819748, "reward_std": 0.052444059401750565, "rewards/rollout_reward_func/mean": -0.09847276657819748, "rewards/rollout_reward_func/std": 0.15305201709270477, "sampling/importance_sampling_ratio/max": 0.6250765323638916, "sampling/importance_sampling_ratio/mean": 0.45801815390586853, "sampling/importance_sampling_ratio/min": 3.5164142708765667e-09, "sampling/sampling_logp_difference/max": 9.024276733398438, "sampling/sampling_logp_difference/mean": 0.38153740763664246, "step": 57, "step_time": 4.980028837999271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.274234175682068, "epoch": 0.00116, "grad_norm": 0.21491846442222595, "kl": 1.0686182007193565, "learning_rate": 9.999999776004962e-06, "loss": -0.0072, "step": 58, "step_time": 3.5114306210089126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 244.0625, "completions/mean_terminated_length": 244.0625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 2.026425540447235, "epoch": 0.00118, "frac_reward_zero_std": 0.75, "grad_norm": 0.17460982501506805, "kl": 1.455301407724619, "learning_rate": 9.999999755178978e-06, "loss": -0.003, "num_tokens": 762249.0, "reward": -0.19697391986846924, "reward_std": 0.037940964102745056, "rewards/rollout_reward_func/mean": -0.19697391986846924, "rewards/rollout_reward_func/std": 0.1964397132396698, "sampling/importance_sampling_ratio/max": 0.670058012008667, "sampling/importance_sampling_ratio/mean": 0.47034597396850586, "sampling/importance_sampling_ratio/min": 0.08649313449859619, "sampling/sampling_logp_difference/max": 1.5561704635620117, "sampling/sampling_logp_difference/mean": 0.22143109142780304, "step": 59, "step_time": 4.800487477994466 } ], "logging_steps": 1.0, "max_steps": 200000, "num_input_tokens_seen": 762249, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }