{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.00225, "eval_steps": 500, "global_step": 225, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 265.75, "completions/mean_terminated_length": 265.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.740012645721436, "epoch": 1e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.019221410155296326, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0006, "num_tokens": 45751.0, "reward": 0.816877007484436, "reward_std": 1.4014036655426025, "rewards/rollout_reward_func/mean": 0.816877007484436, "rewards/rollout_reward_func/std": 1.6075319051742554, "sampling/importance_sampling_ratio/max": 0.03914691507816315, "sampling/importance_sampling_ratio/mean": 0.013615390285849571, "sampling/importance_sampling_ratio/min": 1.1552421904970122e-15, "sampling/sampling_logp_difference/max": 3.914313554763794, "sampling/sampling_logp_difference/mean": 1.6371219158172607, "step": 1, "step_time": 9.72409536699979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.740012645721436, "epoch": 2e-05, "grad_norm": 0.01974678784608841, "kl": 0.0, "learning_rate": 2.8571428571428575e-07, "loss": -0.0006, "step": 2, "step_time": 4.797613267999623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 412.5625, "completions/mean_terminated_length": 412.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.820557475090027, "epoch": 3e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.011553559452295303, "kl": 0.0009907482417474966, "learning_rate": 5.714285714285715e-07, "loss": 0.0001, "num_tokens": 99312.0, "reward": 2.8992574214935303, "reward_std": 1.8266513347625732, "rewards/rollout_reward_func/mean": 2.8992574214935303, "rewards/rollout_reward_func/std": 1.9147884845733643, "sampling/importance_sampling_ratio/max": 0.0381130687892437, "sampling/importance_sampling_ratio/mean": 0.009180868044495583, "sampling/importance_sampling_ratio/min": 9.134832647250679e-12, "sampling/sampling_logp_difference/max": 3.4724807739257812, "sampling/sampling_logp_difference/mean": 1.698885440826416, "step": 3, "step_time": 8.947379413000363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.818018198013306, "epoch": 4e-05, "grad_norm": 0.011284240521490574, "kl": 0.0009902061865432188, "learning_rate": 8.571428571428572e-07, "loss": 0.0001, "step": 4, "step_time": 5.229739129000336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 165.09375, "completions/mean_terminated_length": 165.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.794174313545227, "epoch": 5e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.028869854286313057, "kl": 0.0008135313764796592, "learning_rate": 1.142857142857143e-06, "loss": -0.0004, "num_tokens": 141029.0, "reward": 1.8564525842666626, "reward_std": 2.077150344848633, "rewards/rollout_reward_func/mean": 1.8564525842666626, "rewards/rollout_reward_func/std": 2.0850281715393066, "sampling/importance_sampling_ratio/max": 0.05348784476518631, "sampling/importance_sampling_ratio/mean": 0.017640406265854836, "sampling/importance_sampling_ratio/min": 0.00042824808042496443, "sampling/sampling_logp_difference/max": 2.332674980163574, "sampling/sampling_logp_difference/mean": 1.7570207118988037, "step": 5, "step_time": 8.192974794999373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.788055181503296, "epoch": 6e-05, "grad_norm": 0.029661983251571655, "kl": 0.0007373044900305104, "learning_rate": 1.4285714285714286e-06, "loss": -0.0005, "step": 6, "step_time": 4.266827427000862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 848.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 541.6875, "completions/mean_terminated_length": 541.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.803173184394836, "epoch": 7e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.004336625337600708, "kl": 0.0010216275259153917, "learning_rate": 1.7142857142857145e-06, "loss": 0.0002, "num_tokens": 199637.0, "reward": 1.30655038356781, "reward_std": 0.9892024993896484, "rewards/rollout_reward_func/mean": 1.30655038356781, "rewards/rollout_reward_func/std": 1.138155221939087, "sampling/importance_sampling_ratio/max": 0.028351690620183945, "sampling/importance_sampling_ratio/mean": 0.005052408203482628, "sampling/importance_sampling_ratio/min": 2.671416343005633e-15, "sampling/sampling_logp_difference/max": 4.701449394226074, "sampling/sampling_logp_difference/mean": 1.7298243045806885, "step": 7, "step_time": 9.286785637000321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.789715051651001, "epoch": 8e-05, "grad_norm": 0.004228756297379732, "kl": 0.000888259346538689, "learning_rate": 2.0000000000000003e-06, "loss": 0.0002, "step": 8, "step_time": 4.969429294999827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.03125, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 286.75, "completions/mean_terminated_length": 295.4838562011719, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.3694349527359, "epoch": 9e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.010730103589594364, "kl": 0.0009608958062017336, "learning_rate": 2.285714285714286e-06, "loss": -0.0, "num_tokens": 246380.0, "reward": 2.1979100704193115, "reward_std": 1.8867942094802856, "rewards/rollout_reward_func/mean": 2.1979100704193115, "rewards/rollout_reward_func/std": 2.1932425498962402, "sampling/importance_sampling_ratio/max": 0.03839043155312538, "sampling/importance_sampling_ratio/mean": 0.012242316268384457, "sampling/importance_sampling_ratio/min": 3.078865162819966e-08, "sampling/sampling_logp_difference/max": 3.9608242511749268, "sampling/sampling_logp_difference/mean": 1.4708642959594727, "step": 9, "step_time": 8.68651038400003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.358309984207153, "epoch": 0.0001, "grad_norm": 0.0108121233060956, "kl": 0.000685311508277664, "learning_rate": 2.571428571428571e-06, "loss": -0.0, "step": 10, "step_time": 5.445634689999224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 553.90625, "completions/mean_terminated_length": 553.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.565176486968994, "epoch": 0.00011, "frac_reward_zero_std": 0.0, "grad_norm": 0.0031674716155976057, "kl": 0.0009231339645339176, "learning_rate": 2.8571428571428573e-06, "loss": 0.0001, "num_tokens": 306137.0, "reward": 1.9738842248916626, "reward_std": 1.5114688873291016, "rewards/rollout_reward_func/mean": 1.9738842248916626, "rewards/rollout_reward_func/std": 1.8342463970184326, "sampling/importance_sampling_ratio/max": 0.02065931260585785, "sampling/importance_sampling_ratio/mean": 0.0028495141305029392, "sampling/importance_sampling_ratio/min": 5.273884899763661e-19, "sampling/sampling_logp_difference/max": 3.751443862915039, "sampling/sampling_logp_difference/mean": 1.6439871788024902, "step": 11, "step_time": 9.134554064999975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 8.561085760593414, "epoch": 0.00012, "grad_norm": 0.0026493030600249767, "kl": 0.0009400276176165789, "learning_rate": 3.142857142857143e-06, "loss": 0.0001, "step": 12, "step_time": 4.8191932119998455 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 162.4375, "completions/mean_terminated_length": 162.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.512920141220093, "epoch": 0.00013, "frac_reward_zero_std": 0.0, "grad_norm": 0.013727321289479733, "kl": 0.0018250496359542012, "learning_rate": 3.428571428571429e-06, "loss": -0.0004, "num_tokens": 349131.0, "reward": 1.47185218334198, "reward_std": 1.5472846031188965, "rewards/rollout_reward_func/mean": 1.47185218334198, "rewards/rollout_reward_func/std": 2.0390946865081787, "sampling/importance_sampling_ratio/max": 0.049390941858291626, "sampling/importance_sampling_ratio/mean": 0.020272064954042435, "sampling/importance_sampling_ratio/min": 2.633779558891547e-06, "sampling/sampling_logp_difference/max": 2.360596179962158, "sampling/sampling_logp_difference/mean": 1.5058674812316895, "step": 13, "step_time": 7.890245483999934 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "entropy": 8.52327024936676, "epoch": 0.00014, "grad_norm": 0.01371886394917965, "kl": 0.002251528945635073, "learning_rate": 3.7142857142857146e-06, "loss": -0.0004, "step": 14, "step_time": 4.277807705000669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 493.03125, "completions/mean_terminated_length": 508.4193420410156, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.769269943237305, "epoch": 0.00015, "frac_reward_zero_std": 0.0, "grad_norm": 0.004188260994851589, "kl": 0.0018109382945112884, "learning_rate": 4.000000000000001e-06, "loss": -0.0001, "num_tokens": 405372.0, "reward": 2.176278591156006, "reward_std": 1.8223553895950317, "rewards/rollout_reward_func/mean": 2.176278591156006, "rewards/rollout_reward_func/std": 1.8436557054519653, "sampling/importance_sampling_ratio/max": 0.021065089851617813, "sampling/importance_sampling_ratio/mean": 0.0036145278718322515, "sampling/importance_sampling_ratio/min": 4.2438622060991804e-13, "sampling/sampling_logp_difference/max": 3.6341652870178223, "sampling/sampling_logp_difference/mean": 1.674858808517456, "step": 15, "step_time": 9.12190689900035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.767740607261658, "epoch": 0.00016, "grad_norm": 0.004105101805180311, "kl": 0.002771631450741552, "learning_rate": 4.2857142857142855e-06, "loss": -0.0001, "step": 16, "step_time": 5.4121323870003835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 420.0, "completions/mean_terminated_length": 421.7241516113281, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.60806268453598, "epoch": 0.00017, "frac_reward_zero_std": 0.0, "grad_norm": 0.009108365513384342, "kl": 0.004436066417838447, "learning_rate": 4.571428571428572e-06, "loss": 0.0001, "num_tokens": 458204.0, "reward": 2.0131897926330566, "reward_std": 1.7921838760375977, "rewards/rollout_reward_func/mean": 2.0131897926330566, "rewards/rollout_reward_func/std": 1.917612910270691, "sampling/importance_sampling_ratio/max": 0.03794016316533089, "sampling/importance_sampling_ratio/mean": 0.00851379707455635, "sampling/importance_sampling_ratio/min": 5.970022844210435e-30, "sampling/sampling_logp_difference/max": 3.762781858444214, "sampling/sampling_logp_difference/mean": 1.7653887271881104, "step": 17, "step_time": 8.833719273000042 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "entropy": 8.605763673782349, "epoch": 0.00018, "grad_norm": 0.009003642946481705, "kl": 0.004826090880669653, "learning_rate": 4.857142857142858e-06, "loss": 0.0001, "step": 18, "step_time": 4.669556054000623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 356.1875, "completions/mean_terminated_length": 356.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.589665651321411, "epoch": 0.00019, "frac_reward_zero_std": 0.0, "grad_norm": 0.012068537063896656, "kl": 0.007034957525320351, "learning_rate": 5.142857142857142e-06, "loss": 0.0003, "num_tokens": 510714.0, "reward": 1.6357378959655762, "reward_std": 1.9768089056015015, "rewards/rollout_reward_func/mean": 1.6357378959655762, "rewards/rollout_reward_func/std": 1.9230132102966309, "sampling/importance_sampling_ratio/max": 0.0377332866191864, "sampling/importance_sampling_ratio/mean": 0.00977294985204935, "sampling/importance_sampling_ratio/min": 1.5810989850706392e-08, "sampling/sampling_logp_difference/max": 2.8323609828948975, "sampling/sampling_logp_difference/mean": 1.57611083984375, "step": 19, "step_time": 8.702261807000923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 8.588200807571411, "epoch": 0.0002, "grad_norm": 0.01012762077152729, "kl": 0.008162530430126935, "learning_rate": 5.428571428571429e-06, "loss": 0.0003, "step": 20, "step_time": 4.793898748000174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 725.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 169.8125, "completions/mean_terminated_length": 156.43333435058594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.763131499290466, "epoch": 0.00021, "frac_reward_zero_std": 0.0, "grad_norm": 0.014919566921889782, "kl": 0.021441200777189806, "learning_rate": 5.7142857142857145e-06, "loss": -0.0005, "num_tokens": 554259.0, "reward": 2.2263033390045166, "reward_std": 1.681884765625, "rewards/rollout_reward_func/mean": 2.2263033390045166, "rewards/rollout_reward_func/std": 1.8072566986083984, "sampling/importance_sampling_ratio/max": 0.08593336492776871, "sampling/importance_sampling_ratio/mean": 0.022409576922655106, "sampling/importance_sampling_ratio/min": 2.349876534956627e-22, "sampling/sampling_logp_difference/max": 4.372560501098633, "sampling/sampling_logp_difference/mean": 1.931687831878662, "step": 21, "step_time": 8.055510063999009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.737543225288391, "epoch": 0.00022, "grad_norm": 0.014876801520586014, "kl": 0.029705224180361256, "learning_rate": 6e-06, "loss": -0.0005, "step": 22, "step_time": 5.419155312999919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 758.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 182.1875, "completions/mean_terminated_length": 171.50001525878906, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.859729766845703, "epoch": 0.00023, "frac_reward_zero_std": 0.0, "grad_norm": 0.029221149161458015, "kl": 0.0323223132872954, "learning_rate": 6.285714285714286e-06, "loss": -0.0011, "num_tokens": 596571.0, "reward": 1.5583102703094482, "reward_std": 1.6529381275177002, "rewards/rollout_reward_func/mean": 1.5583102703094482, "rewards/rollout_reward_func/std": 1.7341761589050293, "sampling/importance_sampling_ratio/max": 0.06399935483932495, "sampling/importance_sampling_ratio/mean": 0.025177521631121635, "sampling/importance_sampling_ratio/min": 1.0080106696608216e-18, "sampling/sampling_logp_difference/max": 3.969541072845459, "sampling/sampling_logp_difference/mean": 1.8030247688293457, "step": 23, "step_time": 8.169906658999935 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 8.80251955986023, "epoch": 0.00024, "grad_norm": 0.029200905933976173, "kl": 0.04814133094623685, "learning_rate": 6.571428571428572e-06, "loss": -0.0012, "step": 24, "step_time": 4.54475868199961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 517.65625, "completions/mean_terminated_length": 530.7333374023438, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.177129030227661, "epoch": 0.00025, "frac_reward_zero_std": 0.0, "grad_norm": 0.008773105219006538, "kl": 0.03207368147559464, "learning_rate": 6.857142857142858e-06, "loss": -0.0007, "num_tokens": 653422.0, "reward": 2.0930328369140625, "reward_std": 1.470797061920166, "rewards/rollout_reward_func/mean": 2.0930328369140625, "rewards/rollout_reward_func/std": 1.5851061344146729, "sampling/importance_sampling_ratio/max": 0.08151775598526001, "sampling/importance_sampling_ratio/mean": 0.01129196584224701, "sampling/importance_sampling_ratio/min": 2.138438081125682e-13, "sampling/sampling_logp_difference/max": 3.773285388946533, "sampling/sampling_logp_difference/mean": 1.4078741073608398, "step": 25, "step_time": 9.088412857000094 }, { "clip_ratio/high_max": 0.043181818444281816, "clip_ratio/high_mean": 0.021590909222140908, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021590909222140908, "entropy": 8.089222967624664, "epoch": 0.00026, "grad_norm": 0.008307461626827717, "kl": 0.04364914959296584, "learning_rate": 7.1428571428571436e-06, "loss": -0.0007, "step": 26, "step_time": 4.877906865999648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 309.9375, "completions/mean_terminated_length": 309.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.212523341178894, "epoch": 0.00027, "frac_reward_zero_std": 0.0, "grad_norm": 0.014188895933330059, "kl": 0.0853999936953187, "learning_rate": 7.428571428571429e-06, "loss": -0.001, "num_tokens": 702104.0, "reward": 1.7692888975143433, "reward_std": 1.6422841548919678, "rewards/rollout_reward_func/mean": 1.7692888975143433, "rewards/rollout_reward_func/std": 1.989976406097412, "sampling/importance_sampling_ratio/max": 0.11609657108783722, "sampling/importance_sampling_ratio/mean": 0.027356663718819618, "sampling/importance_sampling_ratio/min": 9.024407518154476e-06, "sampling/sampling_logp_difference/max": 2.4522972106933594, "sampling/sampling_logp_difference/mean": 1.4562647342681885, "step": 27, "step_time": 9.140366876000371 }, { "clip_ratio/high_max": 0.12500000186264515, "clip_ratio/high_mean": 0.06250000093132257, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06250000093132257, "entropy": 8.088905096054077, "epoch": 0.00028, "grad_norm": 0.014231563545763493, "kl": 0.10766742378473282, "learning_rate": 7.714285714285716e-06, "loss": -0.0011, "step": 28, "step_time": 5.39836863499977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 254.8125, "completions/mean_terminated_length": 262.51611328125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.048985123634338, "epoch": 0.00029, "frac_reward_zero_std": 0.0, "grad_norm": 0.015394963324069977, "kl": 0.1432503336109221, "learning_rate": 8.000000000000001e-06, "loss": -0.001, "num_tokens": 749391.0, "reward": 1.818474292755127, "reward_std": 1.1649471521377563, "rewards/rollout_reward_func/mean": 1.818474292755127, "rewards/rollout_reward_func/std": 1.8718231916427612, "sampling/importance_sampling_ratio/max": 0.130377858877182, "sampling/importance_sampling_ratio/mean": 0.036613546311855316, "sampling/importance_sampling_ratio/min": 1.2765659362923287e-10, "sampling/sampling_logp_difference/max": 3.609269380569458, "sampling/sampling_logp_difference/mean": 1.4590439796447754, "step": 29, "step_time": 8.3333206719999 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031250000931322575, "entropy": 7.883777499198914, "epoch": 0.0003, "grad_norm": 0.01582338474690914, "kl": 0.17406905256211758, "learning_rate": 8.285714285714287e-06, "loss": -0.0011, "step": 30, "step_time": 4.778778166000393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 541.90625, "completions/mean_terminated_length": 558.8709716796875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.091109991073608, "epoch": 0.00031, "frac_reward_zero_std": 0.0, "grad_norm": 0.021641016006469727, "kl": 0.12537556886672974, "learning_rate": 8.571428571428571e-06, "loss": 0.0003, "num_tokens": 807902.0, "reward": 1.436830997467041, "reward_std": 1.0506994724273682, "rewards/rollout_reward_func/mean": 1.436830997467041, "rewards/rollout_reward_func/std": 1.2816261053085327, "sampling/importance_sampling_ratio/max": 0.07519304007291794, "sampling/importance_sampling_ratio/mean": 0.013723745942115784, "sampling/importance_sampling_ratio/min": 1.7366062713998758e-16, "sampling/sampling_logp_difference/max": 5.218207359313965, "sampling/sampling_logp_difference/mean": 1.5592684745788574, "step": 31, "step_time": 9.165752515999884 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 8.013701796531677, "epoch": 0.00032, "grad_norm": 0.021766290068626404, "kl": 0.1357055138796568, "learning_rate": 8.857142857142858e-06, "loss": 0.0003, "step": 32, "step_time": 4.949108714999966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 663.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 355.46875, "completions/mean_terminated_length": 337.3000183105469, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.517293989658356, "epoch": 0.00033, "frac_reward_zero_std": 0.0, "grad_norm": 0.017694007605314255, "kl": 0.20308297593146563, "learning_rate": 9.142857142857144e-06, "loss": -0.0019, "num_tokens": 859075.0, "reward": 2.665842056274414, "reward_std": 2.0510294437408447, "rewards/rollout_reward_func/mean": 2.665842056274414, "rewards/rollout_reward_func/std": 2.058197498321533, "sampling/importance_sampling_ratio/max": 0.1805843561887741, "sampling/importance_sampling_ratio/mean": 0.033763326704502106, "sampling/importance_sampling_ratio/min": 9.382639013877456e-15, "sampling/sampling_logp_difference/max": 4.034926891326904, "sampling/sampling_logp_difference/mean": 1.42521071434021, "step": 33, "step_time": 9.64415260199985 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 7.461727142333984, "epoch": 0.00034, "grad_norm": 0.01653335802257061, "kl": 0.21375709865242243, "learning_rate": 9.42857142857143e-06, "loss": -0.002, "step": 34, "step_time": 4.6672892820001834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 287.5, "completions/mean_terminated_length": 272.1612854003906, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.252508640289307, "epoch": 0.00035, "frac_reward_zero_std": 0.0, "grad_norm": 0.04618426784873009, "kl": 0.294549023732543, "learning_rate": 9.714285714285715e-06, "loss": -0.001, "num_tokens": 908468.0, "reward": 1.9603195190429688, "reward_std": 1.7872586250305176, "rewards/rollout_reward_func/mean": 1.9603195190429688, "rewards/rollout_reward_func/std": 1.841855764389038, "sampling/importance_sampling_ratio/max": 0.19450248777866364, "sampling/importance_sampling_ratio/mean": 0.0430486798286438, "sampling/importance_sampling_ratio/min": 1.1371217567557323e-07, "sampling/sampling_logp_difference/max": 3.5069642066955566, "sampling/sampling_logp_difference/mean": 1.187075138092041, "step": 35, "step_time": 8.733247314000437 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026041666977107525, "entropy": 7.227072596549988, "epoch": 0.00036, "grad_norm": 0.0316547267138958, "kl": 0.26715745590627193, "learning_rate": 1e-05, "loss": -0.0011, "step": 36, "step_time": 4.828337421000015 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014756944496184587, "completions/clipped_ratio": 0.03125, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 390.0, "completions/mean_terminated_length": 402.06451416015625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.85473507642746, "epoch": 0.00037, "frac_reward_zero_std": 0.0, "grad_norm": 0.01456042192876339, "kl": 0.20423047989606857, "learning_rate": 9.999999999884322e-06, "loss": -0.0029, "num_tokens": 961096.0, "reward": 3.3347973823547363, "reward_std": 1.635354995727539, "rewards/rollout_reward_func/mean": 3.3347973823547363, "rewards/rollout_reward_func/std": 1.591873288154602, "sampling/importance_sampling_ratio/max": 0.2037617266178131, "sampling/importance_sampling_ratio/mean": 0.04216703772544861, "sampling/importance_sampling_ratio/min": 3.60183348667997e-18, "sampling/sampling_logp_difference/max": 4.788333415985107, "sampling/sampling_logp_difference/mean": 1.3821991682052612, "step": 37, "step_time": 8.829268903999946 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014756944496184587, "entropy": 6.8267329931259155, "epoch": 0.00038, "grad_norm": 0.012915832921862602, "kl": 0.2027184907346964, "learning_rate": 9.999999999537282e-06, "loss": -0.003, "step": 38, "step_time": 5.279609900000196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 292.125, "completions/mean_terminated_length": 291.9666748046875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.531160056591034, "epoch": 0.00039, "frac_reward_zero_std": 0.0, "grad_norm": 0.015933845192193985, "kl": 0.2402525246143341, "learning_rate": 9.999999998958884e-06, "loss": -0.0013, "num_tokens": 1010524.0, "reward": 1.7904164791107178, "reward_std": 1.6788225173950195, "rewards/rollout_reward_func/mean": 1.7904164791107178, "rewards/rollout_reward_func/std": 1.8581712245941162, "sampling/importance_sampling_ratio/max": 0.042524565011262894, "sampling/importance_sampling_ratio/mean": 0.021317776292562485, "sampling/importance_sampling_ratio/min": 5.162857177539051e-24, "sampling/sampling_logp_difference/max": 12.552388191223145, "sampling/sampling_logp_difference/mean": 1.622736930847168, "step": 39, "step_time": 9.170014825000635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.470682978630066, "epoch": 0.0004, "grad_norm": 0.014897222630679607, "kl": 0.2379161100834608, "learning_rate": 9.999999998149125e-06, "loss": -0.0013, "step": 40, "step_time": 4.626162753000699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.03125, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 588.96875, "completions/mean_terminated_length": 594.258056640625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.788816154003143, "epoch": 0.00041, "frac_reward_zero_std": 0.0, "grad_norm": 0.020633654668927193, "kl": 0.17094908049330115, "learning_rate": 9.99999999710801e-06, "loss": -0.0025, "num_tokens": 1070581.0, "reward": 2.306253433227539, "reward_std": 1.3609917163848877, "rewards/rollout_reward_func/mean": 2.306253433227539, "rewards/rollout_reward_func/std": 1.8414863348007202, "sampling/importance_sampling_ratio/max": 0.05647118017077446, "sampling/importance_sampling_ratio/mean": 0.019686147570610046, "sampling/importance_sampling_ratio/min": 8.113022520378068e-17, "sampling/sampling_logp_difference/max": 5.106669902801514, "sampling/sampling_logp_difference/mean": 1.3194385766983032, "step": 41, "step_time": 9.059421735999877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 6.697398841381073, "epoch": 0.00042, "grad_norm": 0.012766940519213676, "kl": 0.16542547149583697, "learning_rate": 9.999999995835533e-06, "loss": -0.0026, "step": 42, "step_time": 4.873758401001396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 368.4375, "completions/mean_terminated_length": 368.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.104367315769196, "epoch": 0.00043, "frac_reward_zero_std": 0.25, "grad_norm": 0.014938808046281338, "kl": 0.23283970914781094, "learning_rate": 9.999999994331697e-06, "loss": -0.0029, "num_tokens": 1121352.0, "reward": 2.358142614364624, "reward_std": 1.2651467323303223, "rewards/rollout_reward_func/mean": 2.358142614364624, "rewards/rollout_reward_func/std": 1.9181171655654907, "sampling/importance_sampling_ratio/max": 0.26942965388298035, "sampling/importance_sampling_ratio/mean": 0.06439891457557678, "sampling/importance_sampling_ratio/min": 0.00032740956521593034, "sampling/sampling_logp_difference/max": 2.4414572715759277, "sampling/sampling_logp_difference/mean": 1.2698404788970947, "step": 43, "step_time": 8.609952049000185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.033655643463135, "epoch": 0.00044, "grad_norm": 0.014094019308686256, "kl": 0.23375796806067228, "learning_rate": 9.999999992596503e-06, "loss": -0.003, "step": 44, "step_time": 5.138805102998958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0625, "completions/max_length": 769.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 323.90625, "completions/mean_terminated_length": 315.8333435058594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.381109952926636, "epoch": 0.00045, "frac_reward_zero_std": 0.0, "grad_norm": 0.015758151188492775, "kl": 0.1762851346284151, "learning_rate": 9.999999990629948e-06, "loss": -0.0067, "num_tokens": 1168932.0, "reward": 2.3062658309936523, "reward_std": 1.6873681545257568, "rewards/rollout_reward_func/mean": 2.3062658309936523, "rewards/rollout_reward_func/std": 2.015537738800049, "sampling/importance_sampling_ratio/max": 0.28785669803619385, "sampling/importance_sampling_ratio/mean": 0.07849664986133575, "sampling/importance_sampling_ratio/min": 1.9031297972719374e-18, "sampling/sampling_logp_difference/max": 4.624307155609131, "sampling/sampling_logp_difference/mean": 1.4169467687606812, "step": 45, "step_time": 9.268211923999843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.324137270450592, "epoch": 0.00046, "grad_norm": 0.016143618151545525, "kl": 0.17720989137887955, "learning_rate": 9.999999988432035e-06, "loss": -0.0069, "step": 46, "step_time": 4.723417413000334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 289.6875, "completions/mean_terminated_length": 289.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.798922121524811, "epoch": 0.00047, "frac_reward_zero_std": 0.0, "grad_norm": 0.022382335737347603, "kl": 0.45002966932952404, "learning_rate": 9.999999986002761e-06, "loss": -0.0059, "num_tokens": 1217968.0, "reward": 1.891412377357483, "reward_std": 1.739563226699829, "rewards/rollout_reward_func/mean": 1.891412377357483, "rewards/rollout_reward_func/std": 2.1437840461730957, "sampling/importance_sampling_ratio/max": 0.09762566536664963, "sampling/importance_sampling_ratio/mean": 0.04518824815750122, "sampling/importance_sampling_ratio/min": 4.220652438657879e-10, "sampling/sampling_logp_difference/max": 4.6860880851745605, "sampling/sampling_logp_difference/mean": 1.201680064201355, "step": 47, "step_time": 8.263996364999457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 6.750116407871246, "epoch": 0.00048, "grad_norm": 0.015847016125917435, "kl": 0.3995134783908725, "learning_rate": 9.999999983342127e-06, "loss": -0.0061, "step": 48, "step_time": 4.558645490999879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 758.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 548.90625, "completions/mean_terminated_length": 548.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.013215720653534, "epoch": 0.00049, "frac_reward_zero_std": 0.0, "grad_norm": 0.021171115338802338, "kl": 0.3200267134234309, "learning_rate": 9.999999980450137e-06, "loss": -0.0033, "num_tokens": 1277645.0, "reward": 2.5847978591918945, "reward_std": 0.8504736423492432, "rewards/rollout_reward_func/mean": 2.5847978591918945, "rewards/rollout_reward_func/std": 2.012620687484741, "sampling/importance_sampling_ratio/max": 0.10617782175540924, "sampling/importance_sampling_ratio/mean": 0.057969365268945694, "sampling/importance_sampling_ratio/min": 0.0006385967135429382, "sampling/sampling_logp_difference/max": 2.5639331340789795, "sampling/sampling_logp_difference/mean": 0.9616080522537231, "step": 49, "step_time": 8.934477900000275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.897445023059845, "epoch": 0.0005, "grad_norm": 0.018860990181565285, "kl": 0.33296194672584534, "learning_rate": 9.999999977326787e-06, "loss": -0.0034, "step": 50, "step_time": 5.351620988000377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 537.46875, "completions/mean_terminated_length": 537.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.633717179298401, "epoch": 0.00051, "frac_reward_zero_std": 0.0, "grad_norm": 0.01536987628787756, "kl": 0.2537247408181429, "learning_rate": 9.999999973972076e-06, "loss": -0.0022, "num_tokens": 1337256.0, "reward": 2.237421989440918, "reward_std": 1.3681035041809082, "rewards/rollout_reward_func/mean": 2.237421989440918, "rewards/rollout_reward_func/std": 1.740581750869751, "sampling/importance_sampling_ratio/max": 0.1205218955874443, "sampling/importance_sampling_ratio/mean": 0.04655706137418747, "sampling/importance_sampling_ratio/min": 0.001190877752378583, "sampling/sampling_logp_difference/max": 2.396751880645752, "sampling/sampling_logp_difference/mean": 1.094531774520874, "step": 51, "step_time": 9.431871319000038 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 6.529919326305389, "epoch": 0.00052, "grad_norm": 0.010579893365502357, "kl": 0.26615126617252827, "learning_rate": 9.999999970386004e-06, "loss": -0.0023, "step": 52, "step_time": 4.8396601220001685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 324.125, "completions/mean_terminated_length": 324.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.771689236164093, "epoch": 0.00053, "frac_reward_zero_std": 0.0, "grad_norm": 0.015091736800968647, "kl": 0.2611819123849273, "learning_rate": 9.999999966568576e-06, "loss": -0.011, "num_tokens": 1386134.0, "reward": 2.217728853225708, "reward_std": 2.024404287338257, "rewards/rollout_reward_func/mean": 2.217728853225708, "rewards/rollout_reward_func/std": 2.3452394008636475, "sampling/importance_sampling_ratio/max": 0.3583432734012604, "sampling/importance_sampling_ratio/mean": 0.08469430357217789, "sampling/importance_sampling_ratio/min": 1.4002454964024292e-14, "sampling/sampling_logp_difference/max": 4.249520301818848, "sampling/sampling_logp_difference/mean": 1.3450038433074951, "step": 53, "step_time": 8.218689654000627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.760413706302643, "epoch": 0.00054, "grad_norm": 0.015123301185667515, "kl": 0.26832089852541685, "learning_rate": 9.999999962519787e-06, "loss": -0.0111, "step": 54, "step_time": 4.458178430999396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.03125, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 401.15625, "completions/mean_terminated_length": 393.1290283203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.331356406211853, "epoch": 0.00055, "frac_reward_zero_std": 0.0, "grad_norm": 0.07398711144924164, "kl": 0.45008302945643663, "learning_rate": 9.999999958239642e-06, "loss": -0.0078, "num_tokens": 1439006.0, "reward": 1.4235717058181763, "reward_std": 1.2011210918426514, "rewards/rollout_reward_func/mean": 1.4235717058181763, "rewards/rollout_reward_func/std": 1.8165862560272217, "sampling/importance_sampling_ratio/max": 0.37091660499572754, "sampling/importance_sampling_ratio/mean": 0.08141454309225082, "sampling/importance_sampling_ratio/min": 5.328838854689677e-16, "sampling/sampling_logp_difference/max": 6.0843353271484375, "sampling/sampling_logp_difference/mean": 1.2536330223083496, "step": 55, "step_time": 8.796601195999756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 6.339613497257233, "epoch": 0.00056, "grad_norm": 0.026661040261387825, "kl": 0.4182750675827265, "learning_rate": 9.999999953728133e-06, "loss": -0.0081, "step": 56, "step_time": 5.088884858999336 }, { "clip_ratio/high_max": 0.02025462966412306, "clip_ratio/high_mean": 0.01012731483206153, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01012731483206153, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 436.9375, "completions/mean_terminated_length": 436.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.944717228412628, "epoch": 0.00057, "frac_reward_zero_std": 0.0, "grad_norm": 0.01468927413225174, "kl": 0.3506404645740986, "learning_rate": 9.999999948985266e-06, "loss": -0.0092, "num_tokens": 1493962.0, "reward": 1.8320198059082031, "reward_std": 0.8225011825561523, "rewards/rollout_reward_func/mean": 1.8320198059082031, "rewards/rollout_reward_func/std": 1.2535020112991333, "sampling/importance_sampling_ratio/max": 0.3948986232280731, "sampling/importance_sampling_ratio/mean": 0.11361236125230789, "sampling/importance_sampling_ratio/min": 1.1934993257971545e-26, "sampling/sampling_logp_difference/max": 11.387396812438965, "sampling/sampling_logp_difference/mean": 1.256805658340454, "step": 57, "step_time": 9.449572570999862 }, { "clip_ratio/high_max": 0.004629629664123058, "clip_ratio/high_mean": 0.002314814832061529, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002314814832061529, "entropy": 5.935160547494888, "epoch": 0.00058, "grad_norm": 0.015919912606477737, "kl": 0.3587344065308571, "learning_rate": 9.99999994401104e-06, "loss": -0.0093, "step": 58, "step_time": 4.746457023000403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 480.59375, "completions/mean_terminated_length": 480.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.848488986492157, "epoch": 0.00059, "frac_reward_zero_std": 0.25, "grad_norm": 0.011293532326817513, "kl": 0.3660230152308941, "learning_rate": 9.999999938805455e-06, "loss": -0.0115, "num_tokens": 1551299.0, "reward": 2.3304104804992676, "reward_std": 1.6922229528427124, "rewards/rollout_reward_func/mean": 2.3304104804992676, "rewards/rollout_reward_func/std": 1.9971063137054443, "sampling/importance_sampling_ratio/max": 0.1643364131450653, "sampling/importance_sampling_ratio/mean": 0.08756053447723389, "sampling/importance_sampling_ratio/min": 0.0002673097769729793, "sampling/sampling_logp_difference/max": 2.872664213180542, "sampling/sampling_logp_difference/mean": 0.9991017580032349, "step": 59, "step_time": 8.483788936999645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.8199068903923035, "epoch": 0.0006, "grad_norm": 0.009261633269488811, "kl": 0.3585043679922819, "learning_rate": 9.999999933368511e-06, "loss": -0.0116, "step": 60, "step_time": 4.707603984000343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 475.6875, "completions/mean_terminated_length": 470.774169921875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.473706513643265, "epoch": 0.00061, "frac_reward_zero_std": 0.0, "grad_norm": 0.013257281854748726, "kl": 0.32057628221809864, "learning_rate": 9.999999927700208e-06, "loss": -0.0136, "num_tokens": 1608125.0, "reward": 3.2187983989715576, "reward_std": 1.597353458404541, "rewards/rollout_reward_func/mean": 3.2187983989715576, "rewards/rollout_reward_func/std": 2.062941551208496, "sampling/importance_sampling_ratio/max": 0.17399519681930542, "sampling/importance_sampling_ratio/mean": 0.09229454398155212, "sampling/importance_sampling_ratio/min": 5.896088738771565e-13, "sampling/sampling_logp_difference/max": 4.472219944000244, "sampling/sampling_logp_difference/mean": 1.0552775859832764, "step": 61, "step_time": 9.438191732000632 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 5.456495136022568, "epoch": 0.00062, "grad_norm": 0.01240911427885294, "kl": 0.32513533532619476, "learning_rate": 9.999999921800544e-06, "loss": -0.0137, "step": 62, "step_time": 4.807036896999307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 240.96875, "completions/mean_terminated_length": 226.9677276611328, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.3534833788871765, "epoch": 0.00063, "frac_reward_zero_std": 0.0, "grad_norm": 0.01809127815067768, "kl": 0.36241104267537594, "learning_rate": 9.999999915669521e-06, "loss": -0.0133, "num_tokens": 1652976.0, "reward": 2.7536964416503906, "reward_std": 1.9717084169387817, "rewards/rollout_reward_func/mean": 2.7536964416503906, "rewards/rollout_reward_func/std": 2.0205845832824707, "sampling/importance_sampling_ratio/max": 0.43795350193977356, "sampling/importance_sampling_ratio/mean": 0.12350660562515259, "sampling/importance_sampling_ratio/min": 5.083219400958683e-10, "sampling/sampling_logp_difference/max": 3.9410667419433594, "sampling/sampling_logp_difference/mean": 1.179007887840271, "step": 63, "step_time": 8.806214823998744 }, { "clip_ratio/high_max": 0.07859848625957966, "clip_ratio/high_mean": 0.03929924312978983, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03929924312978983, "entropy": 6.309255123138428, "epoch": 0.00064, "grad_norm": 0.013196753337979317, "kl": 0.3604668825864792, "learning_rate": 9.99999990930714e-06, "loss": -0.0134, "step": 64, "step_time": 4.585649621999437 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 520.15625, "completions/mean_terminated_length": 520.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.630438059568405, "epoch": 0.00065, "frac_reward_zero_std": 0.0, "grad_norm": 0.009761711582541466, "kl": 0.25976699963212013, "learning_rate": 9.999999902713398e-06, "loss": -0.0096, "num_tokens": 1710411.0, "reward": 2.439373731613159, "reward_std": 1.6874518394470215, "rewards/rollout_reward_func/mean": 2.439373731613159, "rewards/rollout_reward_func/std": 1.8576425313949585, "sampling/importance_sampling_ratio/max": 0.1951960325241089, "sampling/importance_sampling_ratio/mean": 0.06827103346586227, "sampling/importance_sampling_ratio/min": 1.1635305696700016e-07, "sampling/sampling_logp_difference/max": 4.405527591705322, "sampling/sampling_logp_difference/mean": 1.1796586513519287, "step": 65, "step_time": 8.827101629000026 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 6.571233093738556, "epoch": 0.00066, "grad_norm": 0.008613799698650837, "kl": 0.2607234949246049, "learning_rate": 9.999999895888298e-06, "loss": -0.0096, "step": 66, "step_time": 4.809835060999376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 301.1875, "completions/mean_terminated_length": 301.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.944601535797119, "epoch": 0.00067, "frac_reward_zero_std": 0.0, "grad_norm": 0.012496919371187687, "kl": 0.4631600920110941, "learning_rate": 9.99999988883184e-06, "loss": -0.0128, "num_tokens": 1759738.0, "reward": 1.9331152439117432, "reward_std": 1.1334162950515747, "rewards/rollout_reward_func/mean": 1.9331152439117432, "rewards/rollout_reward_func/std": 2.0543787479400635, "sampling/importance_sampling_ratio/max": 0.46345254778862, "sampling/importance_sampling_ratio/mean": 0.1404380202293396, "sampling/importance_sampling_ratio/min": 5.890969418942404e-07, "sampling/sampling_logp_difference/max": 4.386819839477539, "sampling/sampling_logp_difference/mean": 1.1129919290542603, "step": 67, "step_time": 8.765856677001011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 5.895752668380737, "epoch": 0.00068, "grad_norm": 0.012568553909659386, "kl": 0.46327478997409344, "learning_rate": 9.999999881544019e-06, "loss": -0.0128, "step": 68, "step_time": 5.110902637000436 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 452.34375, "completions/mean_terminated_length": 452.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.041722625494003, "epoch": 0.00069, "frac_reward_zero_std": 0.25, "grad_norm": 0.07526283711194992, "kl": 0.6064412295818329, "learning_rate": 9.999999874024841e-06, "loss": -0.0146, "num_tokens": 1814594.0, "reward": 3.077542781829834, "reward_std": 1.2306993007659912, "rewards/rollout_reward_func/mean": 3.077542781829834, "rewards/rollout_reward_func/std": 1.810649037361145, "sampling/importance_sampling_ratio/max": 0.4706134796142578, "sampling/importance_sampling_ratio/mean": 0.165305495262146, "sampling/importance_sampling_ratio/min": 0.0011951870983466506, "sampling/sampling_logp_difference/max": 2.497037887573242, "sampling/sampling_logp_difference/mean": 0.7777004837989807, "step": 69, "step_time": 8.851037519998954 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 4.988219231367111, "epoch": 0.0007, "grad_norm": 0.20997123420238495, "kl": 0.8107541762292385, "learning_rate": 9.999999866274303e-06, "loss": -0.0143, "step": 70, "step_time": 4.902573127999858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 275.4375, "completions/mean_terminated_length": 265.8709716796875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.022567570209503, "epoch": 0.00071, "frac_reward_zero_std": 0.0, "grad_norm": 0.02304830029606819, "kl": 0.34742444939911366, "learning_rate": 9.999999858292407e-06, "loss": -0.0139, "num_tokens": 1861821.0, "reward": 2.4483556747436523, "reward_std": 1.4657219648361206, "rewards/rollout_reward_func/mean": 2.4483556747436523, "rewards/rollout_reward_func/std": 1.791189432144165, "sampling/importance_sampling_ratio/max": 0.47685861587524414, "sampling/importance_sampling_ratio/mean": 0.16110098361968994, "sampling/importance_sampling_ratio/min": 1.8858786060560462e-11, "sampling/sampling_logp_difference/max": 4.634000301361084, "sampling/sampling_logp_difference/mean": 1.0669944286346436, "step": 71, "step_time": 7.997900912999285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.069274663925171, "epoch": 0.00072, "grad_norm": 0.02166938968002796, "kl": 0.34302423894405365, "learning_rate": 9.99999985007915e-06, "loss": -0.0139, "step": 72, "step_time": 4.490313349000189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 488.6875, "completions/mean_terminated_length": 503.9354553222656, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.873573303222656, "epoch": 0.00073, "frac_reward_zero_std": 0.0, "grad_norm": 0.01142603438347578, "kl": 0.3447978002950549, "learning_rate": 9.999999841634535e-06, "loss": -0.0091, "num_tokens": 1916473.0, "reward": 2.781144857406616, "reward_std": 0.9423757791519165, "rewards/rollout_reward_func/mean": 2.781144857406616, "rewards/rollout_reward_func/std": 1.5482388734817505, "sampling/importance_sampling_ratio/max": 0.4879491329193115, "sampling/importance_sampling_ratio/mean": 0.12894627451896667, "sampling/importance_sampling_ratio/min": 5.110472808822486e-12, "sampling/sampling_logp_difference/max": 3.6776225566864014, "sampling/sampling_logp_difference/mean": 1.0262196063995361, "step": 73, "step_time": 8.777917193999201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.9237218499183655, "epoch": 0.00074, "grad_norm": 0.011944163590669632, "kl": 0.3441983833909035, "learning_rate": 9.99999983295856e-06, "loss": -0.0091, "step": 74, "step_time": 5.257215922000341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 217.28125, "completions/mean_terminated_length": 206.09677124023438, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.533183515071869, "epoch": 0.00075, "frac_reward_zero_std": 0.5, "grad_norm": 0.04438586160540581, "kl": 0.6188310664147139, "learning_rate": 9.999999824051225e-06, "loss": -0.0088, "num_tokens": 1961274.0, "reward": 2.8512258529663086, "reward_std": 0.732064962387085, "rewards/rollout_reward_func/mean": 2.8512258529663086, "rewards/rollout_reward_func/std": 1.670456051826477, "sampling/importance_sampling_ratio/max": 0.49958670139312744, "sampling/importance_sampling_ratio/mean": 0.21123701333999634, "sampling/importance_sampling_ratio/min": 3.237672987783241e-14, "sampling/sampling_logp_difference/max": 5.040909767150879, "sampling/sampling_logp_difference/mean": 1.135288953781128, "step": 75, "step_time": 7.92215164100071 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 5.562716752290726, "epoch": 0.00076, "grad_norm": 0.02424151450395584, "kl": 0.565250052139163, "learning_rate": 9.999999814912531e-06, "loss": -0.0089, "step": 76, "step_time": 4.395662217000336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 291.0625, "completions/mean_terminated_length": 289.93548583984375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.875417947769165, "epoch": 0.00077, "frac_reward_zero_std": 0.0, "grad_norm": 0.015563178807497025, "kl": 0.41501787677407265, "learning_rate": 9.999999805542478e-06, "loss": -0.0118, "num_tokens": 2010634.0, "reward": 1.0790379047393799, "reward_std": 1.1985231637954712, "rewards/rollout_reward_func/mean": 1.0790379047393799, "rewards/rollout_reward_func/std": 1.5060786008834839, "sampling/importance_sampling_ratio/max": 0.5051907300949097, "sampling/importance_sampling_ratio/mean": 0.1884518414735794, "sampling/importance_sampling_ratio/min": 1.3433022472142397e-09, "sampling/sampling_logp_difference/max": 10.98376750946045, "sampling/sampling_logp_difference/mean": 1.1490110158920288, "step": 77, "step_time": 8.915195299999596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 5.885697066783905, "epoch": 0.00078, "grad_norm": 0.014125452376902103, "kl": 0.4149230867624283, "learning_rate": 9.999999795941065e-06, "loss": -0.0119, "step": 78, "step_time": 5.149517803999061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 291.9375, "completions/mean_terminated_length": 300.8387145996094, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.519651710987091, "epoch": 0.00079, "frac_reward_zero_std": 0.0, "grad_norm": 0.10713054239749908, "kl": 0.5323284231126308, "learning_rate": 9.999999786108293e-06, "loss": -0.0227, "num_tokens": 2059737.0, "reward": 1.5329793691635132, "reward_std": 0.5013623237609863, "rewards/rollout_reward_func/mean": 1.5329793691635132, "rewards/rollout_reward_func/std": 1.3323529958724976, "sampling/importance_sampling_ratio/max": 0.5639442801475525, "sampling/importance_sampling_ratio/mean": 0.25227874517440796, "sampling/importance_sampling_ratio/min": 2.7577478468139224e-14, "sampling/sampling_logp_difference/max": 3.539957046508789, "sampling/sampling_logp_difference/mean": 1.098191261291504, "step": 79, "step_time": 8.723111569000139 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.00631313119083643, "clip_ratio/low_min": 0.005681818351149559, "clip_ratio/region_mean": 0.01412563119083643, "entropy": 5.470509052276611, "epoch": 0.0008, "grad_norm": 0.06396406888961792, "kl": 0.5383136495947838, "learning_rate": 9.999999776044163e-06, "loss": -0.0233, "step": 80, "step_time": 4.934400118000212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 384.1875, "completions/mean_terminated_length": 371.0322570800781, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.105532318353653, "epoch": 0.00081, "frac_reward_zero_std": 0.25, "grad_norm": 0.014338403008878231, "kl": 0.5314097702503204, "learning_rate": 9.999999765748672e-06, "loss": -0.0119, "num_tokens": 2109547.0, "reward": 3.0377540588378906, "reward_std": 1.2538565397262573, "rewards/rollout_reward_func/mean": 3.0377540588378906, "rewards/rollout_reward_func/std": 1.6159894466400146, "sampling/importance_sampling_ratio/max": 0.5120093822479248, "sampling/importance_sampling_ratio/mean": 0.18446674942970276, "sampling/importance_sampling_ratio/min": 3.6131722613852446e-11, "sampling/sampling_logp_difference/max": 3.793142080307007, "sampling/sampling_logp_difference/mean": 0.8700344562530518, "step": 81, "step_time": 8.634114757999669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 5.07496240735054, "epoch": 0.00082, "grad_norm": 0.012873583473265171, "kl": 0.5282706655561924, "learning_rate": 9.999999755221823e-06, "loss": -0.012, "step": 82, "step_time": 4.737106287001097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 284.125, "completions/mean_terminated_length": 284.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.206637322902679, "epoch": 0.00083, "frac_reward_zero_std": 0.0, "grad_norm": 0.02513122372329235, "kl": 0.5926351137459278, "learning_rate": 9.999999744463613e-06, "loss": -0.0183, "num_tokens": 2156768.0, "reward": 2.840292453765869, "reward_std": 1.3401542901992798, "rewards/rollout_reward_func/mean": 2.840292453765869, "rewards/rollout_reward_func/std": 1.8102028369903564, "sampling/importance_sampling_ratio/max": 0.520283579826355, "sampling/importance_sampling_ratio/mean": 0.2087681144475937, "sampling/importance_sampling_ratio/min": 2.49877535329901e-10, "sampling/sampling_logp_difference/max": 4.733933448791504, "sampling/sampling_logp_difference/mean": 0.9762767553329468, "step": 83, "step_time": 8.479427639000278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.208840876817703, "epoch": 0.00084, "grad_norm": 0.03026541694998741, "kl": 0.6035371646285057, "learning_rate": 9.999999733474045e-06, "loss": -0.0183, "step": 84, "step_time": 5.254075981999904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 848.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 450.0, "completions/mean_terminated_length": 464.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.834830671548843, "epoch": 0.00085, "frac_reward_zero_std": 0.0, "grad_norm": 0.012555217370390892, "kl": 0.3606609106063843, "learning_rate": 9.999999722253117e-06, "loss": -0.0117, "num_tokens": 2211652.0, "reward": 1.648930311203003, "reward_std": 1.215954065322876, "rewards/rollout_reward_func/mean": 1.648930311203003, "rewards/rollout_reward_func/std": 1.3405512571334839, "sampling/importance_sampling_ratio/max": 0.5233410000801086, "sampling/importance_sampling_ratio/mean": 0.13886834681034088, "sampling/importance_sampling_ratio/min": 1.688752483300254e-16, "sampling/sampling_logp_difference/max": 3.9144070148468018, "sampling/sampling_logp_difference/mean": 1.1154017448425293, "step": 85, "step_time": 9.084167009999419 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 5.833049297332764, "epoch": 0.00086, "grad_norm": 0.012575902976095676, "kl": 0.3626660779118538, "learning_rate": 9.99999971080083e-06, "loss": -0.0118, "step": 86, "step_time": 5.319575449999775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 235.9375, "completions/mean_terminated_length": 243.03225708007812, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.763930678367615, "epoch": 0.00087, "frac_reward_zero_std": 0.0, "grad_norm": 0.014682337641716003, "kl": 0.39749570190906525, "learning_rate": 9.999999699117184e-06, "loss": -0.0146, "num_tokens": 2257453.0, "reward": 2.5003364086151123, "reward_std": 1.9458928108215332, "rewards/rollout_reward_func/mean": 2.5003364086151123, "rewards/rollout_reward_func/std": 2.2692220211029053, "sampling/importance_sampling_ratio/max": 0.5295130610466003, "sampling/importance_sampling_ratio/mean": 0.11728590726852417, "sampling/importance_sampling_ratio/min": 3.7461741065995813e-13, "sampling/sampling_logp_difference/max": 3.0392837524414062, "sampling/sampling_logp_difference/mean": 1.4054160118103027, "step": 87, "step_time": 8.034342769000887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.757734656333923, "epoch": 0.00088, "grad_norm": 0.013116477057337761, "kl": 0.38789064437150955, "learning_rate": 9.999999687202177e-06, "loss": -0.0146, "step": 88, "step_time": 4.444944719000432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 755.0, "completions/max_terminated_length": 741.0, "completions/mean_length": 447.34375, "completions/mean_terminated_length": 451.4667053222656, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.2215496301651, "epoch": 0.00089, "frac_reward_zero_std": 0.0, "grad_norm": 0.013803564012050629, "kl": 0.5341093055903912, "learning_rate": 9.999999675055814e-06, "loss": -0.0177, "num_tokens": 2312250.0, "reward": 1.7047960758209229, "reward_std": 1.1020748615264893, "rewards/rollout_reward_func/mean": 1.7047960758209229, "rewards/rollout_reward_func/std": 1.8067169189453125, "sampling/importance_sampling_ratio/max": 0.5259312391281128, "sampling/importance_sampling_ratio/mean": 0.17643724381923676, "sampling/importance_sampling_ratio/min": 2.5637248400600665e-12, "sampling/sampling_logp_difference/max": 5.182948589324951, "sampling/sampling_logp_difference/mean": 1.0466536283493042, "step": 89, "step_time": 8.874949301000925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.204747676849365, "epoch": 0.0009, "grad_norm": 0.01240418292582035, "kl": 0.5262163020670414, "learning_rate": 9.999999662678088e-06, "loss": -0.0177, "step": 90, "step_time": 5.186326979000114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 558.34375, "completions/mean_terminated_length": 558.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.52349066734314, "epoch": 0.00091, "frac_reward_zero_std": 0.0, "grad_norm": 0.010751481167972088, "kl": 0.408858347684145, "learning_rate": 9.999999650069006e-06, "loss": -0.0171, "num_tokens": 2371581.0, "reward": 1.3657488822937012, "reward_std": 1.0520522594451904, "rewards/rollout_reward_func/mean": 1.3657488822937012, "rewards/rollout_reward_func/std": 1.2561410665512085, "sampling/importance_sampling_ratio/max": 0.2838555574417114, "sampling/importance_sampling_ratio/mean": 0.12384120374917984, "sampling/importance_sampling_ratio/min": 4.310294829390493e-11, "sampling/sampling_logp_difference/max": 3.8953909873962402, "sampling/sampling_logp_difference/mean": 0.9854850769042969, "step": 91, "step_time": 9.148271317999388 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 5.50886458158493, "epoch": 0.00092, "grad_norm": 0.011888951063156128, "kl": 0.4073612429201603, "learning_rate": 9.999999637228563e-06, "loss": -0.0171, "step": 92, "step_time": 5.326846187999763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 254.34375, "completions/mean_terminated_length": 254.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.463173568248749, "epoch": 0.00093, "frac_reward_zero_std": 0.25, "grad_norm": 0.011303669773042202, "kl": 0.4870901219546795, "learning_rate": 9.99999962415676e-06, "loss": -0.0089, "num_tokens": 2416625.0, "reward": 2.9061293601989746, "reward_std": 1.1144578456878662, "rewards/rollout_reward_func/mean": 2.9061293601989746, "rewards/rollout_reward_func/std": 1.872575283050537, "sampling/importance_sampling_ratio/max": 0.538914144039154, "sampling/importance_sampling_ratio/mean": 0.23089367151260376, "sampling/importance_sampling_ratio/min": 0.001189270755276084, "sampling/sampling_logp_difference/max": 2.552368640899658, "sampling/sampling_logp_difference/mean": 0.8883153200149536, "step": 93, "step_time": 8.310906286000318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.453524351119995, "epoch": 0.00094, "grad_norm": 0.011560036800801754, "kl": 0.49043361097574234, "learning_rate": 9.999999610853598e-06, "loss": -0.009, "step": 94, "step_time": 4.578007039998738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 323.0, "completions/mean_terminated_length": 323.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.127210021018982, "epoch": 0.00095, "frac_reward_zero_std": 0.0, "grad_norm": 0.016390182077884674, "kl": 0.40985831432044506, "learning_rate": 9.999999597319077e-06, "loss": -0.0144, "num_tokens": 2467574.0, "reward": 2.420485019683838, "reward_std": 1.8477942943572998, "rewards/rollout_reward_func/mean": 2.420485019683838, "rewards/rollout_reward_func/std": 2.099099636077881, "sampling/importance_sampling_ratio/max": 0.5398046970367432, "sampling/importance_sampling_ratio/mean": 0.14346127212047577, "sampling/importance_sampling_ratio/min": 7.899796268528991e-12, "sampling/sampling_logp_difference/max": 3.005741596221924, "sampling/sampling_logp_difference/mean": 1.1369695663452148, "step": 95, "step_time": 9.084458965998692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.112871825695038, "epoch": 0.00096, "grad_norm": 0.015214670449495316, "kl": 0.4128855764865875, "learning_rate": 9.999999583553198e-06, "loss": -0.0144, "step": 96, "step_time": 4.6464639670002725 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.0625, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 404.1875, "completions/mean_terminated_length": 413.0000305175781, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.4772584438323975, "epoch": 0.00097, "frac_reward_zero_std": 0.0, "grad_norm": 0.013531319797039032, "kl": 0.33689754270017147, "learning_rate": 9.999999569555958e-06, "loss": -0.0195, "num_tokens": 2519297.0, "reward": 1.2118922472000122, "reward_std": 0.9071276187896729, "rewards/rollout_reward_func/mean": 1.2118922472000122, "rewards/rollout_reward_func/std": 1.6393996477127075, "sampling/importance_sampling_ratio/max": 0.5455878376960754, "sampling/importance_sampling_ratio/mean": 0.12733827531337738, "sampling/importance_sampling_ratio/min": 3.602854342990569e-12, "sampling/sampling_logp_difference/max": 4.47752046585083, "sampling/sampling_logp_difference/mean": 1.2203165292739868, "step": 97, "step_time": 9.439923181000268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.469627916812897, "epoch": 0.00098, "grad_norm": 0.011723111383616924, "kl": 0.3316629286855459, "learning_rate": 9.99999955532736e-06, "loss": -0.0196, "step": 98, "step_time": 4.828754623999885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 848.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 536.90625, "completions/mean_terminated_length": 536.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.867543816566467, "epoch": 0.00099, "frac_reward_zero_std": 0.0, "grad_norm": 0.00992999505251646, "kl": 0.35577549040317535, "learning_rate": 9.999999540867401e-06, "loss": -0.0137, "num_tokens": 2578290.0, "reward": 2.7717137336730957, "reward_std": 1.5454163551330566, "rewards/rollout_reward_func/mean": 2.7717137336730957, "rewards/rollout_reward_func/std": 2.1133692264556885, "sampling/importance_sampling_ratio/max": 0.2941095232963562, "sampling/importance_sampling_ratio/mean": 0.1273624747991562, "sampling/importance_sampling_ratio/min": 3.663484793303695e-10, "sampling/sampling_logp_difference/max": 2.9304747581481934, "sampling/sampling_logp_difference/mean": 1.0121349096298218, "step": 99, "step_time": 9.026237381999636 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 5.866902112960815, "epoch": 0.001, "grad_norm": 0.008921584114432335, "kl": 0.3485883306711912, "learning_rate": 9.999999526176084e-06, "loss": -0.0137, "step": 100, "step_time": 4.884540873000333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 318.6875, "completions/mean_terminated_length": 318.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.43700635433197, "epoch": 0.00101, "frac_reward_zero_std": 0.25, "grad_norm": 0.006189876701682806, "kl": 0.5527824461460114, "learning_rate": 9.999999511253408e-06, "loss": -0.0067, "num_tokens": 2626871.0, "reward": 2.009554862976074, "reward_std": 0.6518650054931641, "rewards/rollout_reward_func/mean": 2.009554862976074, "rewards/rollout_reward_func/std": 1.5595077276229858, "sampling/importance_sampling_ratio/max": 0.5488349199295044, "sampling/importance_sampling_ratio/mean": 0.19652444124221802, "sampling/importance_sampling_ratio/min": 0.0011575144017115235, "sampling/sampling_logp_difference/max": 2.923558473587036, "sampling/sampling_logp_difference/mean": 0.9920997619628906, "step": 101, "step_time": 9.15831846600031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 5.427984565496445, "epoch": 0.00102, "grad_norm": 0.006101899314671755, "kl": 0.556145828217268, "learning_rate": 9.99999949609937e-06, "loss": -0.0067, "step": 102, "step_time": 4.759649325001192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 266.03125, "completions/mean_terminated_length": 257.8709716796875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.900549799203873, "epoch": 0.00103, "frac_reward_zero_std": 0.25, "grad_norm": 0.007550720125436783, "kl": 0.46073490381240845, "learning_rate": 9.999999480713976e-06, "loss": -0.0124, "num_tokens": 2674005.0, "reward": 3.7651147842407227, "reward_std": 1.2847379446029663, "rewards/rollout_reward_func/mean": 3.7651147842407227, "rewards/rollout_reward_func/std": 1.5383661985397339, "sampling/importance_sampling_ratio/max": 0.5506238341331482, "sampling/importance_sampling_ratio/mean": 0.26077592372894287, "sampling/importance_sampling_ratio/min": 9.013636733702646e-14, "sampling/sampling_logp_difference/max": 3.8743910789489746, "sampling/sampling_logp_difference/mean": 0.9717509150505066, "step": 103, "step_time": 8.535875374000625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.909915745258331, "epoch": 0.00104, "grad_norm": 0.007524185813963413, "kl": 0.45878610014915466, "learning_rate": 9.99999946509722e-06, "loss": -0.0124, "step": 104, "step_time": 4.519358364999334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 263.75, "completions/mean_terminated_length": 271.7419128417969, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.765733242034912, "epoch": 0.00105, "frac_reward_zero_std": 0.25, "grad_norm": 0.006037894636392593, "kl": 0.5730921756476164, "learning_rate": 9.999999449249107e-06, "loss": -0.0164, "num_tokens": 2719481.0, "reward": 3.5972437858581543, "reward_std": 1.0879021883010864, "rewards/rollout_reward_func/mean": 3.5972437858581543, "rewards/rollout_reward_func/std": 1.4054116010665894, "sampling/importance_sampling_ratio/max": 0.5536695122718811, "sampling/importance_sampling_ratio/mean": 0.25917497277259827, "sampling/importance_sampling_ratio/min": 7.426475804095389e-06, "sampling/sampling_logp_difference/max": 3.8071212768554688, "sampling/sampling_logp_difference/mean": 0.7888709902763367, "step": 105, "step_time": 7.902388452000196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.764300674200058, "epoch": 0.00106, "grad_norm": 0.006110228598117828, "kl": 0.5746921207755804, "learning_rate": 9.999999433169634e-06, "loss": -0.0165, "step": 106, "step_time": 4.451162388999819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 504.03125, "completions/mean_terminated_length": 504.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.259044528007507, "epoch": 0.00107, "frac_reward_zero_std": 0.0, "grad_norm": 0.09284374862909317, "kl": 0.5678669139742851, "learning_rate": 9.999999416858801e-06, "loss": -0.023, "num_tokens": 2778110.0, "reward": 3.0518195629119873, "reward_std": 2.0326967239379883, "rewards/rollout_reward_func/mean": 3.0518195629119873, "rewards/rollout_reward_func/std": 2.1500182151794434, "sampling/importance_sampling_ratio/max": 0.3023638129234314, "sampling/importance_sampling_ratio/mean": 0.15813782811164856, "sampling/importance_sampling_ratio/min": 0.0001912050211103633, "sampling/sampling_logp_difference/max": 2.7088003158569336, "sampling/sampling_logp_difference/mean": 0.8632931709289551, "step": 107, "step_time": 9.265566470999602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.301291227340698, "epoch": 0.00108, "grad_norm": 0.0192717295140028, "kl": 0.40367304906249046, "learning_rate": 9.999999400316609e-06, "loss": -0.0233, "step": 108, "step_time": 4.872972249000213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 453.8125, "completions/mean_terminated_length": 454.06451416015625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.344372421503067, "epoch": 0.00109, "frac_reward_zero_std": 0.5, "grad_norm": 0.005915234796702862, "kl": 0.5280559062957764, "learning_rate": 9.999999383543059e-06, "loss": -0.0117, "num_tokens": 2832092.0, "reward": 2.0643255710601807, "reward_std": 0.36830055713653564, "rewards/rollout_reward_func/mean": 2.0643255710601807, "rewards/rollout_reward_func/std": 1.2795445919036865, "sampling/importance_sampling_ratio/max": 0.5460331439971924, "sampling/importance_sampling_ratio/mean": 0.2571074366569519, "sampling/importance_sampling_ratio/min": 3.3921960648563643e-15, "sampling/sampling_logp_difference/max": 4.154726505279541, "sampling/sampling_logp_difference/mean": 0.7197229862213135, "step": 109, "step_time": 9.136753535999105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.370859324932098, "epoch": 0.0011, "grad_norm": 0.006120497360825539, "kl": 0.5247279852628708, "learning_rate": 9.999999366538148e-06, "loss": -0.0117, "step": 110, "step_time": 4.750533235999228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 468.09375, "completions/mean_terminated_length": 468.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.781158804893494, "epoch": 0.00111, "frac_reward_zero_std": 0.0, "grad_norm": 0.009389814920723438, "kl": 0.40754758939146996, "learning_rate": 9.999999349301878e-06, "loss": -0.0128, "num_tokens": 2887456.0, "reward": 2.372213840484619, "reward_std": 1.1804518699645996, "rewards/rollout_reward_func/mean": 2.372213840484619, "rewards/rollout_reward_func/std": 1.6503901481628418, "sampling/importance_sampling_ratio/max": 0.5524845123291016, "sampling/importance_sampling_ratio/mean": 0.1906067430973053, "sampling/importance_sampling_ratio/min": 4.788781764827768e-16, "sampling/sampling_logp_difference/max": 3.3403866291046143, "sampling/sampling_logp_difference/mean": 1.0884251594543457, "step": 111, "step_time": 8.606538135000847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.815384209156036, "epoch": 0.00112, "grad_norm": 0.01099073514342308, "kl": 0.40177351236343384, "learning_rate": 9.999999331834249e-06, "loss": -0.0128, "step": 112, "step_time": 5.018896831999882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 444.90625, "completions/mean_terminated_length": 444.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.825027763843536, "epoch": 0.00113, "frac_reward_zero_std": 0.0, "grad_norm": 0.011304070241749287, "kl": 0.35968483053147793, "learning_rate": 9.99999931413526e-06, "loss": -0.0059, "num_tokens": 2943025.0, "reward": 2.8332765102386475, "reward_std": 1.7419302463531494, "rewards/rollout_reward_func/mean": 2.8332765102386475, "rewards/rollout_reward_func/std": 1.946202278137207, "sampling/importance_sampling_ratio/max": 0.30486950278282166, "sampling/importance_sampling_ratio/mean": 0.14108777046203613, "sampling/importance_sampling_ratio/min": 0.00019812805112451315, "sampling/sampling_logp_difference/max": 2.6881039142608643, "sampling/sampling_logp_difference/mean": 1.0696028470993042, "step": 113, "step_time": 8.58645070500097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.8290793895721436, "epoch": 0.00114, "grad_norm": 0.011425897479057312, "kl": 0.3614608943462372, "learning_rate": 9.999999296204912e-06, "loss": -0.0059, "step": 114, "step_time": 5.139510378000523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 235.75, "completions/mean_terminated_length": 235.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.922475636005402, "epoch": 0.00115, "frac_reward_zero_std": 0.5, "grad_norm": 0.023028887808322906, "kl": 0.4135777149349451, "learning_rate": 9.999999278043205e-06, "loss": -0.0119, "num_tokens": 2988835.0, "reward": 2.040597438812256, "reward_std": 0.6493960618972778, "rewards/rollout_reward_func/mean": 2.040597438812256, "rewards/rollout_reward_func/std": 1.5975624322891235, "sampling/importance_sampling_ratio/max": 0.5552006363868713, "sampling/importance_sampling_ratio/mean": 0.2123653143644333, "sampling/importance_sampling_ratio/min": 1.9730843694998335e-11, "sampling/sampling_logp_difference/max": 3.120391845703125, "sampling/sampling_logp_difference/mean": 1.0702314376831055, "step": 115, "step_time": 8.227128244000596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.934574216604233, "epoch": 0.00116, "grad_norm": 0.024487733840942383, "kl": 0.4113778416067362, "learning_rate": 9.99999925965014e-06, "loss": -0.0118, "step": 116, "step_time": 4.603303872000197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.03125, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 446.0, "completions/mean_terminated_length": 459.8709411621094, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.283336162567139, "epoch": 0.00117, "frac_reward_zero_std": 0.0, "grad_norm": 0.06837444752454758, "kl": 0.7454761080443859, "learning_rate": 9.999999241025713e-06, "loss": -0.0137, "num_tokens": 3044815.0, "reward": 1.6962122917175293, "reward_std": 0.9731729030609131, "rewards/rollout_reward_func/mean": 1.6962122917175293, "rewards/rollout_reward_func/std": 1.472454309463501, "sampling/importance_sampling_ratio/max": 0.5520793199539185, "sampling/importance_sampling_ratio/mean": 0.1955501139163971, "sampling/importance_sampling_ratio/min": 8.385171321124898e-15, "sampling/sampling_logp_difference/max": 3.5139248371124268, "sampling/sampling_logp_difference/mean": 1.0980048179626465, "step": 117, "step_time": 8.851413534999665 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 5.293146431446075, "epoch": 0.00118, "grad_norm": 0.025546826422214508, "kl": 0.6223187446594238, "learning_rate": 9.99999922216993e-06, "loss": -0.014, "step": 118, "step_time": 5.2780849090008815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 477.75, "completions/mean_terminated_length": 498.20001220703125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.602368354797363, "epoch": 0.00119, "frac_reward_zero_std": 0.0, "grad_norm": 0.01806759461760521, "kl": 0.340764039196074, "learning_rate": 9.999999203082784e-06, "loss": -0.0204, "num_tokens": 3099071.0, "reward": 1.5769758224487305, "reward_std": 1.3088091611862183, "rewards/rollout_reward_func/mean": 1.5769758224487305, "rewards/rollout_reward_func/std": 1.6474065780639648, "sampling/importance_sampling_ratio/max": 0.5519512295722961, "sampling/importance_sampling_ratio/mean": 0.15655829012393951, "sampling/importance_sampling_ratio/min": 3.2326309328439702e-18, "sampling/sampling_logp_difference/max": 3.693403959274292, "sampling/sampling_logp_difference/mean": 1.4753286838531494, "step": 119, "step_time": 9.247204750000492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.583022654056549, "epoch": 0.0012, "grad_norm": 0.016562430188059807, "kl": 0.33369210083037615, "learning_rate": 9.999999183764282e-06, "loss": -0.0205, "step": 120, "step_time": 5.400903367999945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 473.59375, "completions/mean_terminated_length": 473.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.3363790810108185, "epoch": 0.00121, "frac_reward_zero_std": 0.0, "grad_norm": 0.005732807330787182, "kl": 0.3513603312894702, "learning_rate": 9.999999164214418e-06, "loss": -0.015, "num_tokens": 3154721.0, "reward": 2.565702199935913, "reward_std": 0.9166498184204102, "rewards/rollout_reward_func/mean": 2.565702199935913, "rewards/rollout_reward_func/std": 1.641287922859192, "sampling/importance_sampling_ratio/max": 0.5513830184936523, "sampling/importance_sampling_ratio/mean": 0.21081653237342834, "sampling/importance_sampling_ratio/min": 9.868382777611373e-10, "sampling/sampling_logp_difference/max": 4.323619365692139, "sampling/sampling_logp_difference/mean": 0.9557435512542725, "step": 121, "step_time": 8.598964995000188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.306765913963318, "epoch": 0.00122, "grad_norm": 0.005753090605139732, "kl": 0.3516113171353936, "learning_rate": 9.999999144433197e-06, "loss": -0.015, "step": 122, "step_time": 4.775643616999787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 574.21875, "completions/mean_terminated_length": 574.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.031126022338867, "epoch": 0.00123, "frac_reward_zero_std": 0.0, "grad_norm": 0.014242246747016907, "kl": 0.27723702508956194, "learning_rate": 9.999999124420615e-06, "loss": -0.0165, "num_tokens": 3214228.0, "reward": 2.553722381591797, "reward_std": 1.49879789352417, "rewards/rollout_reward_func/mean": 2.553722381591797, "rewards/rollout_reward_func/std": 1.8885005712509155, "sampling/importance_sampling_ratio/max": 0.30776602029800415, "sampling/importance_sampling_ratio/mean": 0.13252753019332886, "sampling/importance_sampling_ratio/min": 1.6167473718305125e-13, "sampling/sampling_logp_difference/max": 3.7081410884857178, "sampling/sampling_logp_difference/mean": 1.0756418704986572, "step": 123, "step_time": 8.99680862499963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 6.018238723278046, "epoch": 0.00124, "grad_norm": 0.008585439994931221, "kl": 0.27489931136369705, "learning_rate": 9.999999104176675e-06, "loss": -0.0165, "step": 124, "step_time": 5.400279564999892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 428.40625, "completions/mean_terminated_length": 441.70965576171875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.530455976724625, "epoch": 0.00125, "frac_reward_zero_std": 0.0, "grad_norm": 0.01981445588171482, "kl": 0.5246058944612741, "learning_rate": 9.999999083701375e-06, "loss": -0.0115, "num_tokens": 3268782.0, "reward": 2.442720651626587, "reward_std": 0.9873529672622681, "rewards/rollout_reward_func/mean": 2.442720651626587, "rewards/rollout_reward_func/std": 1.5368415117263794, "sampling/importance_sampling_ratio/max": 0.5591750144958496, "sampling/importance_sampling_ratio/mean": 0.1981470286846161, "sampling/importance_sampling_ratio/min": 6.880031043303685e-15, "sampling/sampling_logp_difference/max": 2.8299753665924072, "sampling/sampling_logp_difference/mean": 1.1136442422866821, "step": 125, "step_time": 8.792223250000006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 5.526207149028778, "epoch": 0.00126, "grad_norm": 0.07942020893096924, "kl": 0.5783120766282082, "learning_rate": 9.999999062994716e-06, "loss": -0.0115, "step": 126, "step_time": 5.212345460999586 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 355.09375, "completions/mean_terminated_length": 355.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.586715936660767, "epoch": 0.00127, "frac_reward_zero_std": 0.0, "grad_norm": 0.01894986629486084, "kl": 0.36187045089900494, "learning_rate": 9.999999042056698e-06, "loss": -0.0093, "num_tokens": 3320961.0, "reward": 3.5986928939819336, "reward_std": 1.4135947227478027, "rewards/rollout_reward_func/mean": 3.5986928939819336, "rewards/rollout_reward_func/std": 1.7381529808044434, "sampling/importance_sampling_ratio/max": 0.5567580461502075, "sampling/importance_sampling_ratio/mean": 0.18505819141864777, "sampling/importance_sampling_ratio/min": 3.734913612349951e-18, "sampling/sampling_logp_difference/max": 3.9289333820343018, "sampling/sampling_logp_difference/mean": 1.1177394390106201, "step": 127, "step_time": 8.4797745940009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003289473708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003289473708719015, "entropy": 5.59991979598999, "epoch": 0.00128, "grad_norm": 0.015152394771575928, "kl": 0.36163298040628433, "learning_rate": 9.99999902088732e-06, "loss": -0.0093, "step": 128, "step_time": 4.677577405001102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 428.5, "completions/mean_terminated_length": 428.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.128687739372253, "epoch": 0.00129, "frac_reward_zero_std": 0.25, "grad_norm": 0.005386632867157459, "kl": 0.29446713998913765, "learning_rate": 9.999998999486583e-06, "loss": -0.0113, "num_tokens": 3373920.0, "reward": 3.073700428009033, "reward_std": 1.0783164501190186, "rewards/rollout_reward_func/mean": 3.073700428009033, "rewards/rollout_reward_func/std": 1.8588871955871582, "sampling/importance_sampling_ratio/max": 0.5572874546051025, "sampling/importance_sampling_ratio/mean": 0.1692725121974945, "sampling/importance_sampling_ratio/min": 0.00015941473247949034, "sampling/sampling_logp_difference/max": 2.785651922225952, "sampling/sampling_logp_difference/mean": 1.0858612060546875, "step": 129, "step_time": 8.688911320999068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.14833676815033, "epoch": 0.0013, "grad_norm": 0.005992444232106209, "kl": 0.2919249450787902, "learning_rate": 9.999998977854486e-06, "loss": -0.0113, "step": 130, "step_time": 5.196040378998532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 425.46875, "completions/mean_terminated_length": 425.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.23365318775177, "epoch": 0.00131, "frac_reward_zero_std": 0.0, "grad_norm": 0.019059518352150917, "kl": 0.34147817455232143, "learning_rate": 9.99999895599103e-06, "loss": -0.0035, "num_tokens": 3427401.0, "reward": 2.473796844482422, "reward_std": 1.8125438690185547, "rewards/rollout_reward_func/mean": 2.473796844482422, "rewards/rollout_reward_func/std": 1.8101236820220947, "sampling/importance_sampling_ratio/max": 0.5523518323898315, "sampling/importance_sampling_ratio/mean": 0.14591509103775024, "sampling/importance_sampling_ratio/min": 0.00020133046200498939, "sampling/sampling_logp_difference/max": 3.7536349296569824, "sampling/sampling_logp_difference/mean": 1.2340688705444336, "step": 131, "step_time": 8.596268926000448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.220755457878113, "epoch": 0.00132, "grad_norm": 0.018474310636520386, "kl": 0.3436393868178129, "learning_rate": 9.999998933896215e-06, "loss": -0.0035, "step": 132, "step_time": 5.2319554900000185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 448.71875, "completions/mean_terminated_length": 448.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.855165183544159, "epoch": 0.00133, "frac_reward_zero_std": 0.0, "grad_norm": 0.021189430728554726, "kl": 0.4099526349455118, "learning_rate": 9.999998911570041e-06, "loss": -0.0112, "num_tokens": 3482090.0, "reward": 1.9857467412948608, "reward_std": 0.9780662655830383, "rewards/rollout_reward_func/mean": 1.9857467412948608, "rewards/rollout_reward_func/std": 1.4453747272491455, "sampling/importance_sampling_ratio/max": 0.5519795417785645, "sampling/importance_sampling_ratio/mean": 0.14946779608726501, "sampling/importance_sampling_ratio/min": 4.809954772476128e-20, "sampling/sampling_logp_difference/max": 13.282112121582031, "sampling/sampling_logp_difference/mean": 1.1622436046600342, "step": 133, "step_time": 8.62497339599986 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 5.8229920864105225, "epoch": 0.00134, "grad_norm": 0.02182932198047638, "kl": 0.4096921207383275, "learning_rate": 9.999998889012509e-06, "loss": -0.0112, "step": 134, "step_time": 4.7461931110005935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 405.90625, "completions/mean_terminated_length": 405.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.744675666093826, "epoch": 0.00135, "frac_reward_zero_std": 0.5, "grad_norm": 0.020258011296391487, "kl": 0.5242082104086876, "learning_rate": 9.999998866223617e-06, "loss": 0.0004, "num_tokens": 3535997.0, "reward": 2.09535551071167, "reward_std": 0.801947832107544, "rewards/rollout_reward_func/mean": 2.09535551071167, "rewards/rollout_reward_func/std": 1.5733643770217896, "sampling/importance_sampling_ratio/max": 0.5550893545150757, "sampling/importance_sampling_ratio/mean": 0.21865856647491455, "sampling/importance_sampling_ratio/min": 0.0004308926872909069, "sampling/sampling_logp_difference/max": 2.2621819972991943, "sampling/sampling_logp_difference/mean": 0.6881621479988098, "step": 135, "step_time": 8.649714739999581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 4.7144655585289, "epoch": 0.00136, "grad_norm": 0.017525319010019302, "kl": 0.5250850170850754, "learning_rate": 9.999998843203364e-06, "loss": 0.0004, "step": 136, "step_time": 5.175107476999074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 373.84375, "completions/mean_terminated_length": 373.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.857703149318695, "epoch": 0.00137, "frac_reward_zero_std": 0.25, "grad_norm": 0.00757873198017478, "kl": 0.39663390070199966, "learning_rate": 9.999998819951753e-06, "loss": -0.0103, "num_tokens": 3587640.0, "reward": 3.1773321628570557, "reward_std": 1.37308669090271, "rewards/rollout_reward_func/mean": 3.1773321628570557, "rewards/rollout_reward_func/std": 1.8172297477722168, "sampling/importance_sampling_ratio/max": 0.5551076531410217, "sampling/importance_sampling_ratio/mean": 0.21167081594467163, "sampling/importance_sampling_ratio/min": 0.00045508454786613584, "sampling/sampling_logp_difference/max": 3.646714210510254, "sampling/sampling_logp_difference/mean": 0.8012743592262268, "step": 137, "step_time": 8.443151505000515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.867439925670624, "epoch": 0.00138, "grad_norm": 0.007302064914256334, "kl": 0.39654075540602207, "learning_rate": 9.999998796468782e-06, "loss": -0.0102, "step": 138, "step_time": 5.160483013000885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 496.53125, "completions/mean_terminated_length": 512.0322265625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.3140488266944885, "epoch": 0.00139, "frac_reward_zero_std": 0.0, "grad_norm": 0.047983165830373764, "kl": 0.43244970217347145, "learning_rate": 9.999998772754452e-06, "loss": -0.0136, "num_tokens": 3642802.0, "reward": 2.342683792114258, "reward_std": 0.9078880548477173, "rewards/rollout_reward_func/mean": 2.342683792114258, "rewards/rollout_reward_func/std": 1.424059510231018, "sampling/importance_sampling_ratio/max": 0.5550379753112793, "sampling/importance_sampling_ratio/mean": 0.21025414764881134, "sampling/importance_sampling_ratio/min": 4.1330454277109865e-14, "sampling/sampling_logp_difference/max": 4.175216197967529, "sampling/sampling_logp_difference/mean": 0.9738059639930725, "step": 139, "step_time": 8.921468903000005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.2954131960868835, "epoch": 0.0014, "grad_norm": 0.03462895750999451, "kl": 0.43446778878569603, "learning_rate": 9.999998748808764e-06, "loss": -0.0138, "step": 140, "step_time": 4.874726669999291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 364.84375, "completions/mean_terminated_length": 369.3333435058594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.6628918051719666, "epoch": 0.00141, "frac_reward_zero_std": 0.0, "grad_norm": 0.048540182411670685, "kl": 0.37702805921435356, "learning_rate": 9.999998724631715e-06, "loss": -0.0215, "num_tokens": 3694811.0, "reward": 2.8997886180877686, "reward_std": 1.3242307901382446, "rewards/rollout_reward_func/mean": 2.8997886180877686, "rewards/rollout_reward_func/std": 1.9665751457214355, "sampling/importance_sampling_ratio/max": 0.30592989921569824, "sampling/importance_sampling_ratio/mean": 0.13573986291885376, "sampling/importance_sampling_ratio/min": 5.063214828144886e-15, "sampling/sampling_logp_difference/max": 4.425807476043701, "sampling/sampling_logp_difference/mean": 1.0662561655044556, "step": 141, "step_time": 8.895234876999439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.723241686820984, "epoch": 0.00142, "grad_norm": 0.044111333787441254, "kl": 0.3669211324304342, "learning_rate": 9.999998700223308e-06, "loss": -0.0217, "step": 142, "step_time": 4.655782125000314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 388.5625, "completions/mean_terminated_length": 388.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.8627594113349915, "epoch": 0.00143, "frac_reward_zero_std": 0.0, "grad_norm": 0.01214392390102148, "kl": 0.37197905220091343, "learning_rate": 9.999998675583542e-06, "loss": -0.0186, "num_tokens": 3748064.0, "reward": 2.9884138107299805, "reward_std": 1.394155740737915, "rewards/rollout_reward_func/mean": 2.9884138107299805, "rewards/rollout_reward_func/std": 1.8353135585784912, "sampling/importance_sampling_ratio/max": 0.5596773028373718, "sampling/importance_sampling_ratio/mean": 0.16319838166236877, "sampling/importance_sampling_ratio/min": 3.1172959769065756e-09, "sampling/sampling_logp_difference/max": 4.036569595336914, "sampling/sampling_logp_difference/mean": 1.0934593677520752, "step": 143, "step_time": 9.0844559330003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.029166667722165585, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.029166667722165585, "entropy": 5.961900234222412, "epoch": 0.00144, "grad_norm": 0.0157622080296278, "kl": 0.3777771629393101, "learning_rate": 9.999998650712415e-06, "loss": -0.0185, "step": 144, "step_time": 5.404956216000301 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.0625, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 385.5625, "completions/mean_terminated_length": 389.3000183105469, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.166502773761749, "epoch": 0.00145, "frac_reward_zero_std": 0.0, "grad_norm": 0.027631990611553192, "kl": 0.5853241086006165, "learning_rate": 9.999998625609931e-06, "loss": -0.0256, "num_tokens": 3801052.0, "reward": 2.257948398590088, "reward_std": 1.2924938201904297, "rewards/rollout_reward_func/mean": 2.257948398590088, "rewards/rollout_reward_func/std": 1.759012222290039, "sampling/importance_sampling_ratio/max": 0.5587986707687378, "sampling/importance_sampling_ratio/mean": 0.2040739357471466, "sampling/importance_sampling_ratio/min": 1.3630546344400862e-12, "sampling/sampling_logp_difference/max": 3.9767651557922363, "sampling/sampling_logp_difference/mean": 0.9805353283882141, "step": 145, "step_time": 9.069786099000794 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 5.183136820793152, "epoch": 0.00146, "grad_norm": 0.026243364438414574, "kl": 0.5845871120691299, "learning_rate": 9.999998600276087e-06, "loss": -0.0256, "step": 146, "step_time": 4.969717237999248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 382.4375, "completions/mean_terminated_length": 382.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.606759130954742, "epoch": 0.00147, "frac_reward_zero_std": 0.0, "grad_norm": 0.06667429953813553, "kl": 0.5389902517199516, "learning_rate": 9.999998574710883e-06, "loss": -0.0173, "num_tokens": 3853406.0, "reward": 1.033826231956482, "reward_std": 0.8608077764511108, "rewards/rollout_reward_func/mean": 1.033826231956482, "rewards/rollout_reward_func/std": 1.2542723417282104, "sampling/importance_sampling_ratio/max": 0.3122768998146057, "sampling/importance_sampling_ratio/mean": 0.10800905525684357, "sampling/importance_sampling_ratio/min": 1.0712759864892608e-17, "sampling/sampling_logp_difference/max": 4.165693283081055, "sampling/sampling_logp_difference/mean": 1.3520691394805908, "step": 147, "step_time": 8.949542632000885 }, { "clip_ratio/high_max": 0.08159722294658422, "clip_ratio/high_mean": 0.04079861147329211, "clip_ratio/low_mean": 0.02434501238167286, "clip_ratio/low_min": 0.013888888992369175, "clip_ratio/region_mean": 0.06514362338930368, "entropy": 6.569398641586304, "epoch": 0.00148, "grad_norm": 0.014427587389945984, "kl": 0.3714602068066597, "learning_rate": 9.999998548914318e-06, "loss": -0.0175, "step": 148, "step_time": 4.723479898000278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 414.875, "completions/mean_terminated_length": 414.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.415063202381134, "epoch": 0.00149, "frac_reward_zero_std": 0.0, "grad_norm": 0.006415513344109058, "kl": 0.3409431576728821, "learning_rate": 9.999998522886397e-06, "loss": -0.0151, "num_tokens": 3907390.0, "reward": 2.522676467895508, "reward_std": 1.2102174758911133, "rewards/rollout_reward_func/mean": 2.522676467895508, "rewards/rollout_reward_func/std": 1.6199886798858643, "sampling/importance_sampling_ratio/max": 0.5543047189712524, "sampling/importance_sampling_ratio/mean": 0.20426242053508759, "sampling/importance_sampling_ratio/min": 0.0003250113222748041, "sampling/sampling_logp_difference/max": 2.426882266998291, "sampling/sampling_logp_difference/mean": 0.9172008037567139, "step": 149, "step_time": 9.147840422000172 }, { "clip_ratio/high_max": 0.1197916679084301, "clip_ratio/high_mean": 0.05989583395421505, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05989583395421505, "entropy": 5.369048357009888, "epoch": 0.0015, "grad_norm": 0.009035247378051281, "kl": 0.3433597218245268, "learning_rate": 9.999998496627115e-06, "loss": -0.0151, "step": 150, "step_time": 4.844408977000057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 462.09375, "completions/mean_terminated_length": 462.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.271999478340149, "epoch": 0.00151, "frac_reward_zero_std": 0.0, "grad_norm": 0.009318442083895206, "kl": 0.2838898357003927, "learning_rate": 9.999998470136475e-06, "loss": -0.0107, "num_tokens": 3963989.0, "reward": 2.4085025787353516, "reward_std": 1.5739270448684692, "rewards/rollout_reward_func/mean": 2.4085025787353516, "rewards/rollout_reward_func/std": 2.0519044399261475, "sampling/importance_sampling_ratio/max": 0.3041028082370758, "sampling/importance_sampling_ratio/mean": 0.10673074424266815, "sampling/importance_sampling_ratio/min": 3.432775631405483e-20, "sampling/sampling_logp_difference/max": 3.6299948692321777, "sampling/sampling_logp_difference/mean": 1.236303448677063, "step": 151, "step_time": 8.581440807000035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.249309480190277, "epoch": 0.00152, "grad_norm": 0.009570055641233921, "kl": 0.2857303377240896, "learning_rate": 9.999998443414474e-06, "loss": -0.0106, "step": 152, "step_time": 4.792721901000277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 274.21875, "completions/mean_terminated_length": 274.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.317317992448807, "epoch": 0.00153, "frac_reward_zero_std": 0.25, "grad_norm": 0.011901522986590862, "kl": 0.47474728897213936, "learning_rate": 9.999998416461115e-06, "loss": -0.0159, "num_tokens": 4010768.0, "reward": 3.833049774169922, "reward_std": 1.0186306238174438, "rewards/rollout_reward_func/mean": 3.833049774169922, "rewards/rollout_reward_func/std": 1.4386236667633057, "sampling/importance_sampling_ratio/max": 0.5543802380561829, "sampling/importance_sampling_ratio/mean": 0.2531720995903015, "sampling/importance_sampling_ratio/min": 1.1614738034196326e-13, "sampling/sampling_logp_difference/max": 4.201288223266602, "sampling/sampling_logp_difference/mean": 1.0024373531341553, "step": 153, "step_time": 8.65931397399936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.322837769985199, "epoch": 0.00154, "grad_norm": 0.011569921858608723, "kl": 0.4757602885365486, "learning_rate": 9.999998389276397e-06, "loss": -0.016, "step": 154, "step_time": 4.628009893999661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 608.03125, "completions/mean_terminated_length": 608.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.3297582268714905, "epoch": 0.00155, "frac_reward_zero_std": 0.0, "grad_norm": 0.012206662446260452, "kl": 0.3360460437834263, "learning_rate": 9.999998361860319e-06, "loss": -0.0247, "num_tokens": 4070767.0, "reward": 1.6205885410308838, "reward_std": 0.9136905670166016, "rewards/rollout_reward_func/mean": 1.6205885410308838, "rewards/rollout_reward_func/std": 1.4889774322509766, "sampling/importance_sampling_ratio/max": 0.307391494512558, "sampling/importance_sampling_ratio/mean": 0.12577039003372192, "sampling/importance_sampling_ratio/min": 3.3397945867208456e-12, "sampling/sampling_logp_difference/max": 3.970939874649048, "sampling/sampling_logp_difference/mean": 1.1507443189620972, "step": 155, "step_time": 9.421827424999265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.354883968830109, "epoch": 0.00156, "grad_norm": 0.011644980870187283, "kl": 0.3347685132175684, "learning_rate": 9.99999833421288e-06, "loss": -0.0247, "step": 156, "step_time": 4.989567845999318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.03125, "completions/max_length": 848.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 306.9375, "completions/mean_terminated_length": 316.32257080078125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.295707732439041, "epoch": 0.00157, "frac_reward_zero_std": 0.5, "grad_norm": 0.006841656286269426, "kl": 0.45711028575897217, "learning_rate": 9.999998306334084e-06, "loss": 0.0064, "num_tokens": 4119100.0, "reward": 3.2127416133880615, "reward_std": 0.8213455677032471, "rewards/rollout_reward_func/mean": 3.2127416133880615, "rewards/rollout_reward_func/std": 1.3526753187179565, "sampling/importance_sampling_ratio/max": 0.5526928901672363, "sampling/importance_sampling_ratio/mean": 0.20098578929901123, "sampling/importance_sampling_ratio/min": 6.629302141958338e-13, "sampling/sampling_logp_difference/max": 3.455430030822754, "sampling/sampling_logp_difference/mean": 1.1175944805145264, "step": 157, "step_time": 8.230952549999529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.352945655584335, "epoch": 0.00158, "grad_norm": 0.0065981000661849976, "kl": 0.4474712498486042, "learning_rate": 9.99999827822393e-06, "loss": 0.0064, "step": 158, "step_time": 5.171249369000179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 282.75, "completions/mean_terminated_length": 282.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.617429673671722, "epoch": 0.00159, "frac_reward_zero_std": 0.0, "grad_norm": 0.012413682416081429, "kl": 0.44416868314146996, "learning_rate": 9.999998249882414e-06, "loss": -0.0191, "num_tokens": 4166080.0, "reward": 2.787449836730957, "reward_std": 1.4347116947174072, "rewards/rollout_reward_func/mean": 2.787449836730957, "rewards/rollout_reward_func/std": 1.7297974824905396, "sampling/importance_sampling_ratio/max": 0.5567501187324524, "sampling/importance_sampling_ratio/mean": 0.2343512773513794, "sampling/importance_sampling_ratio/min": 3.580179833845965e-16, "sampling/sampling_logp_difference/max": 3.5624475479125977, "sampling/sampling_logp_difference/mean": 1.0345864295959473, "step": 159, "step_time": 8.048230264000267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.655277848243713, "epoch": 0.0016, "grad_norm": 0.011771933175623417, "kl": 0.4446682333946228, "learning_rate": 9.999998221309542e-06, "loss": -0.0191, "step": 160, "step_time": 4.566467165000631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 233.40625, "completions/mean_terminated_length": 233.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.265278160572052, "epoch": 0.00161, "frac_reward_zero_std": 0.0, "grad_norm": 0.013585682958364487, "kl": 0.4800906181335449, "learning_rate": 9.999998192505309e-06, "loss": -0.0153, "num_tokens": 4211781.0, "reward": 2.0160932540893555, "reward_std": 1.3408175706863403, "rewards/rollout_reward_func/mean": 2.0160932540893555, "rewards/rollout_reward_func/std": 1.7153640985488892, "sampling/importance_sampling_ratio/max": 0.5554280281066895, "sampling/importance_sampling_ratio/mean": 0.18753069639205933, "sampling/importance_sampling_ratio/min": 8.604307595305727e-07, "sampling/sampling_logp_difference/max": 4.6766486167907715, "sampling/sampling_logp_difference/mean": 1.2480790615081787, "step": 161, "step_time": 8.229338431000087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.281177341938019, "epoch": 0.00162, "grad_norm": 0.012755308300256729, "kl": 0.4964945949614048, "learning_rate": 9.999998163469716e-06, "loss": -0.0154, "step": 162, "step_time": 4.38510970399966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 557.375, "completions/mean_terminated_length": 552.1290283203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.4221086502075195, "epoch": 0.00163, "frac_reward_zero_std": 0.0, "grad_norm": 0.007355245761573315, "kl": 0.330900888890028, "learning_rate": 9.999998134202764e-06, "loss": -0.0155, "num_tokens": 4269709.0, "reward": 2.1687724590301514, "reward_std": 1.417752981185913, "rewards/rollout_reward_func/mean": 2.1687724590301514, "rewards/rollout_reward_func/std": 1.9265196323394775, "sampling/importance_sampling_ratio/max": 0.30897635221481323, "sampling/importance_sampling_ratio/mean": 0.12372960150241852, "sampling/importance_sampling_ratio/min": 2.4737660090288395e-17, "sampling/sampling_logp_difference/max": 3.5442941188812256, "sampling/sampling_logp_difference/mean": 1.2593567371368408, "step": 163, "step_time": 8.674119632999918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.411408066749573, "epoch": 0.00164, "grad_norm": 0.006261439062654972, "kl": 0.32452805899083614, "learning_rate": 9.999998104704453e-06, "loss": -0.0155, "step": 164, "step_time": 5.15820460599889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 324.21875, "completions/mean_terminated_length": 324.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.876501500606537, "epoch": 0.00165, "frac_reward_zero_std": 0.0, "grad_norm": 0.010572957806289196, "kl": 0.314052717294544, "learning_rate": 9.999998074974785e-06, "loss": -0.0065, "num_tokens": 4317340.0, "reward": 2.824183702468872, "reward_std": 1.7697460651397705, "rewards/rollout_reward_func/mean": 2.824183702468872, "rewards/rollout_reward_func/std": 1.7482030391693115, "sampling/importance_sampling_ratio/max": 0.5577415227890015, "sampling/importance_sampling_ratio/mean": 0.18741005659103394, "sampling/importance_sampling_ratio/min": 0.0003228384011890739, "sampling/sampling_logp_difference/max": 2.3370163440704346, "sampling/sampling_logp_difference/mean": 1.2416086196899414, "step": 165, "step_time": 8.260381059999418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.816007494926453, "epoch": 0.00166, "grad_norm": 0.010662592947483063, "kl": 0.31538827205076814, "learning_rate": 9.999998045013754e-06, "loss": -0.0065, "step": 166, "step_time": 4.7127935759995125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 546.21875, "completions/mean_terminated_length": 546.21875, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "entropy": 4.806870490312576, "epoch": 0.00167, "frac_reward_zero_std": 0.0, "grad_norm": 0.00802002102136612, "kl": 0.3522371258586645, "learning_rate": 9.999998014821366e-06, "loss": -0.0125, "num_tokens": 4376707.0, "reward": 4.180271148681641, "reward_std": 1.340552568435669, "rewards/rollout_reward_func/mean": 4.180271148681641, "rewards/rollout_reward_func/std": 1.3307310342788696, "sampling/importance_sampling_ratio/max": 0.31147125363349915, "sampling/importance_sampling_ratio/mean": 0.18969742953777313, "sampling/importance_sampling_ratio/min": 2.4195236403606748e-17, "sampling/sampling_logp_difference/max": 3.992608070373535, "sampling/sampling_logp_difference/mean": 0.8682562112808228, "step": 167, "step_time": 9.083963717000188 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 4.7781093418598175, "epoch": 0.00168, "grad_norm": 0.008071971125900745, "kl": 0.3539010286331177, "learning_rate": 9.999997984397618e-06, "loss": -0.0125, "step": 168, "step_time": 4.795684135000101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 399.5, "completions/mean_terminated_length": 399.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.353841185569763, "epoch": 0.00169, "frac_reward_zero_std": 0.0, "grad_norm": 0.010215525515377522, "kl": 0.4602816812694073, "learning_rate": 9.999997953742511e-06, "loss": -0.0044, "num_tokens": 4427943.0, "reward": 2.107025146484375, "reward_std": 1.3496294021606445, "rewards/rollout_reward_func/mean": 2.107025146484375, "rewards/rollout_reward_func/std": 1.8857003450393677, "sampling/importance_sampling_ratio/max": 0.5592086315155029, "sampling/importance_sampling_ratio/mean": 0.20410695672035217, "sampling/importance_sampling_ratio/min": 3.2384990522604795e-11, "sampling/sampling_logp_difference/max": 3.561155080795288, "sampling/sampling_logp_difference/mean": 0.9430296421051025, "step": 169, "step_time": 8.44894137799929 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 5.303603112697601, "epoch": 0.0017, "grad_norm": 0.010156241245567799, "kl": 0.4662858620285988, "learning_rate": 9.999997922856044e-06, "loss": -0.0044, "step": 170, "step_time": 5.110951400999511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 281.78125, "completions/mean_terminated_length": 281.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.283334970474243, "epoch": 0.00171, "frac_reward_zero_std": 0.25, "grad_norm": 0.014388482086360455, "kl": 0.5276032239198685, "learning_rate": 9.999997891738219e-06, "loss": -0.0148, "num_tokens": 4475664.0, "reward": 3.1777913570404053, "reward_std": 0.9462900757789612, "rewards/rollout_reward_func/mean": 3.1777913570404053, "rewards/rollout_reward_func/std": 1.581575870513916, "sampling/importance_sampling_ratio/max": 0.5628533959388733, "sampling/importance_sampling_ratio/mean": 0.24305231869220734, "sampling/importance_sampling_ratio/min": 0.0005128039629198611, "sampling/sampling_logp_difference/max": 2.273298501968384, "sampling/sampling_logp_difference/mean": 0.8266535997390747, "step": 171, "step_time": 8.091583472000366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.257599115371704, "epoch": 0.00172, "grad_norm": 0.014056864194571972, "kl": 0.5337305925786495, "learning_rate": 9.999997860389035e-06, "loss": -0.0148, "step": 172, "step_time": 4.970710163999684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 548.75, "completions/mean_terminated_length": 548.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.598077058792114, "epoch": 0.00173, "frac_reward_zero_std": 0.0, "grad_norm": 0.007702630013227463, "kl": 0.26329412683844566, "learning_rate": 9.99999782880849e-06, "loss": -0.0126, "num_tokens": 4534968.0, "reward": 3.599264144897461, "reward_std": 1.2832717895507812, "rewards/rollout_reward_func/mean": 3.599264144897461, "rewards/rollout_reward_func/std": 1.7741010189056396, "sampling/importance_sampling_ratio/max": 0.3131314814090729, "sampling/importance_sampling_ratio/mean": 0.15425240993499756, "sampling/importance_sampling_ratio/min": 0.0001230030320584774, "sampling/sampling_logp_difference/max": 3.5723962783813477, "sampling/sampling_logp_difference/mean": 0.9245375394821167, "step": 173, "step_time": 8.712299219000215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.617673218250275, "epoch": 0.00174, "grad_norm": 0.007035167887806892, "kl": 0.2626843862235546, "learning_rate": 9.999997796996588e-06, "loss": -0.0126, "step": 174, "step_time": 4.817434919999414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 611.65625, "completions/mean_terminated_length": 611.65625, "completions/min_length": 18.0, "completions/min_terminated_length": 18.0, "entropy": 5.546254634857178, "epoch": 0.00175, "frac_reward_zero_std": 0.0, "grad_norm": 0.007665017154067755, "kl": 0.2765345424413681, "learning_rate": 9.999997764953326e-06, "loss": -0.002, "num_tokens": 4596111.0, "reward": 3.0970826148986816, "reward_std": 1.2895809412002563, "rewards/rollout_reward_func/mean": 3.0970826148986816, "rewards/rollout_reward_func/std": 1.63455331325531, "sampling/importance_sampling_ratio/max": 0.31010711193084717, "sampling/importance_sampling_ratio/mean": 0.14346721768379211, "sampling/importance_sampling_ratio/min": 2.920134970419719e-20, "sampling/sampling_logp_difference/max": 14.014349937438965, "sampling/sampling_logp_difference/mean": 1.0683830976486206, "step": 175, "step_time": 9.429678833000253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.569147884845734, "epoch": 0.00176, "grad_norm": 0.007943429052829742, "kl": 0.27362857572734356, "learning_rate": 9.999997732678706e-06, "loss": -0.002, "step": 176, "step_time": 4.9659347020001405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 435.15625, "completions/mean_terminated_length": 432.58062744140625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.725856065750122, "epoch": 0.00177, "frac_reward_zero_std": 0.0, "grad_norm": 0.009265062399208546, "kl": 0.3029242120683193, "learning_rate": 9.999997700172724e-06, "loss": -0.0202, "num_tokens": 4651072.0, "reward": 3.301889181137085, "reward_std": 1.8498287200927734, "rewards/rollout_reward_func/mean": 3.301889181137085, "rewards/rollout_reward_func/std": 2.1379148960113525, "sampling/importance_sampling_ratio/max": 0.3099573254585266, "sampling/importance_sampling_ratio/mean": 0.14696773886680603, "sampling/importance_sampling_ratio/min": 5.326042341532999e-13, "sampling/sampling_logp_difference/max": 4.241425037384033, "sampling/sampling_logp_difference/mean": 1.0613305568695068, "step": 177, "step_time": 8.000326000999848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.7441646456718445, "epoch": 0.00178, "grad_norm": 0.009017222560942173, "kl": 0.2993904184550047, "learning_rate": 9.999997667435383e-06, "loss": -0.0202, "step": 178, "step_time": 4.911030336000749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 627.59375, "completions/mean_terminated_length": 622.8709716796875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.733899414539337, "epoch": 0.00179, "frac_reward_zero_std": 0.0, "grad_norm": 0.015371856279671192, "kl": 0.39605566393584013, "learning_rate": 9.999997634466684e-06, "loss": -0.0028, "num_tokens": 4713163.0, "reward": 2.408566951751709, "reward_std": 1.2345832586288452, "rewards/rollout_reward_func/mean": 2.408566951751709, "rewards/rollout_reward_func/std": 1.5583417415618896, "sampling/importance_sampling_ratio/max": 0.3045631945133209, "sampling/importance_sampling_ratio/mean": 0.14088614284992218, "sampling/importance_sampling_ratio/min": 3.260920675229406e-17, "sampling/sampling_logp_difference/max": 12.893571853637695, "sampling/sampling_logp_difference/mean": 1.2033199071884155, "step": 179, "step_time": 9.336971251999785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.755895584821701, "epoch": 0.0018, "grad_norm": 0.013228918425738811, "kl": 0.38902273029088974, "learning_rate": 9.999997601266627e-06, "loss": -0.0029, "step": 180, "step_time": 4.989180980000128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 591.15625, "completions/mean_terminated_length": 591.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.095756113529205, "epoch": 0.00181, "frac_reward_zero_std": 0.0, "grad_norm": 0.007322733756154776, "kl": 0.3757977671921253, "learning_rate": 9.999997567835209e-06, "loss": -0.0162, "num_tokens": 4774032.0, "reward": 2.0675711631774902, "reward_std": 0.6423474550247192, "rewards/rollout_reward_func/mean": 2.0675711631774902, "rewards/rollout_reward_func/std": 1.5976166725158691, "sampling/importance_sampling_ratio/max": 0.30949416756629944, "sampling/importance_sampling_ratio/mean": 0.1696314811706543, "sampling/importance_sampling_ratio/min": 0.0006361076375469565, "sampling/sampling_logp_difference/max": 3.4188907146453857, "sampling/sampling_logp_difference/mean": 0.8557475805282593, "step": 181, "step_time": 9.625972572999672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.120969295501709, "epoch": 0.00182, "grad_norm": 0.006717793643474579, "kl": 0.3749655243009329, "learning_rate": 9.999997534172434e-06, "loss": -0.0162, "step": 182, "step_time": 4.893760320000183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 281.8125, "completions/mean_terminated_length": 281.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.411740601062775, "epoch": 0.00183, "frac_reward_zero_std": 0.25, "grad_norm": 0.01275318581610918, "kl": 0.4849829040467739, "learning_rate": 9.999997500278298e-06, "loss": -0.012, "num_tokens": 4821524.0, "reward": 3.822073459625244, "reward_std": 1.2214256525039673, "rewards/rollout_reward_func/mean": 3.822073459625244, "rewards/rollout_reward_func/std": 1.457135558128357, "sampling/importance_sampling_ratio/max": 0.5602318644523621, "sampling/importance_sampling_ratio/mean": 0.2258593738079071, "sampling/importance_sampling_ratio/min": 8.38409050629707e-06, "sampling/sampling_logp_difference/max": 3.740847587585449, "sampling/sampling_logp_difference/mean": 0.9152244925498962, "step": 183, "step_time": 7.8192755909999505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 5.4009730219841, "epoch": 0.00184, "grad_norm": 0.012373281642794609, "kl": 0.49285488575696945, "learning_rate": 9.999997466152803e-06, "loss": -0.012, "step": 184, "step_time": 4.937281341000016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 324.21875, "completions/mean_terminated_length": 324.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.895534723997116, "epoch": 0.00185, "frac_reward_zero_std": 0.25, "grad_norm": 0.12174241244792938, "kl": 1.1918272729963064, "learning_rate": 9.999997431795949e-06, "loss": -0.0079, "num_tokens": 4872521.0, "reward": 2.4634082317352295, "reward_std": 0.7030278444290161, "rewards/rollout_reward_func/mean": 2.4634082317352295, "rewards/rollout_reward_func/std": 1.6452993154525757, "sampling/importance_sampling_ratio/max": 0.5586066246032715, "sampling/importance_sampling_ratio/mean": 0.22821499407291412, "sampling/importance_sampling_ratio/min": 2.045808008557789e-24, "sampling/sampling_logp_difference/max": 11.78732967376709, "sampling/sampling_logp_difference/mean": 1.2645323276519775, "step": 185, "step_time": 8.363861161000386 }, { "clip_ratio/high_max": 0.021875000093132257, "clip_ratio/high_mean": 0.010937500046566129, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010937500046566129, "entropy": 5.90417143702507, "epoch": 0.00186, "grad_norm": 0.06167863681912422, "kl": 0.7666090168058872, "learning_rate": 9.999997397207736e-06, "loss": -0.0089, "step": 186, "step_time": 4.719343276000018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 563.84375, "completions/mean_terminated_length": 581.51611328125, "completions/min_length": 16.0, "completions/min_terminated_length": 312.0, "entropy": 4.6642705500125885, "epoch": 0.00187, "frac_reward_zero_std": 0.0, "grad_norm": 0.011750214733183384, "kl": 0.43587249517440796, "learning_rate": 9.999997362388163e-06, "loss": -0.0142, "num_tokens": 4933144.0, "reward": 2.8686366081237793, "reward_std": 1.2804789543151855, "rewards/rollout_reward_func/mean": 2.8686366081237793, "rewards/rollout_reward_func/std": 1.7704367637634277, "sampling/importance_sampling_ratio/max": 0.3205587863922119, "sampling/importance_sampling_ratio/mean": 0.20105448365211487, "sampling/importance_sampling_ratio/min": 1.161242035863097e-08, "sampling/sampling_logp_difference/max": 3.013947010040283, "sampling/sampling_logp_difference/mean": 0.8054625988006592, "step": 187, "step_time": 9.18670280699962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.6675141751766205, "epoch": 0.00188, "grad_norm": 0.009789633564651012, "kl": 0.4357186071574688, "learning_rate": 9.999997327337232e-06, "loss": -0.0142, "step": 188, "step_time": 4.775712918999034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 487.1875, "completions/mean_terminated_length": 487.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.183352828025818, "epoch": 0.00189, "frac_reward_zero_std": 0.0, "grad_norm": 0.007217842619866133, "kl": 0.36203115805983543, "learning_rate": 9.99999729205494e-06, "loss": -0.0087, "num_tokens": 4991196.0, "reward": 2.807063102722168, "reward_std": 1.7962517738342285, "rewards/rollout_reward_func/mean": 2.807063102722168, "rewards/rollout_reward_func/std": 2.033599853515625, "sampling/importance_sampling_ratio/max": 0.3048250079154968, "sampling/importance_sampling_ratio/mean": 0.13251593708992004, "sampling/importance_sampling_ratio/min": 0.00014422468666452914, "sampling/sampling_logp_difference/max": 2.6139421463012695, "sampling/sampling_logp_difference/mean": 1.1694201231002808, "step": 189, "step_time": 8.603061843000887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.190056622028351, "epoch": 0.0019, "grad_norm": 0.0067920200526714325, "kl": 0.3644371014088392, "learning_rate": 9.99999725654129e-06, "loss": -0.0087, "step": 190, "step_time": 5.3103096270001515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 593.125, "completions/mean_terminated_length": 585.0967407226562, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.34462833404541, "epoch": 0.00191, "frac_reward_zero_std": 0.0, "grad_norm": 0.0056811366230249405, "kl": 0.34313227608799934, "learning_rate": 9.999997220796281e-06, "loss": -0.0185, "num_tokens": 5052550.0, "reward": 3.0586280822753906, "reward_std": 1.8891185522079468, "rewards/rollout_reward_func/mean": 3.0586280822753906, "rewards/rollout_reward_func/std": 1.9580395221710205, "sampling/importance_sampling_ratio/max": 0.3123818635940552, "sampling/importance_sampling_ratio/mean": 0.17983976006507874, "sampling/importance_sampling_ratio/min": 1.8176118621212876e-21, "sampling/sampling_logp_difference/max": 3.9357526302337646, "sampling/sampling_logp_difference/mean": 1.0673115253448486, "step": 191, "step_time": 9.136948834000577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.350877106189728, "epoch": 0.00192, "grad_norm": 0.006298670079559088, "kl": 0.3456762544810772, "learning_rate": 9.999997184819913e-06, "loss": -0.0185, "step": 192, "step_time": 5.421018731999538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 463.0625, "completions/mean_terminated_length": 463.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.9372279047966, "epoch": 0.00193, "frac_reward_zero_std": 0.25, "grad_norm": 0.006436275318264961, "kl": 0.4286567382514477, "learning_rate": 9.999997148612186e-06, "loss": -0.0093, "num_tokens": 5107904.0, "reward": 2.5031309127807617, "reward_std": 0.7777051329612732, "rewards/rollout_reward_func/mean": 2.5031309127807617, "rewards/rollout_reward_func/std": 1.5234891176223755, "sampling/importance_sampling_ratio/max": 0.556627631187439, "sampling/importance_sampling_ratio/mean": 0.24726936221122742, "sampling/importance_sampling_ratio/min": 1.4663258658697762e-21, "sampling/sampling_logp_difference/max": 11.847373008728027, "sampling/sampling_logp_difference/mean": 1.019608974456787, "step": 193, "step_time": 8.926773718000277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.932731568813324, "epoch": 0.00194, "grad_norm": 0.006187028717249632, "kl": 0.42864546552300453, "learning_rate": 9.9999971121731e-06, "loss": -0.0093, "step": 194, "step_time": 4.810186862000137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 289.0, "completions/mean_terminated_length": 289.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.024335443973541, "epoch": 0.00195, "frac_reward_zero_std": 0.25, "grad_norm": 0.014793330803513527, "kl": 0.5311180464923382, "learning_rate": 9.999997075502653e-06, "loss": -0.0, "num_tokens": 5153943.0, "reward": 2.680002212524414, "reward_std": 1.0660200119018555, "rewards/rollout_reward_func/mean": 2.680002212524414, "rewards/rollout_reward_func/std": 1.608717441558838, "sampling/importance_sampling_ratio/max": 0.556300938129425, "sampling/importance_sampling_ratio/mean": 0.1869335174560547, "sampling/importance_sampling_ratio/min": 8.39509803904695e-14, "sampling/sampling_logp_difference/max": 3.1887903213500977, "sampling/sampling_logp_difference/mean": 1.1006312370300293, "step": 195, "step_time": 8.258161678000306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.993339031934738, "epoch": 0.00196, "grad_norm": 0.01399671845138073, "kl": 0.5270919986069202, "learning_rate": 9.999997038600848e-06, "loss": -0.0, "step": 196, "step_time": 5.157405102000212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 810.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 596.125, "completions/mean_terminated_length": 586.7333374023438, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.484014600515366, "epoch": 0.00197, "frac_reward_zero_std": 0.0, "grad_norm": 0.011650401167571545, "kl": 0.25742718297988176, "learning_rate": 9.999997001467682e-06, "loss": -0.0218, "num_tokens": 5214471.0, "reward": 2.370987892150879, "reward_std": 1.2418047189712524, "rewards/rollout_reward_func/mean": 2.370987892150879, "rewards/rollout_reward_func/std": 1.7970783710479736, "sampling/importance_sampling_ratio/max": 0.30147287249565125, "sampling/importance_sampling_ratio/mean": 0.12158507108688354, "sampling/importance_sampling_ratio/min": 1.8635179788931512e-17, "sampling/sampling_logp_difference/max": 4.204184055328369, "sampling/sampling_logp_difference/mean": 1.3044867515563965, "step": 197, "step_time": 9.104788559999633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.468224465847015, "epoch": 0.00198, "grad_norm": 0.011098474264144897, "kl": 0.2528890473768115, "learning_rate": 9.99999696410316e-06, "loss": -0.0218, "step": 198, "step_time": 5.2834800850005195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 742.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 425.25, "completions/mean_terminated_length": 425.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.416600346565247, "epoch": 0.00199, "frac_reward_zero_std": 0.25, "grad_norm": 0.6569156646728516, "kl": 1.3255293928086758, "learning_rate": 9.999996926507279e-06, "loss": -0.0115, "num_tokens": 5267520.0, "reward": 3.124060869216919, "reward_std": 1.0585153102874756, "rewards/rollout_reward_func/mean": 3.124060869216919, "rewards/rollout_reward_func/std": 1.6291773319244385, "sampling/importance_sampling_ratio/max": 0.5548077821731567, "sampling/importance_sampling_ratio/mean": 0.2113969922065735, "sampling/importance_sampling_ratio/min": 0.0004637441597878933, "sampling/sampling_logp_difference/max": 2.2059574127197266, "sampling/sampling_logp_difference/mean": 0.9068723320960999, "step": 199, "step_time": 8.44307382299985 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 5.432026922702789, "epoch": 0.002, "grad_norm": 0.008453106507658958, "kl": 0.38647904247045517, "learning_rate": 9.999996888680038e-06, "loss": -0.014, "step": 200, "step_time": 4.681182854999861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 712.0, "completions/max_terminated_length": 712.0, "completions/mean_length": 360.5625, "completions/mean_terminated_length": 371.6773986816406, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.117161571979523, "epoch": 0.00201, "frac_reward_zero_std": 0.25, "grad_norm": 0.01611809805035591, "kl": 0.4253620970994234, "learning_rate": 9.999996850621436e-06, "loss": -0.0152, "num_tokens": 5318093.0, "reward": 2.912040948867798, "reward_std": 0.7651806473731995, "rewards/rollout_reward_func/mean": 2.912040948867798, "rewards/rollout_reward_func/std": 1.8777114152908325, "sampling/importance_sampling_ratio/max": 0.556606113910675, "sampling/importance_sampling_ratio/mean": 0.21097299456596375, "sampling/importance_sampling_ratio/min": 2.815273847378563e-10, "sampling/sampling_logp_difference/max": 4.575296401977539, "sampling/sampling_logp_difference/mean": 0.9073153734207153, "step": 201, "step_time": 8.590784671000165 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 5.127772510051727, "epoch": 0.00202, "grad_norm": 0.01754254475235939, "kl": 0.42357040755450726, "learning_rate": 9.999996812331476e-06, "loss": -0.0152, "step": 202, "step_time": 4.51354878300026 }, { "clip_ratio/high_max": 0.022727273404598236, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "completions/clipped_ratio": 0.0625, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 246.34375, "completions/mean_terminated_length": 251.36668395996094, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.282517075538635, "epoch": 0.00203, "frac_reward_zero_std": 0.25, "grad_norm": 0.014741458930075169, "kl": 0.41361628845334053, "learning_rate": 9.999996773810157e-06, "loss": -0.009, "num_tokens": 5363579.0, "reward": 2.7251787185668945, "reward_std": 0.8450720310211182, "rewards/rollout_reward_func/mean": 2.7251787185668945, "rewards/rollout_reward_func/std": 1.7295880317687988, "sampling/importance_sampling_ratio/max": 0.5568343997001648, "sampling/importance_sampling_ratio/mean": 0.1812913417816162, "sampling/importance_sampling_ratio/min": 9.55414493809214e-19, "sampling/sampling_logp_difference/max": 12.835467338562012, "sampling/sampling_logp_difference/mean": 1.3004571199417114, "step": 203, "step_time": 8.776359550999132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.294387757778168, "epoch": 0.00204, "grad_norm": 0.018707161769270897, "kl": 0.41007015481591225, "learning_rate": 9.99999673505748e-06, "loss": -0.0089, "step": 204, "step_time": 5.04047468599947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 712.0, "completions/max_terminated_length": 712.0, "completions/mean_length": 226.71875, "completions/mean_terminated_length": 233.51612854003906, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.579507499933243, "epoch": 0.00205, "frac_reward_zero_std": 0.25, "grad_norm": 0.015186581760644913, "kl": 0.4859627615660429, "learning_rate": 9.999996696073441e-06, "loss": -0.0153, "num_tokens": 5407892.0, "reward": 3.182164192199707, "reward_std": 1.4856499433517456, "rewards/rollout_reward_func/mean": 3.182164192199707, "rewards/rollout_reward_func/std": 1.7880464792251587, "sampling/importance_sampling_ratio/max": 0.5547261238098145, "sampling/importance_sampling_ratio/mean": 0.23533114790916443, "sampling/importance_sampling_ratio/min": 1.2176733044458061e-14, "sampling/sampling_logp_difference/max": 5.080104351043701, "sampling/sampling_logp_difference/mean": 1.195305347442627, "step": 205, "step_time": 8.019790785999703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0024999999441206455, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024999999441206455, "entropy": 5.567036747932434, "epoch": 0.00206, "grad_norm": 0.015953045338392258, "kl": 0.4858721327036619, "learning_rate": 9.999996656858045e-06, "loss": -0.0153, "step": 206, "step_time": 4.494372066999858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 257.25, "completions/mean_terminated_length": 265.0322570800781, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.651438236236572, "epoch": 0.00207, "frac_reward_zero_std": 0.25, "grad_norm": 0.02735012210905552, "kl": 0.4778033494949341, "learning_rate": 9.99999661741129e-06, "loss": -0.0107, "num_tokens": 5454600.0, "reward": 3.8261237144470215, "reward_std": 1.0173990726470947, "rewards/rollout_reward_func/mean": 3.8261237144470215, "rewards/rollout_reward_func/std": 1.3678010702133179, "sampling/importance_sampling_ratio/max": 0.5530157089233398, "sampling/importance_sampling_ratio/mean": 0.22838255763053894, "sampling/importance_sampling_ratio/min": 6.239156959964021e-07, "sampling/sampling_logp_difference/max": 3.2890069484710693, "sampling/sampling_logp_difference/mean": 0.932339072227478, "step": 207, "step_time": 8.438589544000479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.5857046246528625, "epoch": 0.00208, "grad_norm": 0.026330100372433662, "kl": 0.4851069003343582, "learning_rate": 9.999996577733175e-06, "loss": -0.0107, "step": 208, "step_time": 4.460181868000291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 397.125, "completions/mean_terminated_length": 397.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.61989688873291, "epoch": 0.00209, "frac_reward_zero_std": 0.25, "grad_norm": 0.03519469127058983, "kl": 0.4516414776444435, "learning_rate": 9.9999965378237e-06, "loss": -0.0089, "num_tokens": 5507788.0, "reward": 2.7961456775665283, "reward_std": 1.177910566329956, "rewards/rollout_reward_func/mean": 2.7961456775665283, "rewards/rollout_reward_func/std": 1.9247767925262451, "sampling/importance_sampling_ratio/max": 0.5473094582557678, "sampling/importance_sampling_ratio/mean": 0.18874725699424744, "sampling/importance_sampling_ratio/min": 0.00020230493100825697, "sampling/sampling_logp_difference/max": 2.5100324153900146, "sampling/sampling_logp_difference/mean": 0.9597841501235962, "step": 209, "step_time": 9.165979287000027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.536510765552521, "epoch": 0.0021, "grad_norm": 0.02714668959379196, "kl": 0.45711198449134827, "learning_rate": 9.999996497682868e-06, "loss": -0.009, "step": 210, "step_time": 4.771343512000385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 695.15625, "completions/mean_terminated_length": 695.15625, "completions/min_length": 585.0, "completions/min_terminated_length": 585.0, "entropy": 5.806370496749878, "epoch": 0.00211, "frac_reward_zero_std": 0.0, "grad_norm": 0.02025657892227173, "kl": 0.2787773534655571, "learning_rate": 9.999996457310676e-06, "loss": -0.0145, "num_tokens": 5572137.0, "reward": 2.248298168182373, "reward_std": 0.9219829440116882, "rewards/rollout_reward_func/mean": 2.248298168182373, "rewards/rollout_reward_func/std": 1.4261958599090576, "sampling/importance_sampling_ratio/max": 0.301658034324646, "sampling/importance_sampling_ratio/mean": 0.13740620017051697, "sampling/importance_sampling_ratio/min": 9.007880552580616e-10, "sampling/sampling_logp_difference/max": 3.91507887840271, "sampling/sampling_logp_difference/mean": 0.9571284055709839, "step": 211, "step_time": 9.053193517001091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 5.7584188580513, "epoch": 0.00212, "grad_norm": 0.017554691061377525, "kl": 0.28337166644632816, "learning_rate": 9.999996416707125e-06, "loss": -0.0145, "step": 212, "step_time": 4.910072835999927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 306.875, "completions/mean_terminated_length": 306.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.477221041917801, "epoch": 0.00213, "frac_reward_zero_std": 0.5, "grad_norm": 0.012003457173705101, "kl": 0.6022003293037415, "learning_rate": 9.999996375872214e-06, "loss": -0.0063, "num_tokens": 5618850.0, "reward": 2.7603349685668945, "reward_std": 0.35092732310295105, "rewards/rollout_reward_func/mean": 2.7603349685668945, "rewards/rollout_reward_func/std": 1.3781588077545166, "sampling/importance_sampling_ratio/max": 0.5524585843086243, "sampling/importance_sampling_ratio/mean": 0.3040264844894409, "sampling/importance_sampling_ratio/min": 0.0008875139756128192, "sampling/sampling_logp_difference/max": 2.6261608600616455, "sampling/sampling_logp_difference/mean": 0.7000038623809814, "step": 213, "step_time": 8.743301202000112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.459619641304016, "epoch": 0.00214, "grad_norm": 0.010844763368368149, "kl": 0.6048287376761436, "learning_rate": 9.999996334805946e-06, "loss": -0.0063, "step": 214, "step_time": 4.693870484000399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 359.09375, "completions/mean_terminated_length": 370.1612854003906, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.843548268079758, "epoch": 0.00215, "frac_reward_zero_std": 0.0, "grad_norm": 0.047307442873716354, "kl": 1.0721620507538319, "learning_rate": 9.999996293508317e-06, "loss": -0.012, "num_tokens": 5670482.0, "reward": 2.6061129570007324, "reward_std": 0.6701341867446899, "rewards/rollout_reward_func/mean": 2.6061129570007324, "rewards/rollout_reward_func/std": 1.2390384674072266, "sampling/importance_sampling_ratio/max": 0.5575221180915833, "sampling/importance_sampling_ratio/mean": 0.2627297639846802, "sampling/importance_sampling_ratio/min": 5.8882815068272976e-08, "sampling/sampling_logp_difference/max": 2.7387423515319824, "sampling/sampling_logp_difference/mean": 0.7991349101066589, "step": 215, "step_time": 9.773670517000028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.866431772708893, "epoch": 0.00216, "grad_norm": 0.02840716764330864, "kl": 0.9429246261715889, "learning_rate": 9.999996251979329e-06, "loss": -0.0121, "step": 216, "step_time": 5.003656359999695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 386.65625, "completions/mean_terminated_length": 386.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.9683555364608765, "epoch": 0.00217, "frac_reward_zero_std": 0.25, "grad_norm": 0.056511927396059036, "kl": 0.435198824852705, "learning_rate": 9.999996210218981e-06, "loss": -0.0082, "num_tokens": 5723431.0, "reward": 2.3611326217651367, "reward_std": 1.23284113407135, "rewards/rollout_reward_func/mean": 2.3611326217651367, "rewards/rollout_reward_func/std": 2.1599881649017334, "sampling/importance_sampling_ratio/max": 0.5584315657615662, "sampling/importance_sampling_ratio/mean": 0.23213379085063934, "sampling/importance_sampling_ratio/min": 9.592416063242126e-06, "sampling/sampling_logp_difference/max": 3.0070853233337402, "sampling/sampling_logp_difference/mean": 0.7948013544082642, "step": 217, "step_time": 8.342223236000336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 4.992517560720444, "epoch": 0.00218, "grad_norm": 0.011323577724397182, "kl": 0.43109437450766563, "learning_rate": 9.999996168227277e-06, "loss": -0.0084, "step": 218, "step_time": 4.607645754999339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 301.9375, "completions/mean_terminated_length": 301.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.874771684408188, "epoch": 0.00219, "frac_reward_zero_std": 0.5, "grad_norm": 0.031172219663858414, "kl": 0.6132681779563427, "learning_rate": 9.999996126004213e-06, "loss": -0.0065, "num_tokens": 5770701.0, "reward": 2.9851279258728027, "reward_std": 0.8050676584243774, "rewards/rollout_reward_func/mean": 2.9851279258728027, "rewards/rollout_reward_func/std": 1.344331979751587, "sampling/importance_sampling_ratio/max": 0.5544815063476562, "sampling/importance_sampling_ratio/mean": 0.24593959748744965, "sampling/importance_sampling_ratio/min": 4.177666784559164e-14, "sampling/sampling_logp_difference/max": 4.385872840881348, "sampling/sampling_logp_difference/mean": 0.8459481000900269, "step": 219, "step_time": 8.750391007999951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 4.893818587064743, "epoch": 0.0022, "grad_norm": 0.03290743753314018, "kl": 0.6108976900577545, "learning_rate": 9.999996083549788e-06, "loss": -0.0065, "step": 220, "step_time": 4.7656036020011925 }, { "clip_ratio/high_max": 0.00657894741743803, "clip_ratio/high_mean": 0.003289473708719015, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009539473801851273, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 569.1875, "completions/mean_terminated_length": 569.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.575940668582916, "epoch": 0.00221, "frac_reward_zero_std": 0.0, "grad_norm": 0.011595762334764004, "kl": 0.347899217158556, "learning_rate": 9.999996040864003e-06, "loss": -0.0171, "num_tokens": 5830699.0, "reward": 3.000262498855591, "reward_std": 1.67076575756073, "rewards/rollout_reward_func/mean": 3.000262498855591, "rewards/rollout_reward_func/std": 1.8007228374481201, "sampling/importance_sampling_ratio/max": 0.3085794150829315, "sampling/importance_sampling_ratio/mean": 0.15168991684913635, "sampling/importance_sampling_ratio/min": 4.0518877315876e-12, "sampling/sampling_logp_difference/max": 2.8902509212493896, "sampling/sampling_logp_difference/mean": 0.9849369525909424, "step": 221, "step_time": 9.164814059999571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 5.600629568099976, "epoch": 0.00222, "grad_norm": 0.010892020538449287, "kl": 0.3483094722032547, "learning_rate": 9.999995997946861e-06, "loss": -0.0171, "step": 222, "step_time": 4.744326772000022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 410.9375, "completions/mean_terminated_length": 410.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.076228618621826, "epoch": 0.00223, "frac_reward_zero_std": 0.25, "grad_norm": 0.007272793911397457, "kl": 0.37392777390778065, "learning_rate": 9.999995954798361e-06, "loss": -0.0002, "num_tokens": 5882689.0, "reward": 2.952998161315918, "reward_std": 1.2456332445144653, "rewards/rollout_reward_func/mean": 2.952998161315918, "rewards/rollout_reward_func/std": 1.8146508932113647, "sampling/importance_sampling_ratio/max": 0.5566303133964539, "sampling/importance_sampling_ratio/mean": 0.1916414350271225, "sampling/importance_sampling_ratio/min": 5.414286691375744e-15, "sampling/sampling_logp_difference/max": 3.779116630554199, "sampling/sampling_logp_difference/mean": 1.1832430362701416, "step": 223, "step_time": 8.38111467600038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.1000664830207825, "epoch": 0.00224, "grad_norm": 0.00639992905780673, "kl": 0.37375164218246937, "learning_rate": 9.9999959114185e-06, "loss": -0.0002, "step": 224, "step_time": 5.022103202999915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 251.3125, "completions/mean_terminated_length": 258.9032287597656, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.927689909934998, "epoch": 0.00225, "frac_reward_zero_std": 0.0, "grad_norm": 0.027866557240486145, "kl": 0.44809701666235924, "learning_rate": 9.999995867807281e-06, "loss": -0.0124, "num_tokens": 5928498.0, "reward": 2.4831409454345703, "reward_std": 0.8753278255462646, "rewards/rollout_reward_func/mean": 2.4831409454345703, "rewards/rollout_reward_func/std": 1.5412579774856567, "sampling/importance_sampling_ratio/max": 0.5537245869636536, "sampling/importance_sampling_ratio/mean": 0.21668727695941925, "sampling/importance_sampling_ratio/min": 7.331964479995179e-10, "sampling/sampling_logp_difference/max": 3.372708797454834, "sampling/sampling_logp_difference/mean": 1.118770956993103, "step": 225, "step_time": 8.150611867999942 } ], "logging_steps": 1.0, "max_steps": 400000, "num_input_tokens_seen": 5928498, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }