diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18114 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 565, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 71.78125, + "completions/mean_terminated_length": 71.78125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.43730998039245605, + "epoch": 0.0017699115044247787, + "frac_reward_zero_std": 0.75, + "grad_norm": 14.733362336222088, + "kl": 0.0, + "learning_rate": 0.0, + "loss": 0.2291, + "num_tokens": 14258.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.748260736465454, + "sampling/importance_sampling_ratio/mean": 1.0003552436828613, + "sampling/importance_sampling_ratio/min": 0.3887093663215637, + "sampling/sampling_logp_difference/max": 0.9449234008789062, + "sampling/sampling_logp_difference/mean": 0.020397081971168518, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 61.546875, + "completions/mean_terminated_length": 61.546875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2651497721672058, + "epoch": 0.0035398230088495575, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.4247787610619464e-09, + "loss": 0.0, + "num_tokens": 28357.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2627105712890625, + "sampling/importance_sampling_ratio/mean": 0.9985069036483765, + "sampling/importance_sampling_ratio/min": 0.6752722263336182, + "sampling/sampling_logp_difference/max": 0.3926393985748291, + "sampling/sampling_logp_difference/mean": 0.01767190359532833, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 32.0, + "completions/mean_terminated_length": 32.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2806015908718109, + "epoch": 0.005309734513274336, + "frac_reward_zero_std": 0.75, + "grad_norm": 23.26227259495331, + "kl": 0.0005717705935239792, + "learning_rate": 8.849557522123893e-09, + "loss": 0.1264, + "num_tokens": 44085.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994414448738098, + "sampling/importance_sampling_ratio/min": 0.6796828508377075, + "sampling/sampling_logp_difference/max": 1.3504528999328613, + "sampling/sampling_logp_difference/mean": 0.020524393767118454, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 35.875, + "completions/mean_terminated_length": 35.875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.23058325052261353, + "epoch": 0.007079646017699115, + "frac_reward_zero_std": 0.5, + "grad_norm": 10.797167113972659, + "kl": 0.0005771875730715692, + "learning_rate": 1.327433628318584e-08, + "loss": -0.0717, + "num_tokens": 58621.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3210742473602295, + "sampling/importance_sampling_ratio/mean": 1.002717137336731, + "sampling/importance_sampling_ratio/min": 0.5016065239906311, + "sampling/sampling_logp_difference/max": 0.6899392604827881, + "sampling/sampling_logp_difference/mean": 0.01839931309223175, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 89.03125, + "completions/mean_terminated_length": 89.03125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4322381019592285, + "epoch": 0.008849557522123894, + "frac_reward_zero_std": 0.5, + "grad_norm": 6.9613059543793785, + "kl": 0.0008884783601388335, + "learning_rate": 1.7699115044247786e-08, + "loss": 0.0106, + "num_tokens": 75759.0, + "reward": 0.125, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.5849732160568237, + "sampling/importance_sampling_ratio/mean": 0.9992538690567017, + "sampling/importance_sampling_ratio/min": 0.6169490814208984, + "sampling/sampling_logp_difference/max": 0.482968807220459, + "sampling/sampling_logp_difference/mean": 0.01715037226676941, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 91.265625, + "completions/mean_terminated_length": 91.265625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.45085543394088745, + "epoch": 0.010619469026548672, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013659497090174135, + "kl": 0.00046727299923077226, + "learning_rate": 2.2123893805309735e-08, + "loss": 0.0, + "num_tokens": 92496.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3823795318603516, + "sampling/importance_sampling_ratio/mean": 0.9996471405029297, + "sampling/importance_sampling_ratio/min": 0.6376644372940063, + "sampling/sampling_logp_difference/max": 0.44994306564331055, + "sampling/sampling_logp_difference/mean": 0.01948455721139908, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 91.75, + "completions/mean_terminated_length": 91.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.5193246006965637, + "epoch": 0.012389380530973451, + "frac_reward_zero_std": 0.75, + "grad_norm": 7.340318994776948, + "kl": 0.0008482749690301716, + "learning_rate": 2.654867256637168e-08, + "loss": 0.1265, + "num_tokens": 112880.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.9475336074829102, + "sampling/importance_sampling_ratio/mean": 1.0007236003875732, + "sampling/importance_sampling_ratio/min": 0.4580051898956299, + "sampling/sampling_logp_difference/max": 0.7808747291564941, + "sampling/sampling_logp_difference/mean": 0.023239202797412872, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 132.0, + "completions/max_terminated_length": 132.0, + "completions/mean_length": 36.015625, + "completions/mean_terminated_length": 36.015625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.20200984179973602, + "epoch": 0.01415929203539823, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.4503832105678285, + "kl": 0.0009719593799673021, + "learning_rate": 3.0973451327433626e-08, + "loss": 0.0025, + "num_tokens": 126161.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.3440406322479248, + "sampling/importance_sampling_ratio/mean": 1.001471757888794, + "sampling/importance_sampling_ratio/min": 0.7208306193351746, + "sampling/sampling_logp_difference/max": 0.32735109329223633, + "sampling/sampling_logp_difference/mean": 0.012839527800679207, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 117.671875, + "completions/mean_terminated_length": 117.671875, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 0.5152678489685059, + "epoch": 0.01592920353982301, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00915880506969509, + "kl": 0.0004841999616473913, + "learning_rate": 3.539823008849557e-08, + "loss": 0.0, + "num_tokens": 143500.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999586820602417, + "sampling/importance_sampling_ratio/min": 0.6153971552848816, + "sampling/sampling_logp_difference/max": 0.7832646369934082, + "sampling/sampling_logp_difference/mean": 0.018428195267915726, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 67.609375, + "completions/mean_terminated_length": 67.609375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2896856963634491, + "epoch": 0.017699115044247787, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022535488141927375, + "kl": 0.0009633470908738673, + "learning_rate": 3.982300884955752e-08, + "loss": 0.0, + "num_tokens": 159507.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4800368547439575, + "sampling/importance_sampling_ratio/mean": 0.9995597004890442, + "sampling/importance_sampling_ratio/min": 0.7342386245727539, + "sampling/sampling_logp_difference/max": 0.39206695556640625, + "sampling/sampling_logp_difference/mean": 0.012184200808405876, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 65.53125, + "completions/mean_terminated_length": 65.53125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.19404293596744537, + "epoch": 0.019469026548672566, + "frac_reward_zero_std": 0.25, + "grad_norm": 6.824992842595387, + "kl": 0.0015200059860944748, + "learning_rate": 4.424778761061947e-08, + "loss": -0.0771, + "num_tokens": 173685.0, + "reward": 0.90625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.4212366342544556, + "sampling/importance_sampling_ratio/mean": 1.0023908615112305, + "sampling/importance_sampling_ratio/min": 0.6726571917533875, + "sampling/sampling_logp_difference/max": 0.3965195417404175, + "sampling/sampling_logp_difference/mean": 0.014005091972649097, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 100.984375, + "completions/mean_terminated_length": 100.984375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.365769624710083, + "epoch": 0.021238938053097345, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014960286603652508, + "kl": 0.0005460747634060681, + "learning_rate": 4.8672566371681415e-08, + "loss": 0.0, + "num_tokens": 190452.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9997556209564209, + "sampling/importance_sampling_ratio/min": 0.4474831819534302, + "sampling/sampling_logp_difference/max": 0.8911995887756348, + "sampling/sampling_logp_difference/mean": 0.013871857896447182, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 106.375, + "completions/mean_terminated_length": 106.375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.303292840719223, + "epoch": 0.023008849557522124, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.5792693910632183, + "kl": 0.0007246644818224013, + "learning_rate": 5.309734513274336e-08, + "loss": -0.1241, + "num_tokens": 208620.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.6080149412155151, + "sampling/importance_sampling_ratio/mean": 1.0023037195205688, + "sampling/importance_sampling_ratio/min": 0.5335606932640076, + "sampling/sampling_logp_difference/max": 0.6281824111938477, + "sampling/sampling_logp_difference/mean": 0.017998535186052322, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 34.828125, + "completions/mean_terminated_length": 34.828125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.17239561676979065, + "epoch": 0.024778761061946902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014602460582552898, + "kl": 0.000353145704139024, + "learning_rate": 5.7522123893805306e-08, + "loss": 0.0, + "num_tokens": 224305.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5867887735366821, + "sampling/importance_sampling_ratio/mean": 1.0006211996078491, + "sampling/importance_sampling_ratio/min": 0.39414942264556885, + "sampling/sampling_logp_difference/max": 0.9310252666473389, + "sampling/sampling_logp_difference/mean": 0.013991497457027435, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 86.9375, + "completions/mean_terminated_length": 86.9375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3107295036315918, + "epoch": 0.02654867256637168, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007229437086416403, + "kl": 0.0005225496715866029, + "learning_rate": 6.194690265486725e-08, + "loss": 0.0, + "num_tokens": 243069.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4709138870239258, + "sampling/importance_sampling_ratio/mean": 0.9995893239974976, + "sampling/importance_sampling_ratio/min": 0.65098637342453, + "sampling/sampling_logp_difference/max": 0.4292665719985962, + "sampling/sampling_logp_difference/mean": 0.01991843990981579, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 46.296875, + "completions/mean_terminated_length": 46.296875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.32953619956970215, + "epoch": 0.02831858407079646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02767166983547206, + "kl": 0.00113023747690022, + "learning_rate": 6.63716814159292e-08, + "loss": 0.0, + "num_tokens": 257248.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6294353008270264, + "sampling/importance_sampling_ratio/mean": 0.9990890026092529, + "sampling/importance_sampling_ratio/min": 0.6623630523681641, + "sampling/sampling_logp_difference/max": 0.4882335662841797, + "sampling/sampling_logp_difference/mean": 0.0199204720556736, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 44.796875, + "completions/mean_terminated_length": 44.796875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.17392346262931824, + "epoch": 0.03008849557522124, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01600255624211106, + "kl": 0.0004199473187327385, + "learning_rate": 7.079646017699114e-08, + "loss": 0.0, + "num_tokens": 270659.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.328696370124817, + "sampling/importance_sampling_ratio/mean": 0.9982379674911499, + "sampling/importance_sampling_ratio/min": 0.6624902486801147, + "sampling/sampling_logp_difference/max": 0.4117494821548462, + "sampling/sampling_logp_difference/mean": 0.01237030141055584, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 126.0, + "completions/max_terminated_length": 126.0, + "completions/mean_length": 28.84375, + "completions/mean_terminated_length": 28.84375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.159395232796669, + "epoch": 0.03185840707964602, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.513435214167038, + "kl": 0.0023553031496703625, + "learning_rate": 7.52212389380531e-08, + "loss": -0.0075, + "num_tokens": 282889.0, + "reward": -0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.9696966409683228, + "sampling/importance_sampling_ratio/mean": 0.9974421262741089, + "sampling/importance_sampling_ratio/min": 0.5129315257072449, + "sampling/sampling_logp_difference/max": 0.6778795719146729, + "sampling/sampling_logp_difference/mean": 0.022663142532110214, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 73.859375, + "completions/mean_terminated_length": 73.859375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2566125690937042, + "epoch": 0.033628318584070796, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01530068562910834, + "kl": 0.0003858095151372254, + "learning_rate": 7.964601769911503e-08, + "loss": 0.0, + "num_tokens": 299264.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.795694351196289, + "sampling/importance_sampling_ratio/mean": 1.0008893013000488, + "sampling/importance_sampling_ratio/min": 0.5062554478645325, + "sampling/sampling_logp_difference/max": 0.6807138919830322, + "sampling/sampling_logp_difference/mean": 0.014813372865319252, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 89.984375, + "completions/mean_terminated_length": 89.984375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4346693754196167, + "epoch": 0.035398230088495575, + "frac_reward_zero_std": 0.5, + "grad_norm": 5.7359841983795175, + "kl": 0.0004782738978974521, + "learning_rate": 8.4070796460177e-08, + "loss": 0.0061, + "num_tokens": 314959.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.620740532875061, + "sampling/importance_sampling_ratio/mean": 0.9991461038589478, + "sampling/importance_sampling_ratio/min": 0.5147078633308411, + "sampling/sampling_logp_difference/max": 0.6641558408737183, + "sampling/sampling_logp_difference/mean": 0.01632934808731079, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 69.453125, + "completions/mean_terminated_length": 69.453125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3627976179122925, + "epoch": 0.03716814159292035, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.59221061699448, + "kl": 0.0004590965108945966, + "learning_rate": 8.849557522123894e-08, + "loss": 0.0508, + "num_tokens": 329756.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.8821483850479126, + "sampling/importance_sampling_ratio/mean": 1.0001276731491089, + "sampling/importance_sampling_ratio/min": 0.4833468794822693, + "sampling/sampling_logp_difference/max": 0.7270207405090332, + "sampling/sampling_logp_difference/mean": 0.02273380383849144, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 39.90625, + "completions/mean_terminated_length": 39.90625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.24342747032642365, + "epoch": 0.03893805309734513, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.7038806022357496, + "kl": 0.0023908826988190413, + "learning_rate": 9.292035398230089e-08, + "loss": 0.0698, + "num_tokens": 344998.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.872116208076477, + "sampling/importance_sampling_ratio/mean": 0.9997944235801697, + "sampling/importance_sampling_ratio/min": 0.19888915121555328, + "sampling/sampling_logp_difference/max": 1.6150076389312744, + "sampling/sampling_logp_difference/mean": 0.019733227789402008, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 39.5, + "completions/mean_terminated_length": 39.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.16345232725143433, + "epoch": 0.04070796460176991, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04997415895244329, + "kl": 0.0014484870480373502, + "learning_rate": 9.734513274336283e-08, + "loss": 0.0, + "num_tokens": 357894.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5752054452896118, + "sampling/importance_sampling_ratio/mean": 0.999841570854187, + "sampling/importance_sampling_ratio/min": 0.6862316131591797, + "sampling/sampling_logp_difference/max": 0.45438575744628906, + "sampling/sampling_logp_difference/mean": 0.019694484770298004, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 52.890625, + "completions/mean_terminated_length": 52.890625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.37000778317451477, + "epoch": 0.04247787610619469, + "frac_reward_zero_std": 0.75, + "grad_norm": 8.338317286703484, + "kl": 0.000826447329018265, + "learning_rate": 1.0176991150442478e-07, + "loss": -0.1221, + "num_tokens": 372159.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0006983280181885, + "sampling/importance_sampling_ratio/min": 0.5474603176116943, + "sampling/sampling_logp_difference/max": 0.7353544235229492, + "sampling/sampling_logp_difference/mean": 0.018441716209053993, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 65.078125, + "completions/mean_terminated_length": 65.078125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.33777666091918945, + "epoch": 0.04424778761061947, + "frac_reward_zero_std": 0.75, + "grad_norm": 11.659760033795731, + "kl": 0.0006184541853144765, + "learning_rate": 1.0619469026548672e-07, + "loss": -0.1307, + "num_tokens": 388756.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999577522277832, + "sampling/importance_sampling_ratio/min": 0.46445992588996887, + "sampling/sampling_logp_difference/max": 1.3790621757507324, + "sampling/sampling_logp_difference/mean": 0.019229542464017868, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 69.046875, + "completions/mean_terminated_length": 69.046875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.25738662481307983, + "epoch": 0.04601769911504425, + "frac_reward_zero_std": 0.75, + "grad_norm": 9.487071516705344, + "kl": 0.0003331995103508234, + "learning_rate": 1.1061946902654867e-07, + "loss": -0.2253, + "num_tokens": 404391.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.2855082750320435, + "sampling/importance_sampling_ratio/mean": 0.9986451864242554, + "sampling/importance_sampling_ratio/min": 0.6865718364715576, + "sampling/sampling_logp_difference/max": 0.3760443925857544, + "sampling/sampling_logp_difference/mean": 0.018185608088970184, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 78.109375, + "completions/mean_terminated_length": 78.109375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.44814449548721313, + "epoch": 0.047787610619469026, + "frac_reward_zero_std": 0.75, + "grad_norm": 7.756923111271524, + "kl": 0.00046124885557219386, + "learning_rate": 1.1504424778761061e-07, + "loss": -0.1409, + "num_tokens": 419470.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.4353742599487305, + "sampling/importance_sampling_ratio/mean": 0.9993364214897156, + "sampling/importance_sampling_ratio/min": 0.5488451719284058, + "sampling/sampling_logp_difference/max": 0.5999388694763184, + "sampling/sampling_logp_difference/mean": 0.018295522779226303, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 76.953125, + "completions/mean_terminated_length": 76.953125, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.31486934423446655, + "epoch": 0.049557522123893805, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009386442378514733, + "kl": 0.00041278538992628455, + "learning_rate": 1.1946902654867256e-07, + "loss": 0.0, + "num_tokens": 435595.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5819377899169922, + "sampling/importance_sampling_ratio/mean": 1.0006744861602783, + "sampling/importance_sampling_ratio/min": 0.4626096785068512, + "sampling/sampling_logp_difference/max": 0.770871639251709, + "sampling/sampling_logp_difference/mean": 0.01879153959453106, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 98.671875, + "completions/mean_terminated_length": 98.671875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4051646590232849, + "epoch": 0.05132743362831858, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014069863796196987, + "kl": 0.0004532830498646945, + "learning_rate": 1.238938053097345e-07, + "loss": 0.0, + "num_tokens": 452342.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7859727144241333, + "sampling/importance_sampling_ratio/mean": 0.9995170831680298, + "sampling/importance_sampling_ratio/min": 0.5645666718482971, + "sampling/sampling_logp_difference/max": 0.579963207244873, + "sampling/sampling_logp_difference/mean": 0.017181504517793655, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 231.0, + "completions/max_terminated_length": 231.0, + "completions/mean_length": 41.71875, + "completions/mean_terminated_length": 41.71875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.22401276230812073, + "epoch": 0.05309734513274336, + "frac_reward_zero_std": 0.75, + "grad_norm": 6.61094425527448, + "kl": 0.00048795485054142773, + "learning_rate": 1.2831858407079647e-07, + "loss": 0.0822, + "num_tokens": 467092.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5831127166748047, + "sampling/importance_sampling_ratio/mean": 0.9997624158859253, + "sampling/importance_sampling_ratio/min": 0.5481081604957581, + "sampling/sampling_logp_difference/max": 0.6012825965881348, + "sampling/sampling_logp_difference/mean": 0.013244973495602608, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.71875, + "completions/mean_terminated_length": 15.71875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.07598844170570374, + "epoch": 0.05486725663716814, + "frac_reward_zero_std": 0.75, + "grad_norm": 9.874575477437448, + "kl": 0.00039827151340432465, + "learning_rate": 1.327433628318584e-07, + "loss": -0.0032, + "num_tokens": 480098.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.4121800661087036, + "sampling/importance_sampling_ratio/mean": 1.0020051002502441, + "sampling/importance_sampling_ratio/min": 0.5527469515800476, + "sampling/sampling_logp_difference/max": 0.5928549766540527, + "sampling/sampling_logp_difference/mean": 0.013948867097496986, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 39.421875, + "completions/mean_terminated_length": 39.421875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1963411271572113, + "epoch": 0.05663716814159292, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.856088448360933, + "kl": 0.0015413790242746472, + "learning_rate": 1.3716814159292035e-07, + "loss": -0.0009, + "num_tokens": 494525.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.797372579574585, + "sampling/importance_sampling_ratio/mean": 1.001387596130371, + "sampling/importance_sampling_ratio/min": 0.6285156011581421, + "sampling/sampling_logp_difference/max": 0.5863258838653564, + "sampling/sampling_logp_difference/mean": 0.01898869127035141, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 89.921875, + "completions/mean_terminated_length": 89.921875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.40926527976989746, + "epoch": 0.0584070796460177, + "frac_reward_zero_std": 0.75, + "grad_norm": 8.86902251988677, + "kl": 0.0004499046190176159, + "learning_rate": 1.4159292035398229e-07, + "loss": -0.3134, + "num_tokens": 511432.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.9430650472640991, + "sampling/importance_sampling_ratio/mean": 1.0010493993759155, + "sampling/importance_sampling_ratio/min": 0.62600177526474, + "sampling/sampling_logp_difference/max": 0.6642667055130005, + "sampling/sampling_logp_difference/mean": 0.016596786677837372, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 56.71875, + "completions/mean_terminated_length": 56.71875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2863810658454895, + "epoch": 0.06017699115044248, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021468556262928137, + "kl": 0.0007147843716666102, + "learning_rate": 1.4601769911504425e-07, + "loss": 0.0, + "num_tokens": 526534.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8784087896347046, + "sampling/importance_sampling_ratio/mean": 0.9997020363807678, + "sampling/importance_sampling_ratio/min": 0.6882390379905701, + "sampling/sampling_logp_difference/max": 0.630424976348877, + "sampling/sampling_logp_difference/mean": 0.017147017642855644, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 85.953125, + "completions/mean_terminated_length": 85.953125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4597724378108978, + "epoch": 0.061946902654867256, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.7681090870036456, + "kl": 0.0005873649497516453, + "learning_rate": 1.504424778761062e-07, + "loss": 0.0506, + "num_tokens": 545475.0, + "reward": 0.4375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.5615980625152588, + "sampling/importance_sampling_ratio/mean": 0.9998810291290283, + "sampling/importance_sampling_ratio/min": 0.6170110106468201, + "sampling/sampling_logp_difference/max": 0.4828684329986572, + "sampling/sampling_logp_difference/mean": 0.016421593725681305, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 99.140625, + "completions/mean_terminated_length": 99.140625, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 0.5353041887283325, + "epoch": 0.06371681415929203, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.6580441755876607, + "kl": 0.0004714885726571083, + "learning_rate": 1.5486725663716813e-07, + "loss": 0.004, + "num_tokens": 561196.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4393292665481567, + "sampling/importance_sampling_ratio/mean": 0.9998955726623535, + "sampling/importance_sampling_ratio/min": 0.6063184142112732, + "sampling/sampling_logp_difference/max": 0.5003499984741211, + "sampling/sampling_logp_difference/mean": 0.01866915076971054, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 52.53125, + "completions/mean_terminated_length": 52.53125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3300973176956177, + "epoch": 0.06548672566371681, + "frac_reward_zero_std": 0.5, + "grad_norm": 20.36985643572742, + "kl": 0.0006205074023455381, + "learning_rate": 1.5929203539823007e-07, + "loss": 0.4553, + "num_tokens": 579022.0, + "reward": 0.84375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995156526565552, + "sampling/importance_sampling_ratio/min": 0.47040438652038574, + "sampling/sampling_logp_difference/max": 1.002622365951538, + "sampling/sampling_logp_difference/mean": 0.021546784788370132, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 81.078125, + "completions/mean_terminated_length": 81.078125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3800387382507324, + "epoch": 0.06725663716814159, + "frac_reward_zero_std": 0.75, + "grad_norm": 10.339012444363068, + "kl": 0.0006738771917298436, + "learning_rate": 1.6371681415929203e-07, + "loss": 0.1046, + "num_tokens": 594019.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0005218982696533, + "sampling/importance_sampling_ratio/min": 0.5071792602539062, + "sampling/sampling_logp_difference/max": 0.8020846843719482, + "sampling/sampling_logp_difference/mean": 0.01670372113585472, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 259.0, + "completions/max_terminated_length": 259.0, + "completions/mean_length": 93.484375, + "completions/mean_terminated_length": 93.484375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4449535310268402, + "epoch": 0.06902654867256637, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.317212480269803, + "kl": 0.0006473226239904761, + "learning_rate": 1.68141592920354e-07, + "loss": -0.1538, + "num_tokens": 609394.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.8720818758010864, + "sampling/importance_sampling_ratio/mean": 0.9996934533119202, + "sampling/importance_sampling_ratio/min": 0.6724444627761841, + "sampling/sampling_logp_difference/max": 0.6270511150360107, + "sampling/sampling_logp_difference/mean": 0.017083389684557915, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 81.90625, + "completions/mean_terminated_length": 81.90625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.33130159974098206, + "epoch": 0.07079646017699115, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.038053707633894, + "kl": 0.0009376220987178385, + "learning_rate": 1.725663716814159e-07, + "loss": 0.0787, + "num_tokens": 625884.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4165065288543701, + "sampling/importance_sampling_ratio/mean": 1.0004032850265503, + "sampling/importance_sampling_ratio/min": 0.5999572277069092, + "sampling/sampling_logp_difference/max": 0.5108969211578369, + "sampling/sampling_logp_difference/mean": 0.019599106162786484, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 47.71875, + "completions/mean_terminated_length": 47.71875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1945371925830841, + "epoch": 0.07256637168141593, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.3978063035136974, + "kl": 0.0002761489013209939, + "learning_rate": 1.7699115044247788e-07, + "loss": 0.0003, + "num_tokens": 642314.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.3843897581100464, + "sampling/importance_sampling_ratio/mean": 1.0020694732666016, + "sampling/importance_sampling_ratio/min": 0.5751755833625793, + "sampling/sampling_logp_difference/max": 0.5530799627304077, + "sampling/sampling_logp_difference/mean": 0.01604965329170227, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 75.296875, + "completions/mean_terminated_length": 75.296875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.35504159331321716, + "epoch": 0.0743362831858407, + "frac_reward_zero_std": 0.5, + "grad_norm": 10.922965398733, + "kl": 0.0010760982986539602, + "learning_rate": 1.8141592920353982e-07, + "loss": 0.1741, + "num_tokens": 658733.0, + "reward": 0.25, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.5612423419952393, + "sampling/importance_sampling_ratio/mean": 0.9998565912246704, + "sampling/importance_sampling_ratio/min": 0.6919164061546326, + "sampling/sampling_logp_difference/max": 0.44548189640045166, + "sampling/sampling_logp_difference/mean": 0.017055584117770195, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 76.390625, + "completions/mean_terminated_length": 76.390625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3077981173992157, + "epoch": 0.07610619469026549, + "frac_reward_zero_std": 0.75, + "grad_norm": 8.929794139932175, + "kl": 0.000669544271659106, + "learning_rate": 1.8584070796460178e-07, + "loss": 0.1094, + "num_tokens": 674374.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.8034026622772217, + "sampling/importance_sampling_ratio/mean": 1.0019267797470093, + "sampling/importance_sampling_ratio/min": 0.48356494307518005, + "sampling/sampling_logp_difference/max": 0.726569652557373, + "sampling/sampling_logp_difference/mean": 0.028325039893388748, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 122.9375, + "completions/mean_terminated_length": 122.9375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.399412602186203, + "epoch": 0.07787610619469026, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.011838867228157263, + "kl": 0.0004714487586170435, + "learning_rate": 1.902654867256637e-07, + "loss": 0.0, + "num_tokens": 694210.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4246737957000732, + "sampling/importance_sampling_ratio/mean": 1.0002517700195312, + "sampling/importance_sampling_ratio/min": 0.6519432663917542, + "sampling/sampling_logp_difference/max": 0.427797794342041, + "sampling/sampling_logp_difference/mean": 0.014120910316705704, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 144.4375, + "completions/mean_terminated_length": 144.4375, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "entropy": 0.5294415354728699, + "epoch": 0.07964601769911504, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00685115726939914, + "kl": 0.0005526579916477203, + "learning_rate": 1.9469026548672566e-07, + "loss": 0.0, + "num_tokens": 712510.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6863336563110352, + "sampling/importance_sampling_ratio/mean": 0.9996277093887329, + "sampling/importance_sampling_ratio/min": 0.5524927973747253, + "sampling/sampling_logp_difference/max": 0.5933148860931396, + "sampling/sampling_logp_difference/mean": 0.019174396991729736, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 56.796875, + "completions/mean_terminated_length": 56.796875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.33027297258377075, + "epoch": 0.08141592920353982, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03531126435999156, + "kl": 0.001525461906567216, + "learning_rate": 1.991150442477876e-07, + "loss": 0.0, + "num_tokens": 726929.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4253754615783691, + "sampling/importance_sampling_ratio/mean": 1.002415418624878, + "sampling/importance_sampling_ratio/min": 0.6252660751342773, + "sampling/sampling_logp_difference/max": 0.46957799792289734, + "sampling/sampling_logp_difference/mean": 0.018032699823379517, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 55.4375, + "completions/mean_terminated_length": 55.4375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3531503677368164, + "epoch": 0.0831858407079646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03232207870409841, + "kl": 0.0017060365062206984, + "learning_rate": 2.0353982300884956e-07, + "loss": 0.0, + "num_tokens": 741069.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9697322845458984, + "sampling/importance_sampling_ratio/mean": 1.0025861263275146, + "sampling/importance_sampling_ratio/min": 0.6927888989448547, + "sampling/sampling_logp_difference/max": 0.6778976917266846, + "sampling/sampling_logp_difference/mean": 0.024530623108148575, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1186.0, + "completions/max_terminated_length": 1186.0, + "completions/mean_length": 136.765625, + "completions/mean_terminated_length": 136.765625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3937437832355499, + "epoch": 0.08495575221238938, + "frac_reward_zero_std": 0.25, + "grad_norm": 5.142458555957129, + "kl": 0.0008791740983724594, + "learning_rate": 2.0796460176991148e-07, + "loss": 0.3402, + "num_tokens": 759774.0, + "reward": -0.09375, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": -0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.628279685974121, + "sampling/importance_sampling_ratio/mean": 1.0001052618026733, + "sampling/importance_sampling_ratio/min": 0.49396607279777527, + "sampling/sampling_logp_difference/max": 0.7052884101867676, + "sampling/sampling_logp_difference/mean": 0.015893086791038513, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 55.25, + "completions/mean_terminated_length": 55.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.3387606143951416, + "epoch": 0.08672566371681416, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03927816701742412, + "kl": 0.001997170504182577, + "learning_rate": 2.1238938053097344e-07, + "loss": 0.0, + "num_tokens": 773310.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4214093685150146, + "sampling/importance_sampling_ratio/mean": 0.9992483854293823, + "sampling/importance_sampling_ratio/min": 0.44950494170188904, + "sampling/sampling_logp_difference/max": 0.7996084690093994, + "sampling/sampling_logp_difference/mean": 0.022078003734350204, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 103.703125, + "completions/mean_terminated_length": 103.703125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.5467600226402283, + "epoch": 0.08849557522123894, + "frac_reward_zero_std": 0.5, + "grad_norm": 19.329756091300126, + "kl": 0.00122804322745651, + "learning_rate": 2.1681415929203538e-07, + "loss": 0.3504, + "num_tokens": 790427.0, + "reward": -0.4375, + "reward_std": 0.3943893015384674, + "rewards/decision_reward_func/mean": -0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999853253364563, + "sampling/importance_sampling_ratio/min": 0.6109908223152161, + "sampling/sampling_logp_difference/max": 1.131711483001709, + "sampling/sampling_logp_difference/mean": 0.02128685638308525, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 92.84375, + "completions/mean_terminated_length": 92.84375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3790997266769409, + "epoch": 0.09026548672566372, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.3632322784690682, + "kl": 0.00037776207318529487, + "learning_rate": 2.2123893805309735e-07, + "loss": 0.0385, + "num_tokens": 806721.0, + "reward": 0.4375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.6228275299072266, + "sampling/importance_sampling_ratio/mean": 0.9994162321090698, + "sampling/importance_sampling_ratio/min": 0.5379316806793213, + "sampling/sampling_logp_difference/max": 0.6200237274169922, + "sampling/sampling_logp_difference/mean": 0.0147955771535635, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 66.234375, + "completions/mean_terminated_length": 66.234375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2744632959365845, + "epoch": 0.0920353982300885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014767897890408175, + "kl": 0.0005663610063493252, + "learning_rate": 2.2566371681415928e-07, + "loss": 0.0, + "num_tokens": 825280.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3668099641799927, + "sampling/importance_sampling_ratio/mean": 0.9997496604919434, + "sampling/importance_sampling_ratio/min": 0.628419041633606, + "sampling/sampling_logp_difference/max": 0.46454811096191406, + "sampling/sampling_logp_difference/mean": 0.012168833054602146, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 51.671875, + "completions/mean_terminated_length": 51.671875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.22427326440811157, + "epoch": 0.09380530973451327, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.227366331101068, + "kl": 0.002840688219293952, + "learning_rate": 2.3008849557522122e-07, + "loss": -0.1688, + "num_tokens": 840923.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9992762804031372, + "sampling/importance_sampling_ratio/min": 0.5337795615196228, + "sampling/sampling_logp_difference/max": 0.7441730499267578, + "sampling/sampling_logp_difference/mean": 0.021255411207675934, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 89.21875, + "completions/mean_terminated_length": 89.21875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2630472183227539, + "epoch": 0.09557522123893805, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008579094624632907, + "kl": 0.00046598073095083237, + "learning_rate": 2.345132743362832e-07, + "loss": 0.0, + "num_tokens": 858249.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.662152647972107, + "sampling/importance_sampling_ratio/mean": 1.0007331371307373, + "sampling/importance_sampling_ratio/min": 0.5581602454185486, + "sampling/sampling_logp_difference/max": 0.5831091403961182, + "sampling/sampling_logp_difference/mean": 0.016348931938409805, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 96.078125, + "completions/mean_terminated_length": 96.078125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.47166889905929565, + "epoch": 0.09734513274336283, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.010793334962958628, + "kl": 0.0005595291731879115, + "learning_rate": 2.3893805309734513e-07, + "loss": 0.0, + "num_tokens": 875118.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7272074222564697, + "sampling/importance_sampling_ratio/mean": 0.9997779130935669, + "sampling/importance_sampling_ratio/min": 0.21025174856185913, + "sampling/sampling_logp_difference/max": 1.5594496726989746, + "sampling/sampling_logp_difference/mean": 0.018348487094044685, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 167.0, + "completions/max_terminated_length": 167.0, + "completions/mean_length": 54.390625, + "completions/mean_terminated_length": 54.390625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.32978034019470215, + "epoch": 0.09911504424778761, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03360661011059917, + "kl": 0.0016041224589571357, + "learning_rate": 2.4336283185840704e-07, + "loss": 0.0, + "num_tokens": 891415.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.544557809829712, + "sampling/importance_sampling_ratio/mean": 0.9991042613983154, + "sampling/importance_sampling_ratio/min": 0.6957619786262512, + "sampling/sampling_logp_difference/max": 0.4347376823425293, + "sampling/sampling_logp_difference/mean": 0.01648845709860325, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 98.546875, + "completions/mean_terminated_length": 98.546875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.40308648347854614, + "epoch": 0.10088495575221239, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015656436625466928, + "kl": 0.00042458786629140377, + "learning_rate": 2.47787610619469e-07, + "loss": 0.0, + "num_tokens": 907450.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5252330303192139, + "sampling/importance_sampling_ratio/mean": 0.9999468922615051, + "sampling/importance_sampling_ratio/min": 0.6792412996292114, + "sampling/sampling_logp_difference/max": 0.4221472144126892, + "sampling/sampling_logp_difference/mean": 0.017018482089042664, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 64.71875, + "completions/mean_terminated_length": 64.71875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3652716875076294, + "epoch": 0.10265486725663717, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.7572585149403777, + "kl": 0.000706555787473917, + "learning_rate": 2.5221238938053097e-07, + "loss": 0.0038, + "num_tokens": 922536.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5869511365890503, + "sampling/importance_sampling_ratio/mean": 1.0007580518722534, + "sampling/importance_sampling_ratio/min": 0.4306277632713318, + "sampling/sampling_logp_difference/max": 0.8425111770629883, + "sampling/sampling_logp_difference/mean": 0.013503305613994598, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.765625, + "completions/mean_terminated_length": 15.765625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.08323395252227783, + "epoch": 0.10442477876106195, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10431399096983622, + "kl": 0.0004693373921327293, + "learning_rate": 2.5663716814159294e-07, + "loss": 0.0, + "num_tokens": 937513.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3673667907714844, + "sampling/importance_sampling_ratio/mean": 0.9999486207962036, + "sampling/importance_sampling_ratio/min": 0.8102474212646484, + "sampling/sampling_logp_difference/max": 0.3128868341445923, + "sampling/sampling_logp_difference/mean": 0.012339731678366661, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 103.0, + "completions/max_terminated_length": 103.0, + "completions/mean_length": 28.484375, + "completions/mean_terminated_length": 28.484375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.15878352522850037, + "epoch": 0.10619469026548672, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08825490514685248, + "kl": 0.0016191593604162335, + "learning_rate": 2.6106194690265485e-07, + "loss": 0.0, + "num_tokens": 950072.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3832296133041382, + "sampling/importance_sampling_ratio/mean": 0.9970600605010986, + "sampling/importance_sampling_ratio/min": 0.6773175597190857, + "sampling/sampling_logp_difference/max": 0.3896150588989258, + "sampling/sampling_logp_difference/mean": 0.021082065999507904, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.0, + "completions/max_terminated_length": 535.0, + "completions/mean_length": 87.203125, + "completions/mean_terminated_length": 87.203125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2835385203361511, + "epoch": 0.1079646017699115, + "frac_reward_zero_std": 0.75, + "grad_norm": 7.039394391202979, + "kl": 0.000507982331328094, + "learning_rate": 2.654867256637168e-07, + "loss": 0.3259, + "num_tokens": 969429.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.4433850049972534, + "sampling/importance_sampling_ratio/mean": 0.9994461536407471, + "sampling/importance_sampling_ratio/min": 0.6672996878623962, + "sampling/sampling_logp_difference/max": 0.4045161008834839, + "sampling/sampling_logp_difference/mean": 0.014681078493595123, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 185.0, + "completions/max_terminated_length": 185.0, + "completions/mean_length": 80.875, + "completions/mean_terminated_length": 80.875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.42828965187072754, + "epoch": 0.10973451327433628, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.219259006737941, + "kl": 0.0006035867263562977, + "learning_rate": 2.6991150442477873e-07, + "loss": 0.0008, + "num_tokens": 984573.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999299168586731, + "sampling/importance_sampling_ratio/min": 0.5546641945838928, + "sampling/sampling_logp_difference/max": 1.1937751770019531, + "sampling/sampling_logp_difference/mean": 0.019111908972263336, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 141.0, + "completions/max_terminated_length": 141.0, + "completions/mean_length": 38.890625, + "completions/mean_terminated_length": 38.890625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2435263991355896, + "epoch": 0.11150442477876106, + "frac_reward_zero_std": 0.75, + "grad_norm": 20.082345347768168, + "kl": 0.002106260508298874, + "learning_rate": 2.743362831858407e-07, + "loss": 0.1822, + "num_tokens": 998118.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.9094908237457275, + "sampling/importance_sampling_ratio/mean": 1.0004956722259521, + "sampling/importance_sampling_ratio/min": 0.7219632267951965, + "sampling/sampling_logp_difference/max": 0.6468366384506226, + "sampling/sampling_logp_difference/mean": 0.019784800708293915, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 156.0, + "completions/max_terminated_length": 156.0, + "completions/mean_length": 34.28125, + "completions/mean_terminated_length": 34.28125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.17724953591823578, + "epoch": 0.11327433628318584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055329392007934075, + "kl": 0.0014039300149306655, + "learning_rate": 2.787610619469026e-07, + "loss": 0.0, + "num_tokens": 1012360.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3268667459487915, + "sampling/importance_sampling_ratio/mean": 0.9999594688415527, + "sampling/importance_sampling_ratio/min": 0.5984827280044556, + "sampling/sampling_logp_difference/max": 0.5133576393127441, + "sampling/sampling_logp_difference/mean": 0.019908219575881958, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 74.0, + "completions/mean_terminated_length": 74.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.25674378871917725, + "epoch": 0.11504424778761062, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.358973041717826, + "kl": 0.0008139506680890918, + "learning_rate": 2.8318584070796457e-07, + "loss": -0.0235, + "num_tokens": 1027176.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5614769458770752, + "sampling/importance_sampling_ratio/mean": 0.9981610774993896, + "sampling/importance_sampling_ratio/min": 0.6101521253585815, + "sampling/sampling_logp_difference/max": 0.4940469264984131, + "sampling/sampling_logp_difference/mean": 0.021541573107242584, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 114.609375, + "completions/mean_terminated_length": 114.609375, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "entropy": 0.5083310008049011, + "epoch": 0.1168141592920354, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.6382262277093083, + "kl": 0.0006657812045887113, + "learning_rate": 2.8761061946902654e-07, + "loss": 0.0159, + "num_tokens": 1044191.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.4574837684631348, + "sampling/importance_sampling_ratio/mean": 1.0002875328063965, + "sampling/importance_sampling_ratio/min": 0.6315723657608032, + "sampling/sampling_logp_difference/max": 0.45954275131225586, + "sampling/sampling_logp_difference/mean": 0.017609603703022003, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 68.65625, + "completions/mean_terminated_length": 68.65625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4105982780456543, + "epoch": 0.11858407079646018, + "frac_reward_zero_std": 0.5, + "grad_norm": 12.053850779982422, + "kl": 0.0023665111511945724, + "learning_rate": 2.920353982300885e-07, + "loss": 0.3613, + "num_tokens": 1058313.0, + "reward": 0.84375, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.3561493158340454, + "sampling/importance_sampling_ratio/mean": 0.9979859590530396, + "sampling/importance_sampling_ratio/min": 0.4442462921142578, + "sampling/sampling_logp_difference/max": 0.8113762140274048, + "sampling/sampling_logp_difference/mean": 0.022402148693799973, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 66.46875, + "completions/mean_terminated_length": 66.46875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1998130977153778, + "epoch": 0.12035398230088495, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022656157015304077, + "kl": 0.0007582076359540224, + "learning_rate": 2.9646017699115047e-07, + "loss": 0.0, + "num_tokens": 1072263.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.412645936012268, + "sampling/importance_sampling_ratio/mean": 0.9985370635986328, + "sampling/importance_sampling_ratio/min": 0.593745231628418, + "sampling/sampling_logp_difference/max": 0.5213049650192261, + "sampling/sampling_logp_difference/mean": 0.018213676288723946, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 137.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 54.5625, + "completions/mean_terminated_length": 54.5625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.271050363779068, + "epoch": 0.12212389380530973, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013820374585321411, + "kl": 0.00034594995668157935, + "learning_rate": 3.008849557522124e-07, + "loss": 0.0, + "num_tokens": 1086347.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2971035242080688, + "sampling/importance_sampling_ratio/mean": 0.999480664730072, + "sampling/importance_sampling_ratio/min": 0.37043890357017517, + "sampling/sampling_logp_difference/max": 0.9930667877197266, + "sampling/sampling_logp_difference/mean": 0.011262361891567707, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 57.328125, + "completions/mean_terminated_length": 57.328125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.30324071645736694, + "epoch": 0.12389380530973451, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.125386142425997, + "kl": 0.0008926556329242885, + "learning_rate": 3.053097345132743e-07, + "loss": -0.0308, + "num_tokens": 1101872.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.3413037061691284, + "sampling/importance_sampling_ratio/mean": 0.9995997548103333, + "sampling/importance_sampling_ratio/min": 0.6632469892501831, + "sampling/sampling_logp_difference/max": 0.41060781478881836, + "sampling/sampling_logp_difference/mean": 0.015544610098004341, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 97.671875, + "completions/mean_terminated_length": 97.671875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.6352293491363525, + "epoch": 0.1256637168141593, + "frac_reward_zero_std": 0.5, + "grad_norm": 7.979445476665689, + "kl": 0.001399570144712925, + "learning_rate": 3.0973451327433626e-07, + "loss": 0.1946, + "num_tokens": 1118347.0, + "reward": 0.375, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.7627366781234741, + "sampling/importance_sampling_ratio/mean": 1.0006338357925415, + "sampling/importance_sampling_ratio/min": 0.5739402174949646, + "sampling/sampling_logp_difference/max": 0.5668675899505615, + "sampling/sampling_logp_difference/mean": 0.02080698311328888, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 674.0, + "completions/max_terminated_length": 674.0, + "completions/mean_length": 80.890625, + "completions/mean_terminated_length": 80.890625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3430714011192322, + "epoch": 0.12743362831858407, + "frac_reward_zero_std": 0.5, + "grad_norm": 12.02956264887036, + "kl": 0.006966053508222103, + "learning_rate": 3.141592920353982e-07, + "loss": 0.1236, + "num_tokens": 1133908.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9982129335403442, + "sampling/importance_sampling_ratio/min": 0.31648826599121094, + "sampling/sampling_logp_difference/max": 1.150469183921814, + "sampling/sampling_logp_difference/mean": 0.02067970484495163, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 92.75, + "completions/mean_terminated_length": 92.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.5193896293640137, + "epoch": 0.12920353982300886, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.848363550830051, + "kl": 0.010346418246626854, + "learning_rate": 3.1858407079646014e-07, + "loss": -0.0719, + "num_tokens": 1149572.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.555552363395691, + "sampling/importance_sampling_ratio/mean": 0.9990873336791992, + "sampling/importance_sampling_ratio/min": 0.48628026247024536, + "sampling/sampling_logp_difference/max": 0.7209701538085938, + "sampling/sampling_logp_difference/mean": 0.02255423739552498, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 64.125, + "completions/mean_terminated_length": 64.125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.30180686712265015, + "epoch": 0.13097345132743363, + "frac_reward_zero_std": 0.75, + "grad_norm": 10.872794222298856, + "kl": 0.001606640056706965, + "learning_rate": 3.230088495575221e-07, + "loss": -0.3004, + "num_tokens": 1163660.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.7250492572784424, + "sampling/importance_sampling_ratio/mean": 1.0003955364227295, + "sampling/importance_sampling_ratio/min": 0.5589487552642822, + "sampling/sampling_logp_difference/max": 0.5816974639892578, + "sampling/sampling_logp_difference/mean": 0.02236917056143284, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 208.0, + "completions/max_terminated_length": 208.0, + "completions/mean_length": 44.59375, + "completions/mean_terminated_length": 44.59375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2116633653640747, + "epoch": 0.13274336283185842, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.8699103275799596, + "kl": 0.006977086421102285, + "learning_rate": 3.2743362831858407e-07, + "loss": 0.0026, + "num_tokens": 1176994.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.4212322235107422, + "sampling/importance_sampling_ratio/mean": 0.9926056265830994, + "sampling/importance_sampling_ratio/min": 0.6300075054168701, + "sampling/sampling_logp_difference/max": 0.4620234966278076, + "sampling/sampling_logp_difference/mean": 0.02601846680045128, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 99.203125, + "completions/mean_terminated_length": 99.203125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.41840457916259766, + "epoch": 0.13451327433628318, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016256057216556645, + "kl": 0.0008459609234705567, + "learning_rate": 3.3185840707964603e-07, + "loss": 0.0, + "num_tokens": 1194431.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000793218612671, + "sampling/importance_sampling_ratio/min": 0.5363112092018127, + "sampling/sampling_logp_difference/max": 0.8022520542144775, + "sampling/sampling_logp_difference/mean": 0.02195250429213047, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 101.984375, + "completions/mean_terminated_length": 101.984375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.37862005829811096, + "epoch": 0.13628318584070798, + "frac_reward_zero_std": 0.5, + "grad_norm": 10.132955638345939, + "kl": 0.0018726624548435211, + "learning_rate": 3.36283185840708e-07, + "loss": -0.3206, + "num_tokens": 1210270.0, + "reward": 0.78125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.5600541830062866, + "sampling/importance_sampling_ratio/mean": 1.0004552602767944, + "sampling/importance_sampling_ratio/min": 0.6160954833030701, + "sampling/sampling_logp_difference/max": 0.48435330390930176, + "sampling/sampling_logp_difference/mean": 0.014363247901201248, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 168.0, + "completions/max_terminated_length": 168.0, + "completions/mean_length": 64.0, + "completions/mean_terminated_length": 64.0, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.3538378179073334, + "epoch": 0.13805309734513274, + "frac_reward_zero_std": 0.75, + "grad_norm": 9.268417053893387, + "kl": 0.005304547026753426, + "learning_rate": 3.4070796460176986e-07, + "loss": -0.1266, + "num_tokens": 1228782.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.5338397026062012, + "sampling/importance_sampling_ratio/mean": 1.000978708267212, + "sampling/importance_sampling_ratio/min": 0.598846971988678, + "sampling/sampling_logp_difference/max": 0.512749195098877, + "sampling/sampling_logp_difference/mean": 0.01858421415090561, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 96.515625, + "completions/mean_terminated_length": 96.515625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4295077621936798, + "epoch": 0.13982300884955753, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.009828059132190585, + "kl": 0.0005789292044937611, + "learning_rate": 3.451327433628318e-07, + "loss": 0.0, + "num_tokens": 1245087.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9877007007598877, + "sampling/importance_sampling_ratio/mean": 1.0007420778274536, + "sampling/importance_sampling_ratio/min": 0.6938974857330322, + "sampling/sampling_logp_difference/max": 0.6869785785675049, + "sampling/sampling_logp_difference/mean": 0.01571083441376686, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 77.5, + "completions/mean_terminated_length": 77.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2790239453315735, + "epoch": 0.1415929203539823, + "frac_reward_zero_std": 0.75, + "grad_norm": 7.747476493391317, + "kl": 0.003009934676811099, + "learning_rate": 3.495575221238938e-07, + "loss": 0.268, + "num_tokens": 1262559.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996301531791687, + "sampling/importance_sampling_ratio/min": 0.4342047870159149, + "sampling/sampling_logp_difference/max": 0.8342390060424805, + "sampling/sampling_logp_difference/mean": 0.019856680184602737, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 58.0, + "completions/mean_terminated_length": 58.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.20376178622245789, + "epoch": 0.1433628318584071, + "frac_reward_zero_std": 0.75, + "grad_norm": 12.49039325869809, + "kl": 0.0034272735938429832, + "learning_rate": 3.5398230088495575e-07, + "loss": -0.4271, + "num_tokens": 1277551.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.3819938898086548, + "sampling/importance_sampling_ratio/mean": 0.9996453523635864, + "sampling/importance_sampling_ratio/min": 0.5875664353370667, + "sampling/sampling_logp_difference/max": 0.5317659378051758, + "sampling/sampling_logp_difference/mean": 0.0125980693846941, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 63.875, + "completions/mean_terminated_length": 63.875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.23782622814178467, + "epoch": 0.14513274336283186, + "frac_reward_zero_std": 0.75, + "grad_norm": 13.890965293596501, + "kl": 0.0006324282148852944, + "learning_rate": 3.5840707964601767e-07, + "loss": 0.1274, + "num_tokens": 1294071.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.7391748428344727, + "sampling/importance_sampling_ratio/mean": 1.0006330013275146, + "sampling/importance_sampling_ratio/min": 0.5648089051246643, + "sampling/sampling_logp_difference/max": 0.57126784324646, + "sampling/sampling_logp_difference/mean": 0.015169214457273483, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 180.0, + "completions/max_terminated_length": 180.0, + "completions/mean_length": 48.328125, + "completions/mean_terminated_length": 48.328125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.295986533164978, + "epoch": 0.14690265486725665, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02545414431293979, + "kl": 0.00065580167574808, + "learning_rate": 3.6283185840707963e-07, + "loss": 0.0, + "num_tokens": 1309340.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3446919918060303, + "sampling/importance_sampling_ratio/mean": 0.9991348385810852, + "sampling/importance_sampling_ratio/min": 0.6952046751976013, + "sampling/sampling_logp_difference/max": 0.36354899406433105, + "sampling/sampling_logp_difference/mean": 0.016460547223687172, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 60.15625, + "completions/mean_terminated_length": 60.15625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2950069308280945, + "epoch": 0.1486725663716814, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03425744965710435, + "kl": 0.0017368867993354797, + "learning_rate": 3.672566371681416e-07, + "loss": 0.0, + "num_tokens": 1324326.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3615399599075317, + "sampling/importance_sampling_ratio/mean": 0.9983700513839722, + "sampling/importance_sampling_ratio/min": 0.556420087814331, + "sampling/sampling_logp_difference/max": 0.5862317085266113, + "sampling/sampling_logp_difference/mean": 0.018328040838241577, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 106.921875, + "completions/mean_terminated_length": 106.921875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.5123524069786072, + "epoch": 0.1504424778761062, + "frac_reward_zero_std": 0.5, + "grad_norm": 17.031908186631007, + "kl": 0.002637058962136507, + "learning_rate": 3.7168141592920356e-07, + "loss": 0.3122, + "num_tokens": 1341793.0, + "reward": 0.21875, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0009326934814453, + "sampling/importance_sampling_ratio/min": 0.5631783604621887, + "sampling/sampling_logp_difference/max": 1.1850533485412598, + "sampling/sampling_logp_difference/mean": 0.020340263843536377, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 91.390625, + "completions/mean_terminated_length": 91.390625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4329564571380615, + "epoch": 0.15221238938053097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023204412462173665, + "kl": 0.001921823131851852, + "learning_rate": 3.761061946902654e-07, + "loss": 0.0, + "num_tokens": 1357770.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6110613346099854, + "sampling/importance_sampling_ratio/mean": 1.0003230571746826, + "sampling/importance_sampling_ratio/min": 0.46672725677490234, + "sampling/sampling_logp_difference/max": 0.7620102167129517, + "sampling/sampling_logp_difference/mean": 0.018431926146149635, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 63.953125, + "completions/mean_terminated_length": 63.953125, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "entropy": 0.42276352643966675, + "epoch": 0.15398230088495576, + "frac_reward_zero_std": 0.25, + "grad_norm": 16.80517251352282, + "kl": 0.00828557275235653, + "learning_rate": 3.805309734513274e-07, + "loss": 0.3641, + "num_tokens": 1371687.0, + "reward": 0.78125, + "reward_std": 0.48935678601264954, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.7898368835449219, + "sampling/importance_sampling_ratio/mean": 1.0000966787338257, + "sampling/importance_sampling_ratio/min": 0.5008212327957153, + "sampling/sampling_logp_difference/max": 0.6915061473846436, + "sampling/sampling_logp_difference/mean": 0.019807137548923492, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 96.0625, + "completions/mean_terminated_length": 96.0625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3446410298347473, + "epoch": 0.15575221238938053, + "frac_reward_zero_std": 0.5, + "grad_norm": 9.858522314114232, + "kl": 0.0018287475686520338, + "learning_rate": 3.8495575221238935e-07, + "loss": 0.3426, + "num_tokens": 1388731.0, + "reward": 0.21875, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0008213520050049, + "sampling/importance_sampling_ratio/min": 0.33204153180122375, + "sampling/sampling_logp_difference/max": 1.1024951934814453, + "sampling/sampling_logp_difference/mean": 0.01567666232585907, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.0, + "completions/max_terminated_length": 166.0, + "completions/mean_length": 85.3125, + "completions/mean_terminated_length": 85.3125, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.3201582133769989, + "epoch": 0.15752212389380532, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014084128619798917, + "kl": 0.000516217143740505, + "learning_rate": 3.893805309734513e-07, + "loss": 0.0, + "num_tokens": 1404319.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4318376779556274, + "sampling/importance_sampling_ratio/mean": 1.0020582675933838, + "sampling/importance_sampling_ratio/min": 0.6632694602012634, + "sampling/sampling_logp_difference/max": 0.41057395935058594, + "sampling/sampling_logp_difference/mean": 0.015921611338853836, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 63.234375, + "completions/mean_terminated_length": 63.234375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.27583983540534973, + "epoch": 0.1592920353982301, + "frac_reward_zero_std": 0.75, + "grad_norm": 15.744491824905102, + "kl": 0.0023850714787840843, + "learning_rate": 3.938053097345133e-07, + "loss": 0.0948, + "num_tokens": 1422062.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9983652830123901, + "sampling/importance_sampling_ratio/min": 0.6548058390617371, + "sampling/sampling_logp_difference/max": 0.9749901294708252, + "sampling/sampling_logp_difference/mean": 0.014708740636706352, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.34375, + "completions/mean_terminated_length": 17.34375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.06893637031316757, + "epoch": 0.16106194690265488, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19984842013006643, + "kl": 0.002403866034001112, + "learning_rate": 3.982300884955752e-07, + "loss": 0.0, + "num_tokens": 1439604.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.133671522140503, + "sampling/importance_sampling_ratio/mean": 0.9994039535522461, + "sampling/importance_sampling_ratio/min": 0.6547919511795044, + "sampling/sampling_logp_difference/max": 0.4234377145767212, + "sampling/sampling_logp_difference/mean": 0.007659217808395624, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 36.953125, + "completions/mean_terminated_length": 36.953125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.26978591084480286, + "epoch": 0.16283185840707964, + "frac_reward_zero_std": 0.75, + "grad_norm": 16.587903339877094, + "kl": 0.003500830614939332, + "learning_rate": 4.0265486725663716e-07, + "loss": 0.1325, + "num_tokens": 1455329.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.6246740818023682, + "sampling/importance_sampling_ratio/mean": 1.0004960298538208, + "sampling/importance_sampling_ratio/min": 0.5870731472969055, + "sampling/sampling_logp_difference/max": 0.532605767250061, + "sampling/sampling_logp_difference/mean": 0.014703018590807915, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 84.109375, + "completions/mean_terminated_length": 84.109375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.47278204560279846, + "epoch": 0.16460176991150444, + "frac_reward_zero_std": 0.25, + "grad_norm": 11.06688751319423, + "kl": 0.003939971327781677, + "learning_rate": 4.0707964601769913e-07, + "loss": 0.1314, + "num_tokens": 1471768.0, + "reward": 0.03125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0012634992599487, + "sampling/importance_sampling_ratio/min": 0.46622326970100403, + "sampling/sampling_logp_difference/max": 0.8460280895233154, + "sampling/sampling_logp_difference/mean": 0.022663792595267296, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 59.6875, + "completions/mean_terminated_length": 59.6875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2976003885269165, + "epoch": 0.1663716814159292, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.901967299367, + "kl": 0.002563281450420618, + "learning_rate": 4.1150442477876104e-07, + "loss": -0.3815, + "num_tokens": 1485444.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.3954286575317383, + "sampling/importance_sampling_ratio/mean": 1.0000526905059814, + "sampling/importance_sampling_ratio/min": 0.6176431775093079, + "sampling/sampling_logp_difference/max": 0.481844425201416, + "sampling/sampling_logp_difference/mean": 0.015158753842115402, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 233.0, + "completions/max_terminated_length": 233.0, + "completions/mean_length": 51.46875, + "completions/mean_terminated_length": 51.46875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2984507083892822, + "epoch": 0.168141592920354, + "frac_reward_zero_std": 0.5, + "grad_norm": 13.83890318477302, + "kl": 0.05415243282914162, + "learning_rate": 4.1592920353982295e-07, + "loss": 0.2832, + "num_tokens": 1500914.0, + "reward": 0.5, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7520718574523926, + "sampling/importance_sampling_ratio/mean": 1.0006399154663086, + "sampling/importance_sampling_ratio/min": 0.4704074263572693, + "sampling/sampling_logp_difference/max": 0.7541561126708984, + "sampling/sampling_logp_difference/mean": 0.017951540648937225, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 129.0, + "completions/max_terminated_length": 129.0, + "completions/mean_length": 32.59375, + "completions/mean_terminated_length": 32.59375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.13821932673454285, + "epoch": 0.16991150442477876, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19751930399724366, + "kl": 0.022612882778048515, + "learning_rate": 4.203539823008849e-07, + "loss": 0.0001, + "num_tokens": 1512440.0, + "reward": -1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7523033618927002, + "sampling/importance_sampling_ratio/mean": 0.9995698928833008, + "sampling/importance_sampling_ratio/min": 0.4824843406677246, + "sampling/sampling_logp_difference/max": 0.7288068532943726, + "sampling/sampling_logp_difference/mean": 0.02123548462986946, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 135.0, + "completions/max_terminated_length": 135.0, + "completions/mean_length": 21.15625, + "completions/mean_terminated_length": 21.15625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2681628465652466, + "epoch": 0.17168141592920355, + "frac_reward_zero_std": 0.25, + "grad_norm": 84.58976760954279, + "kl": 1.1065090894699097, + "learning_rate": 4.247787610619469e-07, + "loss": -0.0586, + "num_tokens": 1522786.0, + "reward": -0.03125, + "reward_std": 0.519389271736145, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.9697527885437012, + "sampling/importance_sampling_ratio/mean": 0.9997271299362183, + "sampling/importance_sampling_ratio/min": 0.0831858441233635, + "sampling/sampling_logp_difference/max": 2.486678123474121, + "sampling/sampling_logp_difference/mean": 0.025643471628427505, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 22.0, + "completions/max_terminated_length": 22.0, + "completions/mean_length": 16.875, + "completions/mean_terminated_length": 16.875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.06743040680885315, + "epoch": 0.17345132743362832, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.39896373700566834, + "kl": 0.010008303448557854, + "learning_rate": 4.2920353982300885e-07, + "loss": 0.0001, + "num_tokens": 1536762.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5342676639556885, + "sampling/importance_sampling_ratio/mean": 0.9996311068534851, + "sampling/importance_sampling_ratio/min": 0.6176677346229553, + "sampling/sampling_logp_difference/max": 0.48180460929870605, + "sampling/sampling_logp_difference/mean": 0.012232774868607521, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 39.625, + "completions/mean_terminated_length": 39.625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2937965393066406, + "epoch": 0.1752212389380531, + "frac_reward_zero_std": 0.75, + "grad_norm": 17.05345220718161, + "kl": 0.047442853450775146, + "learning_rate": 4.3362831858407076e-07, + "loss": 0.1838, + "num_tokens": 1551058.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.50980806350708, + "sampling/importance_sampling_ratio/mean": 0.9996635913848877, + "sampling/importance_sampling_ratio/min": 0.5629895925521851, + "sampling/sampling_logp_difference/max": 0.5744941234588623, + "sampling/sampling_logp_difference/mean": 0.012546870857477188, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 138.0, + "completions/mean_length": 43.921875, + "completions/mean_terminated_length": 43.921875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.25557002425193787, + "epoch": 0.17699115044247787, + "frac_reward_zero_std": 0.5, + "grad_norm": 23.867050724829383, + "kl": 0.012761028483510017, + "learning_rate": 4.380530973451327e-07, + "loss": -0.2337, + "num_tokens": 1569261.0, + "reward": 0.875, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0012588500976562, + "sampling/importance_sampling_ratio/min": 0.5261755585670471, + "sampling/sampling_logp_difference/max": 0.8514184951782227, + "sampling/sampling_logp_difference/mean": 0.017819223925471306, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 16.0, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.06638449430465698, + "epoch": 0.17876106194690267, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.44630179415515814, + "kl": 0.024098360911011696, + "learning_rate": 4.424778761061947e-07, + "loss": 0.0002, + "num_tokens": 1581645.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.2675453424453735, + "sampling/importance_sampling_ratio/mean": 0.999424397945404, + "sampling/importance_sampling_ratio/min": 0.6548995971679688, + "sampling/sampling_logp_difference/max": 0.42327332496643066, + "sampling/sampling_logp_difference/mean": 0.011951332911849022, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 36.296875, + "completions/mean_terminated_length": 36.296875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2074262946844101, + "epoch": 0.18053097345132743, + "frac_reward_zero_std": 0.75, + "grad_norm": 17.8257040036157, + "kl": 0.054125286638736725, + "learning_rate": 4.469026548672566e-07, + "loss": -0.144, + "num_tokens": 1593456.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6750662326812744, + "sampling/importance_sampling_ratio/mean": 0.9999299049377441, + "sampling/importance_sampling_ratio/min": 0.6054568886756897, + "sampling/sampling_logp_difference/max": 0.515852689743042, + "sampling/sampling_logp_difference/mean": 0.012604659423232079, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 44.046875, + "completions/mean_terminated_length": 44.046875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.32559850811958313, + "epoch": 0.18230088495575222, + "frac_reward_zero_std": 0.5, + "grad_norm": 32.22001959212673, + "kl": 0.13167370855808258, + "learning_rate": 4.5132743362831857e-07, + "loss": -0.3729, + "num_tokens": 1606947.0, + "reward": 0.25, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995090961456299, + "sampling/importance_sampling_ratio/min": 0.4825321435928345, + "sampling/sampling_logp_difference/max": 0.830096960067749, + "sampling/sampling_logp_difference/mean": 0.02871047519147396, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 59.21875, + "completions/mean_terminated_length": 59.21875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4233275055885315, + "epoch": 0.184070796460177, + "frac_reward_zero_std": 0.25, + "grad_norm": 32.05855820384328, + "kl": 0.05426112934947014, + "learning_rate": 4.557522123893805e-07, + "loss": -0.4524, + "num_tokens": 1620529.0, + "reward": 0.46875, + "reward_std": 0.5959457159042358, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9988217353820801, + "sampling/importance_sampling_ratio/min": 0.47307318449020386, + "sampling/sampling_logp_difference/max": 1.6695234775543213, + "sampling/sampling_logp_difference/mean": 0.026059508323669434, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 81.0, + "completions/max_terminated_length": 81.0, + "completions/mean_length": 16.578125, + "completions/mean_terminated_length": 16.578125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.14209511876106262, + "epoch": 0.18584070796460178, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0777847234694372, + "kl": 0.08551844954490662, + "learning_rate": 4.6017699115044245e-07, + "loss": 0.0008, + "num_tokens": 1636598.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2430133819580078, + "sampling/importance_sampling_ratio/mean": 1.001842737197876, + "sampling/importance_sampling_ratio/min": 0.42892172932624817, + "sampling/sampling_logp_difference/max": 0.8464808464050293, + "sampling/sampling_logp_difference/mean": 0.009883169084787369, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 35.625, + "completions/mean_terminated_length": 35.625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1706276834011078, + "epoch": 0.18761061946902655, + "frac_reward_zero_std": 0.75, + "grad_norm": 16.242571617604863, + "kl": 0.008568068966269493, + "learning_rate": 4.646017699115044e-07, + "loss": -0.3094, + "num_tokens": 1652014.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.7361836433410645, + "sampling/importance_sampling_ratio/mean": 0.9986557960510254, + "sampling/importance_sampling_ratio/min": 0.49548566341400146, + "sampling/sampling_logp_difference/max": 0.7022168040275574, + "sampling/sampling_logp_difference/mean": 0.017092259600758553, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 53.09375, + "completions/mean_terminated_length": 53.09375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2413337230682373, + "epoch": 0.18938053097345134, + "frac_reward_zero_std": 0.5, + "grad_norm": 22.783578598982626, + "kl": 0.0069497618824243546, + "learning_rate": 4.690265486725664e-07, + "loss": -0.2593, + "num_tokens": 1667156.0, + "reward": 0.875, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.5598926544189453, + "sampling/importance_sampling_ratio/mean": 0.9992170333862305, + "sampling/importance_sampling_ratio/min": 0.733043909072876, + "sampling/sampling_logp_difference/max": 0.44461703300476074, + "sampling/sampling_logp_difference/mean": 0.01676351949572563, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 30.765625, + "completions/mean_terminated_length": 30.765625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.17704860866069794, + "epoch": 0.1911504424778761, + "frac_reward_zero_std": 0.75, + "grad_norm": 22.062156584055394, + "kl": 0.015616398304700851, + "learning_rate": 4.734513274336283e-07, + "loss": 0.584, + "num_tokens": 1680021.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.6436699628829956, + "sampling/importance_sampling_ratio/mean": 1.0013000965118408, + "sampling/importance_sampling_ratio/min": 0.6871146559715271, + "sampling/sampling_logp_difference/max": 0.4969315528869629, + "sampling/sampling_logp_difference/mean": 0.013623833656311035, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 83.0625, + "completions/mean_terminated_length": 83.0625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2795030176639557, + "epoch": 0.1929203539823009, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.5345931936458213, + "kl": 0.0018449004273861647, + "learning_rate": 4.778761061946903e-07, + "loss": 0.0868, + "num_tokens": 1695865.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.4435425996780396, + "sampling/importance_sampling_ratio/mean": 0.9981499314308167, + "sampling/importance_sampling_ratio/min": 0.6196099519729614, + "sampling/sampling_logp_difference/max": 0.4786651134490967, + "sampling/sampling_logp_difference/mean": 0.01591462641954422, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 58.171875, + "completions/mean_terminated_length": 58.171875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2562200129032135, + "epoch": 0.19469026548672566, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02541889548481687, + "kl": 0.0014256872236728668, + "learning_rate": 4.823008849557521e-07, + "loss": 0.0, + "num_tokens": 1709716.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6915197372436523, + "sampling/importance_sampling_ratio/mean": 0.9971394538879395, + "sampling/importance_sampling_ratio/min": 0.535825788974762, + "sampling/sampling_logp_difference/max": 0.6239461898803711, + "sampling/sampling_logp_difference/mean": 0.017007606104016304, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 23.390625, + "completions/mean_terminated_length": 23.390625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.21091923117637634, + "epoch": 0.19646017699115045, + "frac_reward_zero_std": 0.75, + "grad_norm": 29.116072245725366, + "kl": 0.023805273696780205, + "learning_rate": 4.867256637168141e-07, + "loss": 0.4792, + "num_tokens": 1725309.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.825963020324707, + "sampling/importance_sampling_ratio/mean": 0.9973196387290955, + "sampling/importance_sampling_ratio/min": 0.6877333521842957, + "sampling/sampling_logp_difference/max": 0.6021075248718262, + "sampling/sampling_logp_difference/mean": 0.02469644322991371, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 41.171875, + "completions/mean_terminated_length": 41.171875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.337478369474411, + "epoch": 0.19823008849557522, + "frac_reward_zero_std": 0.5, + "grad_norm": 28.478295365787474, + "kl": 0.019370798021554947, + "learning_rate": 4.91150442477876e-07, + "loss": -0.373, + "num_tokens": 1746520.0, + "reward": 0.65625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.4072273969650269, + "sampling/importance_sampling_ratio/mean": 1.0000637769699097, + "sampling/importance_sampling_ratio/min": 0.5572885274887085, + "sampling/sampling_logp_difference/max": 0.5846721529960632, + "sampling/sampling_logp_difference/mean": 0.015861041843891144, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 67.5, + "completions/mean_terminated_length": 67.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.43532299995422363, + "epoch": 0.2, + "frac_reward_zero_std": 0.25, + "grad_norm": 11.864682675131935, + "kl": 0.018026825040578842, + "learning_rate": 4.95575221238938e-07, + "loss": -0.2313, + "num_tokens": 1765192.0, + "reward": -0.1875, + "reward_std": 0.6991121172904968, + "rewards/decision_reward_func/mean": -0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.565968632698059, + "sampling/importance_sampling_ratio/mean": 0.9986177682876587, + "sampling/importance_sampling_ratio/min": 0.6226667761802673, + "sampling/sampling_logp_difference/max": 0.4737436771392822, + "sampling/sampling_logp_difference/mean": 0.01891300082206726, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 72.5625, + "completions/mean_terminated_length": 72.5625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3272761106491089, + "epoch": 0.20176991150442478, + "frac_reward_zero_std": 0.5, + "grad_norm": 5.146958292258678, + "kl": 0.05377377197146416, + "learning_rate": 5e-07, + "loss": -0.005, + "num_tokens": 1782668.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0012102127075195, + "sampling/importance_sampling_ratio/min": 0.3370976150035858, + "sampling/sampling_logp_difference/max": 1.0873827934265137, + "sampling/sampling_logp_difference/mean": 0.024121500551700592, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 82.59375, + "completions/mean_terminated_length": 82.59375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.46006712317466736, + "epoch": 0.20353982300884957, + "frac_reward_zero_std": 0.25, + "grad_norm": 6.751666668869363, + "kl": 0.07440268248319626, + "learning_rate": 5.044247787610619e-07, + "loss": 0.0712, + "num_tokens": 1799058.0, + "reward": 0.90625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9985596537590027, + "sampling/importance_sampling_ratio/min": 0.4345816969871521, + "sampling/sampling_logp_difference/max": 0.972254753112793, + "sampling/sampling_logp_difference/mean": 0.019559338688850403, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 37.625, + "completions/mean_terminated_length": 37.625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2306581437587738, + "epoch": 0.20530973451327433, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09028985253936475, + "kl": 0.003620689269155264, + "learning_rate": 5.088495575221239e-07, + "loss": 0.0, + "num_tokens": 1815882.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6114362478256226, + "sampling/importance_sampling_ratio/mean": 0.9963452816009521, + "sampling/importance_sampling_ratio/min": 0.41382789611816406, + "sampling/sampling_logp_difference/max": 0.8823051452636719, + "sampling/sampling_logp_difference/mean": 0.019629651680588722, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 45.75, + "completions/mean_terminated_length": 45.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2207641899585724, + "epoch": 0.20707964601769913, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030791093669244915, + "kl": 0.001684409799054265, + "learning_rate": 5.132743362831859e-07, + "loss": 0.0, + "num_tokens": 1830506.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4853850603103638, + "sampling/importance_sampling_ratio/mean": 0.9980629086494446, + "sampling/importance_sampling_ratio/min": 0.444266676902771, + "sampling/sampling_logp_difference/max": 0.8113303184509277, + "sampling/sampling_logp_difference/mean": 0.028768619522452354, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 64.484375, + "completions/mean_terminated_length": 64.484375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3205970227718353, + "epoch": 0.2088495575221239, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03203958114580459, + "kl": 0.0018948880024254322, + "learning_rate": 5.176991150442478e-07, + "loss": 0.0, + "num_tokens": 1845561.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5698038339614868, + "sampling/importance_sampling_ratio/mean": 1.0007352828979492, + "sampling/importance_sampling_ratio/min": 0.6012983322143555, + "sampling/sampling_logp_difference/max": 0.5086641311645508, + "sampling/sampling_logp_difference/mean": 0.021763615310192108, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 993.0, + "completions/max_terminated_length": 993.0, + "completions/mean_length": 140.8125, + "completions/mean_terminated_length": 140.8125, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "entropy": 0.5416373014450073, + "epoch": 0.21061946902654868, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.007404796935460493, + "kl": 0.0005785170360468328, + "learning_rate": 5.221238938053097e-07, + "loss": 0.0, + "num_tokens": 1863789.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4254950284957886, + "sampling/importance_sampling_ratio/mean": 1.000152349472046, + "sampling/importance_sampling_ratio/min": 0.6323530077934265, + "sampling/sampling_logp_difference/max": 0.45830750465393066, + "sampling/sampling_logp_difference/mean": 0.01834021881222725, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 49.9375, + "completions/mean_terminated_length": 49.9375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1841767281293869, + "epoch": 0.21238938053097345, + "frac_reward_zero_std": 0.75, + "grad_norm": 14.962615729352883, + "kl": 0.005641619674861431, + "learning_rate": 5.265486725663717e-07, + "loss": 0.2669, + "num_tokens": 1878841.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.765735149383545, + "sampling/importance_sampling_ratio/mean": 0.9993394613265991, + "sampling/importance_sampling_ratio/min": 0.7040172815322876, + "sampling/sampling_logp_difference/max": 0.568567156791687, + "sampling/sampling_logp_difference/mean": 0.009488564915955067, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 44.046875, + "completions/mean_terminated_length": 44.046875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2508974075317383, + "epoch": 0.21415929203539824, + "frac_reward_zero_std": 0.75, + "grad_norm": 15.138540143090314, + "kl": 0.003447249997407198, + "learning_rate": 5.309734513274336e-07, + "loss": -0.2376, + "num_tokens": 1895772.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.6151796579360962, + "sampling/importance_sampling_ratio/mean": 0.9982805252075195, + "sampling/importance_sampling_ratio/min": 0.6492539644241333, + "sampling/sampling_logp_difference/max": 0.4794461727142334, + "sampling/sampling_logp_difference/mean": 0.017452435567975044, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 167.0, + "completions/max_terminated_length": 167.0, + "completions/mean_length": 38.140625, + "completions/mean_terminated_length": 38.140625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3044247627258301, + "epoch": 0.215929203539823, + "frac_reward_zero_std": 0.25, + "grad_norm": 19.479083260753757, + "kl": 0.01046132855117321, + "learning_rate": 5.353982300884956e-07, + "loss": 0.0352, + "num_tokens": 1911141.0, + "reward": 0.34375, + "reward_std": 0.5827301740646362, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.5219072103500366, + "sampling/importance_sampling_ratio/mean": 0.9994726181030273, + "sampling/importance_sampling_ratio/min": 0.5666710734367371, + "sampling/sampling_logp_difference/max": 0.5679762363433838, + "sampling/sampling_logp_difference/mean": 0.016208771616220474, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 105.75, + "completions/mean_terminated_length": 105.75, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.519478440284729, + "epoch": 0.2176991150442478, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019650278961170106, + "kl": 0.0011656515998765826, + "learning_rate": 5.398230088495575e-07, + "loss": 0.0, + "num_tokens": 1927541.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.8163784742355347, + "sampling/importance_sampling_ratio/mean": 1.000349998474121, + "sampling/importance_sampling_ratio/min": 0.6328713893890381, + "sampling/sampling_logp_difference/max": 0.5968446731567383, + "sampling/sampling_logp_difference/mean": 0.019682027399539948, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 106.96875, + "completions/mean_terminated_length": 106.96875, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "entropy": 0.45931580662727356, + "epoch": 0.21946902654867256, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.008681335336834708, + "kl": 0.0005357326008379459, + "learning_rate": 5.442477876106194e-07, + "loss": 0.0, + "num_tokens": 1944499.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.929969310760498, + "sampling/importance_sampling_ratio/mean": 1.0011227130889893, + "sampling/importance_sampling_ratio/min": 0.6561875939369202, + "sampling/sampling_logp_difference/max": 0.6575040817260742, + "sampling/sampling_logp_difference/mean": 0.01847759075462818, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 110.75, + "completions/mean_terminated_length": 110.75, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.37561601400375366, + "epoch": 0.22123893805309736, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.9753687076399795, + "kl": 0.0010785853955894709, + "learning_rate": 5.486725663716814e-07, + "loss": -0.0333, + "num_tokens": 1961459.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995502829551697, + "sampling/importance_sampling_ratio/min": 0.6624337434768677, + "sampling/sampling_logp_difference/max": 1.584089994430542, + "sampling/sampling_logp_difference/mean": 0.016993992030620575, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 70.53125, + "completions/mean_terminated_length": 70.53125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.31924882531166077, + "epoch": 0.22300884955752212, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031328937808989196, + "kl": 0.0014360514469444752, + "learning_rate": 5.530973451327434e-07, + "loss": 0.0, + "num_tokens": 1975653.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.8715254068374634, + "sampling/importance_sampling_ratio/mean": 1.0022432804107666, + "sampling/importance_sampling_ratio/min": 0.6149583458900452, + "sampling/sampling_logp_difference/max": 0.6267538070678711, + "sampling/sampling_logp_difference/mean": 0.025006473064422607, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 82.21875, + "completions/mean_terminated_length": 82.21875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3541611433029175, + "epoch": 0.2247787610619469, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013457624899861007, + "kl": 0.0013129812432453036, + "learning_rate": 5.575221238938052e-07, + "loss": 0.0, + "num_tokens": 1995827.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.574439525604248, + "sampling/importance_sampling_ratio/mean": 0.9993070960044861, + "sampling/importance_sampling_ratio/min": 0.5381397008895874, + "sampling/sampling_logp_difference/max": 0.6196370124816895, + "sampling/sampling_logp_difference/mean": 0.014719847589731216, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 67.015625, + "completions/mean_terminated_length": 67.015625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.44300490617752075, + "epoch": 0.22654867256637168, + "frac_reward_zero_std": 0.75, + "grad_norm": 6.134743959337715, + "kl": 0.0014869027072563767, + "learning_rate": 5.619469026548672e-07, + "loss": 0.2426, + "num_tokens": 2011460.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.8721041679382324, + "sampling/importance_sampling_ratio/mean": 0.9977869987487793, + "sampling/importance_sampling_ratio/min": 0.4702486991882324, + "sampling/sampling_logp_difference/max": 0.7544935941696167, + "sampling/sampling_logp_difference/mean": 0.023253358900547028, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 81.8125, + "completions/mean_terminated_length": 81.8125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2518032491207123, + "epoch": 0.22831858407079647, + "frac_reward_zero_std": 0.75, + "grad_norm": 8.862384196324069, + "kl": 0.0007762154564261436, + "learning_rate": 5.663716814159291e-07, + "loss": 0.1619, + "num_tokens": 2027848.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0013949871063232, + "sampling/importance_sampling_ratio/min": 0.5217262506484985, + "sampling/sampling_logp_difference/max": 1.1203207969665527, + "sampling/sampling_logp_difference/mean": 0.01854543387889862, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 71.625, + "completions/mean_terminated_length": 71.625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.299152135848999, + "epoch": 0.23008849557522124, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014935968388077706, + "kl": 0.0009952923282980919, + "learning_rate": 5.707964601769911e-07, + "loss": 0.0, + "num_tokens": 2042800.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6621967554092407, + "sampling/importance_sampling_ratio/mean": 1.0008690357208252, + "sampling/importance_sampling_ratio/min": 0.4874541759490967, + "sampling/sampling_logp_difference/max": 0.7185590267181396, + "sampling/sampling_logp_difference/mean": 0.018116779625415802, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 129.546875, + "completions/mean_terminated_length": 129.546875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4606068730354309, + "epoch": 0.23185840707964603, + "frac_reward_zero_std": 0.5, + "grad_norm": 6.560787920244211, + "kl": 0.001249688328243792, + "learning_rate": 5.752212389380531e-07, + "loss": 0.1933, + "num_tokens": 2060803.0, + "reward": 0.03125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5971571207046509, + "sampling/importance_sampling_ratio/mean": 1.0004818439483643, + "sampling/importance_sampling_ratio/min": 0.5036928653717041, + "sampling/sampling_logp_difference/max": 0.685788631439209, + "sampling/sampling_logp_difference/mean": 0.01681518740952015, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 84.25, + "completions/mean_terminated_length": 84.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3058205246925354, + "epoch": 0.2336283185840708, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01254687731931507, + "kl": 0.0007779281586408615, + "learning_rate": 5.79646017699115e-07, + "loss": 0.0, + "num_tokens": 2078243.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002914667129517, + "sampling/importance_sampling_ratio/min": 0.4855709969997406, + "sampling/sampling_logp_difference/max": 0.7224297523498535, + "sampling/sampling_logp_difference/mean": 0.02297021448612213, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 69.484375, + "completions/mean_terminated_length": 69.484375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.26227444410324097, + "epoch": 0.23539823008849559, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016136415805378446, + "kl": 0.0010202470002695918, + "learning_rate": 5.84070796460177e-07, + "loss": 0.0, + "num_tokens": 2096418.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.781939148902893, + "sampling/importance_sampling_ratio/mean": 0.9994803667068481, + "sampling/importance_sampling_ratio/min": 0.5584296584129333, + "sampling/sampling_logp_difference/max": 0.5826265811920166, + "sampling/sampling_logp_difference/mean": 0.014151913113892078, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 72.875, + "completions/mean_terminated_length": 72.875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.24580150842666626, + "epoch": 0.23716814159292035, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02503235510969517, + "kl": 0.001629705191589892, + "learning_rate": 5.88495575221239e-07, + "loss": 0.0, + "num_tokens": 2112890.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5154296159744263, + "sampling/importance_sampling_ratio/mean": 1.0017865896224976, + "sampling/importance_sampling_ratio/min": 0.6210728287696838, + "sampling/sampling_logp_difference/max": 0.4763069152832031, + "sampling/sampling_logp_difference/mean": 0.01634804531931877, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 75.90625, + "completions/mean_terminated_length": 75.90625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.5235762000083923, + "epoch": 0.23893805309734514, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01575589802905874, + "kl": 0.001007390907034278, + "learning_rate": 5.929203539823009e-07, + "loss": 0.0, + "num_tokens": 2129028.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.001007080078125, + "sampling/importance_sampling_ratio/min": 0.6624895334243774, + "sampling/sampling_logp_difference/max": 0.7274326086044312, + "sampling/sampling_logp_difference/mean": 0.02209286577999592, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 88.375, + "completions/mean_terminated_length": 88.375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.5046366453170776, + "epoch": 0.2407079646017699, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.578048939539401, + "kl": 0.0036787153221666813, + "learning_rate": 5.973451327433628e-07, + "loss": 0.1217, + "num_tokens": 2144508.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.4554598331451416, + "sampling/importance_sampling_ratio/mean": 0.9992860555648804, + "sampling/importance_sampling_ratio/min": 0.25643423199653625, + "sampling/sampling_logp_difference/max": 1.3608829975128174, + "sampling/sampling_logp_difference/mean": 0.022801248356699944, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 123.0, + "completions/max_terminated_length": 123.0, + "completions/mean_length": 33.5625, + "completions/mean_terminated_length": 33.5625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.18427319824695587, + "epoch": 0.2424778761061947, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03157405100169581, + "kl": 0.0014091174816712737, + "learning_rate": 6.017699115044248e-07, + "loss": 0.0, + "num_tokens": 2156688.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0007842779159546, + "sampling/importance_sampling_ratio/min": 0.627287745475769, + "sampling/sampling_logp_difference/max": 0.8145256042480469, + "sampling/sampling_logp_difference/mean": 0.017598610371351242, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 71.640625, + "completions/mean_terminated_length": 71.640625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3980875611305237, + "epoch": 0.24424778761061947, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016564111690985255, + "kl": 0.0008818708593025804, + "learning_rate": 6.061946902654867e-07, + "loss": 0.0, + "num_tokens": 2173097.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4036502838134766, + "sampling/importance_sampling_ratio/mean": 1.0016671419143677, + "sampling/importance_sampling_ratio/min": 0.6285651922225952, + "sampling/sampling_logp_difference/max": 0.4643155336380005, + "sampling/sampling_logp_difference/mean": 0.01826133392751217, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 81.828125, + "completions/mean_terminated_length": 81.828125, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "entropy": 0.4850948452949524, + "epoch": 0.24601769911504426, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01713387326277667, + "kl": 0.001043393975123763, + "learning_rate": 6.106194690265486e-07, + "loss": 0.0, + "num_tokens": 2188382.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.910315752029419, + "sampling/importance_sampling_ratio/mean": 0.9982123374938965, + "sampling/importance_sampling_ratio/min": 0.5476705431938171, + "sampling/sampling_logp_difference/max": 0.647268533706665, + "sampling/sampling_logp_difference/mean": 0.020284462720155716, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 99.71875, + "completions/mean_terminated_length": 99.71875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3546068072319031, + "epoch": 0.24778761061946902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01725404162883253, + "kl": 0.0017146074678748846, + "learning_rate": 6.150442477876105e-07, + "loss": 0.0, + "num_tokens": 2204524.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3572512865066528, + "sampling/importance_sampling_ratio/mean": 1.0001609325408936, + "sampling/importance_sampling_ratio/min": 0.4417930245399475, + "sampling/sampling_logp_difference/max": 0.8169138431549072, + "sampling/sampling_logp_difference/mean": 0.01794680394232273, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 172.0, + "completions/max_terminated_length": 172.0, + "completions/mean_length": 35.140625, + "completions/mean_terminated_length": 35.140625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.18727999925613403, + "epoch": 0.24955752212389382, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09529720865059939, + "kl": 0.0010563858086243272, + "learning_rate": 6.194690265486725e-07, + "loss": 0.0, + "num_tokens": 2216885.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9989814758300781, + "sampling/importance_sampling_ratio/min": 0.7282373905181885, + "sampling/sampling_logp_difference/max": 0.814521074295044, + "sampling/sampling_logp_difference/mean": 0.018082180991768837, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 16.328125, + "completions/mean_terminated_length": 16.328125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.06842117011547089, + "epoch": 0.2513274336283186, + "frac_reward_zero_std": 0.75, + "grad_norm": 9.075752721630387, + "kl": 0.004978060256689787, + "learning_rate": 6.238938053097345e-07, + "loss": -0.0013, + "num_tokens": 2230346.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.266401767730713, + "sampling/importance_sampling_ratio/mean": 1.0020692348480225, + "sampling/importance_sampling_ratio/min": 0.6714641451835632, + "sampling/sampling_logp_difference/max": 0.39829468727111816, + "sampling/sampling_logp_difference/mean": 0.010533824563026428, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 74.671875, + "completions/mean_terminated_length": 74.671875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3647806644439697, + "epoch": 0.25309734513274335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015980370154043362, + "kl": 0.0007940245559439063, + "learning_rate": 6.283185840707964e-07, + "loss": 0.0, + "num_tokens": 2245157.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.424361228942871, + "sampling/importance_sampling_ratio/mean": 1.000733494758606, + "sampling/importance_sampling_ratio/min": 0.7051947116851807, + "sampling/sampling_logp_difference/max": 0.353723406791687, + "sampling/sampling_logp_difference/mean": 0.015422477386891842, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 840.0, + "completions/max_terminated_length": 840.0, + "completions/mean_length": 141.390625, + "completions/mean_terminated_length": 141.390625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.5192233324050903, + "epoch": 0.25486725663716814, + "frac_reward_zero_std": 0.5, + "grad_norm": 10.03788217298924, + "kl": 0.0028998591005802155, + "learning_rate": 6.327433628318584e-07, + "loss": -0.0586, + "num_tokens": 2262686.0, + "reward": -0.15625, + "reward_std": 0.47978055477142334, + "rewards/decision_reward_func/mean": -0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.6302465200424194, + "sampling/importance_sampling_ratio/mean": 0.9998656511306763, + "sampling/importance_sampling_ratio/min": 0.5490341782569885, + "sampling/sampling_logp_difference/max": 0.5995945930480957, + "sampling/sampling_logp_difference/mean": 0.019007448107004166, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 101.109375, + "completions/mean_terminated_length": 101.109375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3655405640602112, + "epoch": 0.25663716814159293, + "frac_reward_zero_std": 0.75, + "grad_norm": 6.607652442786013, + "kl": 0.002844990696758032, + "learning_rate": 6.371681415929203e-07, + "loss": 0.0627, + "num_tokens": 2279573.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995540380477905, + "sampling/importance_sampling_ratio/min": 0.6338145136833191, + "sampling/sampling_logp_difference/max": 1.1002683639526367, + "sampling/sampling_logp_difference/mean": 0.021086499094963074, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 141.0, + "completions/max_terminated_length": 141.0, + "completions/mean_length": 48.328125, + "completions/mean_terminated_length": 48.328125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2803422212600708, + "epoch": 0.2584070796460177, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02802108714282455, + "kl": 0.0013906147796660662, + "learning_rate": 6.415929203539822e-07, + "loss": 0.0, + "num_tokens": 2295178.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.2905248403549194, + "sampling/importance_sampling_ratio/mean": 0.9989687204360962, + "sampling/importance_sampling_ratio/min": 0.4375634789466858, + "sampling/sampling_logp_difference/max": 0.8265335559844971, + "sampling/sampling_logp_difference/mean": 0.015749461948871613, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 187.0, + "completions/max_terminated_length": 187.0, + "completions/mean_length": 62.203125, + "completions/mean_terminated_length": 62.203125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2617567479610443, + "epoch": 0.26017699115044246, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024533085491038015, + "kl": 0.001395686762407422, + "learning_rate": 6.460176991150442e-07, + "loss": 0.0, + "num_tokens": 2316247.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5753140449523926, + "sampling/importance_sampling_ratio/mean": 0.9995524287223816, + "sampling/importance_sampling_ratio/min": 0.7271898984909058, + "sampling/sampling_logp_difference/max": 0.4544546604156494, + "sampling/sampling_logp_difference/mean": 0.011754988692700863, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.84375, + "completions/mean_terminated_length": 15.84375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.07275533676147461, + "epoch": 0.26194690265486725, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09102567414694034, + "kl": 0.0017056685173884034, + "learning_rate": 6.504424778761062e-07, + "loss": 0.0, + "num_tokens": 2330141.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.197696566581726, + "sampling/importance_sampling_ratio/mean": 1.000204086303711, + "sampling/importance_sampling_ratio/min": 0.8154329061508179, + "sampling/sampling_logp_difference/max": 0.20403611660003662, + "sampling/sampling_logp_difference/mean": 0.0072796279564499855, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 38.5625, + "completions/mean_terminated_length": 38.5625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.14959517121315002, + "epoch": 0.26371681415929205, + "frac_reward_zero_std": 0.75, + "grad_norm": 12.158302193342701, + "kl": 0.009750444442033768, + "learning_rate": 6.548672566371681e-07, + "loss": -0.1449, + "num_tokens": 2343185.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.7331476211547852, + "sampling/importance_sampling_ratio/mean": 0.9996535778045654, + "sampling/importance_sampling_ratio/min": 0.255138635635376, + "sampling/sampling_logp_difference/max": 1.36594820022583, + "sampling/sampling_logp_difference/mean": 0.016084929928183556, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 138.0, + "completions/max_terminated_length": 138.0, + "completions/mean_length": 35.59375, + "completions/mean_terminated_length": 35.59375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1963880956172943, + "epoch": 0.26548672566371684, + "frac_reward_zero_std": 0.75, + "grad_norm": 7.1218924747110615, + "kl": 0.005532016046345234, + "learning_rate": 6.592920353982301e-07, + "loss": -0.1457, + "num_tokens": 2358039.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004479885101318, + "sampling/importance_sampling_ratio/min": 0.5923722982406616, + "sampling/sampling_logp_difference/max": 1.9722204208374023, + "sampling/sampling_logp_difference/mean": 0.013359785079956055, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 78.109375, + "completions/mean_terminated_length": 78.109375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2529459297657013, + "epoch": 0.2672566371681416, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044873586553780806, + "kl": 0.005619180388748646, + "learning_rate": 6.637168141592921e-07, + "loss": 0.0, + "num_tokens": 2373422.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9478777647018433, + "sampling/importance_sampling_ratio/mean": 1.0008846521377563, + "sampling/importance_sampling_ratio/min": 0.45570361614227295, + "sampling/sampling_logp_difference/max": 0.7859126925468445, + "sampling/sampling_logp_difference/mean": 0.015898194164037704, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 65.640625, + "completions/mean_terminated_length": 65.640625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.30609267950057983, + "epoch": 0.26902654867256637, + "frac_reward_zero_std": 0.75, + "grad_norm": 11.061460167266631, + "kl": 0.010558098554611206, + "learning_rate": 6.68141592920354e-07, + "loss": -0.0973, + "num_tokens": 2389335.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5556261539459229, + "sampling/importance_sampling_ratio/mean": 1.0001894235610962, + "sampling/importance_sampling_ratio/min": 0.572333037853241, + "sampling/sampling_logp_difference/max": 0.5580341815948486, + "sampling/sampling_logp_difference/mean": 0.014833297580480576, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 162.0, + "completions/max_terminated_length": 162.0, + "completions/mean_length": 36.953125, + "completions/mean_terminated_length": 36.953125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.216139018535614, + "epoch": 0.27079646017699116, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14097604253434792, + "kl": 0.017395276576280594, + "learning_rate": 6.72566371681416e-07, + "loss": 0.0001, + "num_tokens": 2404404.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.879258394241333, + "sampling/importance_sampling_ratio/mean": 0.9984070062637329, + "sampling/importance_sampling_ratio/min": 0.6819742321968079, + "sampling/sampling_logp_difference/max": 0.6308772563934326, + "sampling/sampling_logp_difference/mean": 0.012710087932646275, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 48.265625, + "completions/mean_terminated_length": 48.265625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1130940243601799, + "epoch": 0.27256637168141595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12575044725568155, + "kl": 0.007728739641606808, + "learning_rate": 6.769911504424779e-07, + "loss": 0.0, + "num_tokens": 2420661.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4918367862701416, + "sampling/importance_sampling_ratio/mean": 0.9995026588439941, + "sampling/importance_sampling_ratio/min": 0.6138349175453186, + "sampling/sampling_logp_difference/max": 0.48802924156188965, + "sampling/sampling_logp_difference/mean": 0.020763784646987915, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 59.953125, + "completions/mean_terminated_length": 59.953125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.21759197115898132, + "epoch": 0.2743362831858407, + "frac_reward_zero_std": 0.5, + "grad_norm": 18.00910866136154, + "kl": 0.010943894274532795, + "learning_rate": 6.814159292035397e-07, + "loss": -0.221, + "num_tokens": 2433986.0, + "reward": 0.84375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.969313383102417, + "sampling/importance_sampling_ratio/mean": 1.002643346786499, + "sampling/importance_sampling_ratio/min": 0.538426399230957, + "sampling/sampling_logp_difference/max": 0.677685022354126, + "sampling/sampling_logp_difference/mean": 0.024278851225972176, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 41.84375, + "completions/mean_terminated_length": 41.84375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.24851931631565094, + "epoch": 0.2761061946902655, + "frac_reward_zero_std": 0.75, + "grad_norm": 19.56733698517216, + "kl": 0.0038888424169272184, + "learning_rate": 6.858407079646017e-07, + "loss": -0.2139, + "num_tokens": 2447512.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.663854956626892, + "sampling/importance_sampling_ratio/mean": 1.0001685619354248, + "sampling/importance_sampling_ratio/min": 0.46025779843330383, + "sampling/sampling_logp_difference/max": 0.7759685516357422, + "sampling/sampling_logp_difference/mean": 0.012126958929002285, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 82.734375, + "completions/mean_terminated_length": 82.734375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.24554727971553802, + "epoch": 0.2778761061946903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02984062761107293, + "kl": 0.001689649187028408, + "learning_rate": 6.902654867256636e-07, + "loss": 0.0, + "num_tokens": 2464983.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4411574602127075, + "sampling/importance_sampling_ratio/mean": 1.000152349472046, + "sampling/importance_sampling_ratio/min": 0.6802230477333069, + "sampling/sampling_logp_difference/max": 0.38533449172973633, + "sampling/sampling_logp_difference/mean": 0.013160878792405128, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 88.640625, + "completions/mean_terminated_length": 88.640625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2844574451446533, + "epoch": 0.27964601769911507, + "frac_reward_zero_std": 0.5, + "grad_norm": 8.589166739359197, + "kl": 0.0038004093803465366, + "learning_rate": 6.946902654867256e-07, + "loss": 0.0108, + "num_tokens": 2481360.0, + "reward": 0.84375, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.5664699077606201, + "sampling/importance_sampling_ratio/mean": 0.9985457062721252, + "sampling/importance_sampling_ratio/min": 0.5478703379631042, + "sampling/sampling_logp_difference/max": 0.6017166376113892, + "sampling/sampling_logp_difference/mean": 0.017418205738067627, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 29.828125, + "completions/mean_terminated_length": 29.828125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.13992829620838165, + "epoch": 0.2814159292035398, + "frac_reward_zero_std": 0.75, + "grad_norm": 22.822138577571543, + "kl": 0.007147709373384714, + "learning_rate": 6.991150442477876e-07, + "loss": -0.2478, + "num_tokens": 2494325.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.9513019323349, + "sampling/importance_sampling_ratio/mean": 0.9980819225311279, + "sampling/importance_sampling_ratio/min": 0.49336642026901245, + "sampling/sampling_logp_difference/max": 0.70650315284729, + "sampling/sampling_logp_difference/mean": 0.01531197689473629, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 151.0, + "completions/max_terminated_length": 151.0, + "completions/mean_length": 36.28125, + "completions/mean_terminated_length": 36.28125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1655530482530594, + "epoch": 0.2831858407079646, + "frac_reward_zero_std": 0.75, + "grad_norm": 9.220842673661625, + "kl": 0.005232499912381172, + "learning_rate": 7.035398230088495e-07, + "loss": 0.0035, + "num_tokens": 2508055.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6507823467254639, + "sampling/importance_sampling_ratio/mean": 1.0039724111557007, + "sampling/importance_sampling_ratio/min": 0.5558677315711975, + "sampling/sampling_logp_difference/max": 0.5872249603271484, + "sampling/sampling_logp_difference/mean": 0.01936100609600544, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 148.0, + "completions/max_terminated_length": 148.0, + "completions/mean_length": 37.234375, + "completions/mean_terminated_length": 37.234375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.14813996851444244, + "epoch": 0.2849557522123894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06153866209942643, + "kl": 0.0026678910944610834, + "learning_rate": 7.079646017699115e-07, + "loss": 0.0, + "num_tokens": 2521126.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.862612009048462, + "sampling/importance_sampling_ratio/mean": 0.9947205781936646, + "sampling/importance_sampling_ratio/min": 0.5880116820335388, + "sampling/sampling_logp_difference/max": 0.621979832649231, + "sampling/sampling_logp_difference/mean": 0.024796247482299805, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 97.875, + "completions/mean_terminated_length": 97.875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4262969493865967, + "epoch": 0.2867256637168142, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.013023578679145853, + "kl": 0.0014449837617576122, + "learning_rate": 7.123893805309734e-07, + "loss": 0.0, + "num_tokens": 2538334.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.434495449066162, + "sampling/importance_sampling_ratio/mean": 0.9998641014099121, + "sampling/importance_sampling_ratio/min": 0.613407552242279, + "sampling/sampling_logp_difference/max": 0.4887256622314453, + "sampling/sampling_logp_difference/mean": 0.017236698418855667, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 48.375, + "completions/mean_terminated_length": 48.375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.21288928389549255, + "epoch": 0.2884955752212389, + "frac_reward_zero_std": 0.75, + "grad_norm": 15.800928761698737, + "kl": 0.004314340651035309, + "learning_rate": 7.168141592920353e-07, + "loss": 0.37, + "num_tokens": 2554374.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.3523565530776978, + "sampling/importance_sampling_ratio/mean": 1.0010709762573242, + "sampling/importance_sampling_ratio/min": 0.6108425259590149, + "sampling/sampling_logp_difference/max": 0.4929161071777344, + "sampling/sampling_logp_difference/mean": 0.012879086658358574, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 97.3125, + "completions/mean_terminated_length": 97.3125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.21863287687301636, + "epoch": 0.2902654867256637, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01218772305508651, + "kl": 0.0015893366653472185, + "learning_rate": 7.212389380530973e-07, + "loss": 0.0, + "num_tokens": 2569834.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000678539276123, + "sampling/importance_sampling_ratio/min": 0.5868110060691833, + "sampling/sampling_logp_difference/max": 0.9002819061279297, + "sampling/sampling_logp_difference/mean": 0.012089526280760765, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 85.046875, + "completions/mean_terminated_length": 85.046875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.303574800491333, + "epoch": 0.2920353982300885, + "frac_reward_zero_std": 0.5, + "grad_norm": 5.2945564536762735, + "kl": 0.003153447061777115, + "learning_rate": 7.256637168141593e-07, + "loss": 0.1266, + "num_tokens": 2587805.0, + "reward": 0.0, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.7874300479888916, + "sampling/importance_sampling_ratio/mean": 0.9995529651641846, + "sampling/importance_sampling_ratio/min": 0.6549264788627625, + "sampling/sampling_logp_difference/max": 0.5807788372039795, + "sampling/sampling_logp_difference/mean": 0.020131688565015793, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 62.4375, + "completions/mean_terminated_length": 62.4375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3352992534637451, + "epoch": 0.2938053097345133, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05527729473556852, + "kl": 0.006250219885259867, + "learning_rate": 7.300884955752212e-07, + "loss": 0.0, + "num_tokens": 2603433.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.315501093864441, + "sampling/importance_sampling_ratio/mean": 0.9993723630905151, + "sampling/importance_sampling_ratio/min": 0.65906822681427, + "sampling/sampling_logp_difference/max": 0.4169282913208008, + "sampling/sampling_logp_difference/mean": 0.01699444092810154, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 645.0, + "completions/max_terminated_length": 645.0, + "completions/mean_length": 122.328125, + "completions/mean_terminated_length": 122.328125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.34051042795181274, + "epoch": 0.29557522123893804, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.2065456771565835, + "kl": 0.004882376175373793, + "learning_rate": 7.345132743362832e-07, + "loss": 0.1907, + "num_tokens": 2620878.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.4985473155975342, + "sampling/importance_sampling_ratio/mean": 0.9992001056671143, + "sampling/importance_sampling_ratio/min": 0.5491763353347778, + "sampling/sampling_logp_difference/max": 0.5993356704711914, + "sampling/sampling_logp_difference/mean": 0.015716129913926125, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 83.671875, + "completions/mean_terminated_length": 83.671875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4155896306037903, + "epoch": 0.2973451327433628, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02412913910928674, + "kl": 0.002413155511021614, + "learning_rate": 7.389380530973452e-07, + "loss": 0.0, + "num_tokens": 2635865.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5505778789520264, + "sampling/importance_sampling_ratio/mean": 0.9991369247436523, + "sampling/importance_sampling_ratio/min": 0.5488505959510803, + "sampling/sampling_logp_difference/max": 0.5999289751052856, + "sampling/sampling_logp_difference/mean": 0.018513940274715424, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 95.734375, + "completions/mean_terminated_length": 95.734375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.31833508610725403, + "epoch": 0.2991150442477876, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018637248678998186, + "kl": 0.0012480122968554497, + "learning_rate": 7.433628318584071e-07, + "loss": 0.0, + "num_tokens": 2652520.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5856602191925049, + "sampling/importance_sampling_ratio/mean": 1.0013822317123413, + "sampling/importance_sampling_ratio/min": 0.573755145072937, + "sampling/sampling_logp_difference/max": 0.55555260181427, + "sampling/sampling_logp_difference/mean": 0.015990041196346283, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 66.3125, + "completions/mean_terminated_length": 66.3125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.36399298906326294, + "epoch": 0.3008849557522124, + "frac_reward_zero_std": 0.5, + "grad_norm": 11.85054564249182, + "kl": 0.0069623468443751335, + "learning_rate": 7.477876106194691e-07, + "loss": 0.3686, + "num_tokens": 2670876.0, + "reward": 0.625, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.423884391784668, + "sampling/importance_sampling_ratio/mean": 0.999387264251709, + "sampling/importance_sampling_ratio/min": 0.5508221983909607, + "sampling/sampling_logp_difference/max": 0.5963431596755981, + "sampling/sampling_logp_difference/mean": 0.01819230616092682, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 65.265625, + "completions/mean_terminated_length": 65.265625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3241598606109619, + "epoch": 0.30265486725663715, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03064381044639263, + "kl": 0.0015449854545295238, + "learning_rate": 7.522123893805308e-07, + "loss": 0.0, + "num_tokens": 2690733.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9951161742210388, + "sampling/importance_sampling_ratio/min": 0.5531994104385376, + "sampling/sampling_logp_difference/max": 0.7696962356567383, + "sampling/sampling_logp_difference/mean": 0.02130197361111641, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 60.546875, + "completions/mean_terminated_length": 60.546875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.34832802414894104, + "epoch": 0.30442477876106194, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01942212919039001, + "kl": 0.0010704933665692806, + "learning_rate": 7.566371681415928e-07, + "loss": 0.0, + "num_tokens": 2710400.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3826576471328735, + "sampling/importance_sampling_ratio/mean": 0.9970792531967163, + "sampling/importance_sampling_ratio/min": 0.6549003720283508, + "sampling/sampling_logp_difference/max": 0.42327213287353516, + "sampling/sampling_logp_difference/mean": 0.02165486104786396, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 99.46875, + "completions/mean_terminated_length": 99.46875, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.5507562160491943, + "epoch": 0.30619469026548674, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.2067772660198575, + "kl": 0.0015644296072423458, + "learning_rate": 7.610619469026548e-07, + "loss": 0.1168, + "num_tokens": 2725870.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6677830219268799, + "sampling/importance_sampling_ratio/mean": 0.9991893768310547, + "sampling/importance_sampling_ratio/min": 0.3073960840702057, + "sampling/sampling_logp_difference/max": 1.1796181201934814, + "sampling/sampling_logp_difference/mean": 0.0195973701775074, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 65.59375, + "completions/mean_terminated_length": 65.59375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2421538382768631, + "epoch": 0.30796460176991153, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029068109786864894, + "kl": 0.0013104917015880346, + "learning_rate": 7.654867256637167e-07, + "loss": 0.0, + "num_tokens": 2740260.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995702505111694, + "sampling/importance_sampling_ratio/min": 0.6254555583000183, + "sampling/sampling_logp_difference/max": 0.7899044752120972, + "sampling/sampling_logp_difference/mean": 0.022774621844291687, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 75.921875, + "completions/mean_terminated_length": 75.921875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3197605013847351, + "epoch": 0.30973451327433627, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.6400337164230887, + "kl": 0.0015748351579532027, + "learning_rate": 7.699115044247787e-07, + "loss": -0.0426, + "num_tokens": 2753055.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5528416633605957, + "sampling/importance_sampling_ratio/mean": 0.9991353750228882, + "sampling/importance_sampling_ratio/min": 0.6129763126373291, + "sampling/sampling_logp_difference/max": 0.4894289970397949, + "sampling/sampling_logp_difference/mean": 0.013405116274952888, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 113.0, + "completions/max_terminated_length": 113.0, + "completions/mean_length": 34.75, + "completions/mean_terminated_length": 34.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1282852590084076, + "epoch": 0.31150442477876106, + "frac_reward_zero_std": 0.75, + "grad_norm": 5.965701958805946, + "kl": 0.0016409424133598804, + "learning_rate": 7.743362831858407e-07, + "loss": 0.0365, + "num_tokens": 2766895.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.306551456451416, + "sampling/importance_sampling_ratio/mean": 1.00221848487854, + "sampling/importance_sampling_ratio/min": 0.6547200679779053, + "sampling/sampling_logp_difference/max": 0.42354750633239746, + "sampling/sampling_logp_difference/mean": 0.012759683653712273, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 86.859375, + "completions/mean_terminated_length": 86.859375, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "entropy": 0.3257979154586792, + "epoch": 0.31327433628318585, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.1971454945435758, + "kl": 0.0011299046454951167, + "learning_rate": 7.787610619469026e-07, + "loss": 0.0277, + "num_tokens": 2784630.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.28737211227417, + "sampling/importance_sampling_ratio/mean": 1.0015079975128174, + "sampling/importance_sampling_ratio/min": 0.6338574886322021, + "sampling/sampling_logp_difference/max": 0.4559311866760254, + "sampling/sampling_logp_difference/mean": 0.014637185260653496, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 35.921875, + "completions/mean_terminated_length": 35.921875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.24986636638641357, + "epoch": 0.31504424778761064, + "frac_reward_zero_std": 0.75, + "grad_norm": 7.890195329991731, + "kl": 0.006909585557878017, + "learning_rate": 7.831858407079646e-07, + "loss": 0.2545, + "num_tokens": 2797281.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.8822228908538818, + "sampling/importance_sampling_ratio/mean": 0.999498724937439, + "sampling/importance_sampling_ratio/min": 0.277087539434433, + "sampling/sampling_logp_difference/max": 1.2834217548370361, + "sampling/sampling_logp_difference/mean": 0.01401694305241108, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 71.921875, + "completions/mean_terminated_length": 71.921875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3881117105484009, + "epoch": 0.3168141592920354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0174497927871592, + "kl": 0.001206007320433855, + "learning_rate": 7.876106194690266e-07, + "loss": 0.0, + "num_tokens": 2812412.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.54652738571167, + "sampling/importance_sampling_ratio/mean": 1.0000944137573242, + "sampling/importance_sampling_ratio/min": 0.4926236569881439, + "sampling/sampling_logp_difference/max": 0.7080097198486328, + "sampling/sampling_logp_difference/mean": 0.01670251041650772, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 81.234375, + "completions/mean_terminated_length": 81.234375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.37518447637557983, + "epoch": 0.3185840707964602, + "frac_reward_zero_std": 0.75, + "grad_norm": 11.298833240783317, + "kl": 0.0012631581630557775, + "learning_rate": 7.920353982300884e-07, + "loss": 0.1495, + "num_tokens": 2829723.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9986304640769958, + "sampling/importance_sampling_ratio/min": 0.6893054246902466, + "sampling/sampling_logp_difference/max": 0.7487752437591553, + "sampling/sampling_logp_difference/mean": 0.01565636694431305, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 68.59375, + "completions/mean_terminated_length": 68.59375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.19516851007938385, + "epoch": 0.32035398230088497, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022769574972825203, + "kl": 0.0016036881133913994, + "learning_rate": 7.964601769911504e-07, + "loss": 0.0, + "num_tokens": 2843313.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.887073040008545, + "sampling/importance_sampling_ratio/mean": 0.9991527795791626, + "sampling/importance_sampling_ratio/min": 0.5863591432571411, + "sampling/sampling_logp_difference/max": 0.6350269317626953, + "sampling/sampling_logp_difference/mean": 0.019246196374297142, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 39.78125, + "completions/mean_terminated_length": 39.78125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2743300795555115, + "epoch": 0.32212389380530976, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03835264627824913, + "kl": 0.0019648310262709856, + "learning_rate": 8.008849557522124e-07, + "loss": 0.0, + "num_tokens": 2858579.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.516083836555481, + "sampling/importance_sampling_ratio/mean": 1.0001496076583862, + "sampling/importance_sampling_ratio/min": 0.688202440738678, + "sampling/sampling_logp_difference/max": 0.4161306619644165, + "sampling/sampling_logp_difference/mean": 0.014625566080212593, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 90.515625, + "completions/mean_terminated_length": 90.515625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.40749549865722656, + "epoch": 0.3238938053097345, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021686218872358942, + "kl": 0.0016722225118428469, + "learning_rate": 8.053097345132743e-07, + "loss": 0.0, + "num_tokens": 2875172.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6169039011001587, + "sampling/importance_sampling_ratio/mean": 1.0007729530334473, + "sampling/importance_sampling_ratio/min": 0.6497799158096313, + "sampling/sampling_logp_difference/max": 0.4805130958557129, + "sampling/sampling_logp_difference/mean": 0.014663636684417725, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 96.640625, + "completions/mean_terminated_length": 96.640625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4288424253463745, + "epoch": 0.3256637168141593, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032833059064123725, + "kl": 0.0040725404396653175, + "learning_rate": 8.097345132743363e-07, + "loss": 0.0, + "num_tokens": 2891581.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0008609294891357, + "sampling/importance_sampling_ratio/min": 0.6394206285476685, + "sampling/sampling_logp_difference/max": 0.7237358093261719, + "sampling/sampling_logp_difference/mean": 0.02272997982800007, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 82.953125, + "completions/mean_terminated_length": 82.953125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4191206097602844, + "epoch": 0.3274336283185841, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017823889993753107, + "kl": 0.0016019068425521255, + "learning_rate": 8.141592920353983e-07, + "loss": 0.0, + "num_tokens": 2906906.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3651630878448486, + "sampling/importance_sampling_ratio/mean": 0.998401403427124, + "sampling/importance_sampling_ratio/min": 0.6447701454162598, + "sampling/sampling_logp_difference/max": 0.4388613700866699, + "sampling/sampling_logp_difference/mean": 0.020062603056430817, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 67.65625, + "completions/mean_terminated_length": 67.65625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3816743493080139, + "epoch": 0.3292035398230089, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018024805681682764, + "kl": 0.0013134465552866459, + "learning_rate": 8.185840707964602e-07, + "loss": 0.0, + "num_tokens": 2922388.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.706222653388977, + "sampling/importance_sampling_ratio/mean": 0.9995259046554565, + "sampling/importance_sampling_ratio/min": 0.5038576722145081, + "sampling/sampling_logp_difference/max": 0.6854615211486816, + "sampling/sampling_logp_difference/mean": 0.016284022480249405, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 88.015625, + "completions/mean_terminated_length": 88.015625, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.4469081163406372, + "epoch": 0.3309734513274336, + "frac_reward_zero_std": 0.5, + "grad_norm": 6.371433628774608, + "kl": 0.00285714166238904, + "learning_rate": 8.230088495575221e-07, + "loss": -0.0155, + "num_tokens": 2938437.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.4655020236968994, + "sampling/importance_sampling_ratio/mean": 0.9996187686920166, + "sampling/importance_sampling_ratio/min": 0.6309859752655029, + "sampling/sampling_logp_difference/max": 0.46047163009643555, + "sampling/sampling_logp_difference/mean": 0.01801963523030281, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 96.90625, + "completions/mean_terminated_length": 96.90625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.21593278646469116, + "epoch": 0.3327433628318584, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.2014978144866415, + "kl": 0.0016711852513253689, + "learning_rate": 8.274336283185839e-07, + "loss": 0.0088, + "num_tokens": 2955631.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.4248566627502441, + "sampling/importance_sampling_ratio/mean": 1.0010021924972534, + "sampling/importance_sampling_ratio/min": 0.3327132761478424, + "sampling/sampling_logp_difference/max": 1.1004741191864014, + "sampling/sampling_logp_difference/mean": 0.015623951330780983, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 96.140625, + "completions/mean_terminated_length": 96.140625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.45695868134498596, + "epoch": 0.3345132743362832, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.4257601631699997, + "kl": 0.0024920753203332424, + "learning_rate": 8.318584070796459e-07, + "loss": 0.0294, + "num_tokens": 2972056.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6508358716964722, + "sampling/importance_sampling_ratio/mean": 1.0013859272003174, + "sampling/importance_sampling_ratio/min": 0.5403315424919128, + "sampling/sampling_logp_difference/max": 0.615572452545166, + "sampling/sampling_logp_difference/mean": 0.019048312678933144, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 37.03125, + "completions/mean_terminated_length": 37.03125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.13859203457832336, + "epoch": 0.336283185840708, + "frac_reward_zero_std": 0.75, + "grad_norm": 8.097821127529105, + "kl": 0.005911725573241711, + "learning_rate": 8.362831858407079e-07, + "loss": 0.2139, + "num_tokens": 2985642.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4755970239639282, + "sampling/importance_sampling_ratio/mean": 1.0006300210952759, + "sampling/importance_sampling_ratio/min": 0.6624336838722229, + "sampling/sampling_logp_difference/max": 0.41183483600616455, + "sampling/sampling_logp_difference/mean": 0.01723603904247284, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 53.203125, + "completions/mean_terminated_length": 53.203125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3490672707557678, + "epoch": 0.3380530973451327, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07868897743366499, + "kl": 0.005042393691837788, + "learning_rate": 8.407079646017698e-07, + "loss": 0.0, + "num_tokens": 2999239.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3826329708099365, + "sampling/importance_sampling_ratio/mean": 0.9992291927337646, + "sampling/importance_sampling_ratio/min": 0.5984382033348083, + "sampling/sampling_logp_difference/max": 0.5134320259094238, + "sampling/sampling_logp_difference/mean": 0.021959085017442703, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 71.734375, + "completions/mean_terminated_length": 71.734375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.307967871427536, + "epoch": 0.3398230088495575, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025977733696619907, + "kl": 0.0022195237688720226, + "learning_rate": 8.451327433628318e-07, + "loss": 0.0, + "num_tokens": 3016038.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5421792268753052, + "sampling/importance_sampling_ratio/mean": 1.0004302263259888, + "sampling/importance_sampling_ratio/min": 0.5187493562698364, + "sampling/sampling_logp_difference/max": 0.656334400177002, + "sampling/sampling_logp_difference/mean": 0.012343363836407661, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 65.75, + "completions/mean_terminated_length": 65.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3567613661289215, + "epoch": 0.3415929203539823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0460271782944802, + "kl": 0.016741083934903145, + "learning_rate": 8.495575221238938e-07, + "loss": 0.0001, + "num_tokens": 3031846.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0010594129562378, + "sampling/importance_sampling_ratio/min": 0.5105498433113098, + "sampling/sampling_logp_difference/max": 0.7371149063110352, + "sampling/sampling_logp_difference/mean": 0.019204776734113693, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 66.328125, + "completions/mean_terminated_length": 66.328125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3064947724342346, + "epoch": 0.3433628318584071, + "frac_reward_zero_std": 0.75, + "grad_norm": 5.828728528219838, + "kl": 0.003697988111525774, + "learning_rate": 8.539823008849557e-07, + "loss": -0.1107, + "num_tokens": 3046859.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001063346862793, + "sampling/importance_sampling_ratio/min": 0.6153205633163452, + "sampling/sampling_logp_difference/max": 0.8466815948486328, + "sampling/sampling_logp_difference/mean": 0.01753907836973667, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 95.890625, + "completions/mean_terminated_length": 95.890625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.45068109035491943, + "epoch": 0.34513274336283184, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027973118552110467, + "kl": 0.002571109915152192, + "learning_rate": 8.584070796460177e-07, + "loss": 0.0, + "num_tokens": 3062788.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000136137008667, + "sampling/importance_sampling_ratio/min": 0.6952780485153198, + "sampling/sampling_logp_difference/max": 0.8257927894592285, + "sampling/sampling_logp_difference/mean": 0.015602972358465195, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 149.0, + "completions/max_terminated_length": 149.0, + "completions/mean_length": 47.3125, + "completions/mean_terminated_length": 47.3125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.29853731393814087, + "epoch": 0.34690265486725663, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07086573487077374, + "kl": 0.035713452845811844, + "learning_rate": 8.628318584070797e-07, + "loss": 0.0001, + "num_tokens": 3076456.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4578688144683838, + "sampling/importance_sampling_ratio/mean": 0.9997751712799072, + "sampling/importance_sampling_ratio/min": 0.3986743092536926, + "sampling/sampling_logp_difference/max": 0.9196105003356934, + "sampling/sampling_logp_difference/mean": 0.020105183124542236, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 162.0, + "completions/max_terminated_length": 162.0, + "completions/mean_length": 50.34375, + "completions/mean_terminated_length": 50.34375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3566172122955322, + "epoch": 0.3486725663716814, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050854542449691606, + "kl": 0.02495628222823143, + "learning_rate": 8.672566371681415e-07, + "loss": 0.0001, + "num_tokens": 3090718.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6565725803375244, + "sampling/importance_sampling_ratio/mean": 1.0001130104064941, + "sampling/importance_sampling_ratio/min": 0.6099073886871338, + "sampling/sampling_logp_difference/max": 0.5047507286071777, + "sampling/sampling_logp_difference/mean": 0.017076190561056137, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 88.65625, + "completions/mean_terminated_length": 88.65625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.6213746070861816, + "epoch": 0.3504424778761062, + "frac_reward_zero_std": 0.75, + "grad_norm": 12.629178819001702, + "kl": 0.003796565579250455, + "learning_rate": 8.716814159292035e-07, + "loss": 0.0921, + "num_tokens": 3106456.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.5601270198822021, + "sampling/importance_sampling_ratio/mean": 0.9996728897094727, + "sampling/importance_sampling_ratio/min": 0.6842789053916931, + "sampling/sampling_logp_difference/max": 0.4447672367095947, + "sampling/sampling_logp_difference/mean": 0.021474361419677734, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 84.3125, + "completions/mean_terminated_length": 84.3125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3975464701652527, + "epoch": 0.35221238938053095, + "frac_reward_zero_std": 0.75, + "grad_norm": 6.019198297739022, + "kl": 0.007026151288300753, + "learning_rate": 8.761061946902655e-07, + "loss": 0.2102, + "num_tokens": 3122780.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.674494743347168, + "sampling/importance_sampling_ratio/mean": 1.0001417398452759, + "sampling/importance_sampling_ratio/min": 0.3294057250022888, + "sampling/sampling_logp_difference/max": 1.1104650497436523, + "sampling/sampling_logp_difference/mean": 0.020129112526774406, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 179.0, + "completions/max_terminated_length": 179.0, + "completions/mean_length": 40.71875, + "completions/mean_terminated_length": 40.71875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.19297020137310028, + "epoch": 0.35398230088495575, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03618304098894548, + "kl": 0.001933026360347867, + "learning_rate": 8.805309734513274e-07, + "loss": 0.0, + "num_tokens": 3143066.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5666741132736206, + "sampling/importance_sampling_ratio/mean": 0.9981307983398438, + "sampling/importance_sampling_ratio/min": 0.40572628378868103, + "sampling/sampling_logp_difference/max": 0.9020766019821167, + "sampling/sampling_logp_difference/mean": 0.021881964057683945, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 13.6875, + "completions/mean_terminated_length": 13.6875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.06337769329547882, + "epoch": 0.35575221238938054, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3730626436437873, + "kl": 0.005040082149207592, + "learning_rate": 8.849557522123894e-07, + "loss": 0.0001, + "num_tokens": 3153878.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5257240533828735, + "sampling/importance_sampling_ratio/mean": 0.9977389574050903, + "sampling/importance_sampling_ratio/min": 0.4783879518508911, + "sampling/sampling_logp_difference/max": 0.7373332977294922, + "sampling/sampling_logp_difference/mean": 0.012821320444345474, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 20.921875, + "completions/mean_terminated_length": 20.921875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.16319364309310913, + "epoch": 0.35752212389380533, + "frac_reward_zero_std": 0.75, + "grad_norm": 29.22328448711984, + "kl": 0.04330842196941376, + "learning_rate": 8.893805309734513e-07, + "loss": -0.4998, + "num_tokens": 3168065.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.6684596538543701, + "sampling/importance_sampling_ratio/mean": 0.9991862177848816, + "sampling/importance_sampling_ratio/min": 0.6727373003959656, + "sampling/sampling_logp_difference/max": 0.5119008421897888, + "sampling/sampling_logp_difference/mean": 0.022439446300268173, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 189.0, + "completions/max_terminated_length": 189.0, + "completions/mean_length": 38.34375, + "completions/mean_terminated_length": 38.34375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.19670294225215912, + "epoch": 0.35929203539823007, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1867018254244633, + "kl": 0.03805701807141304, + "learning_rate": 8.938053097345132e-07, + "loss": 0.0002, + "num_tokens": 3180439.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3781383037567139, + "sampling/importance_sampling_ratio/mean": 1.00034761428833, + "sampling/importance_sampling_ratio/min": 0.6061520576477051, + "sampling/sampling_logp_difference/max": 0.500624418258667, + "sampling/sampling_logp_difference/mean": 0.016402604058384895, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 124.0, + "completions/max_terminated_length": 124.0, + "completions/mean_length": 16.984375, + "completions/mean_terminated_length": 16.984375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.08758561313152313, + "epoch": 0.36106194690265486, + "frac_reward_zero_std": 0.75, + "grad_norm": 67.96663604069774, + "kl": 0.03069724701344967, + "learning_rate": 8.982300884955752e-07, + "loss": -0.3958, + "num_tokens": 3194870.0, + "reward": -0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.944921612739563, + "sampling/importance_sampling_ratio/mean": 1.0006206035614014, + "sampling/importance_sampling_ratio/min": 0.6622530221939087, + "sampling/sampling_logp_difference/max": 0.6652216911315918, + "sampling/sampling_logp_difference/mean": 0.010742882266640663, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.328125, + "completions/mean_terminated_length": 15.328125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.06518861651420593, + "epoch": 0.36283185840707965, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09771435391875398, + "kl": 0.02997596189379692, + "learning_rate": 9.026548672566371e-07, + "loss": 0.0002, + "num_tokens": 3208795.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.0926145315170288, + "sampling/importance_sampling_ratio/mean": 0.999653160572052, + "sampling/importance_sampling_ratio/min": 0.6137433052062988, + "sampling/sampling_logp_difference/max": 0.4881784915924072, + "sampling/sampling_logp_difference/mean": 0.005968471057713032, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 58.0625, + "completions/mean_terminated_length": 58.0625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2882225811481476, + "epoch": 0.36460176991150445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.055160220710038725, + "kl": 0.003100259928032756, + "learning_rate": 9.07079646017699e-07, + "loss": 0.0, + "num_tokens": 3222911.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3295564651489258, + "sampling/importance_sampling_ratio/mean": 0.9999579191207886, + "sampling/importance_sampling_ratio/min": 0.6964197754859924, + "sampling/sampling_logp_difference/max": 0.36180269718170166, + "sampling/sampling_logp_difference/mean": 0.010741502977907658, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 68.4375, + "completions/mean_terminated_length": 68.4375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3214592933654785, + "epoch": 0.3663716814159292, + "frac_reward_zero_std": 0.75, + "grad_norm": 6.336265981679781, + "kl": 0.005980383604764938, + "learning_rate": 9.11504424778761e-07, + "loss": -0.0132, + "num_tokens": 3237387.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.3671600818634033, + "sampling/importance_sampling_ratio/mean": 1.0011777877807617, + "sampling/importance_sampling_ratio/min": 0.6046066284179688, + "sampling/sampling_logp_difference/max": 0.5031771659851074, + "sampling/sampling_logp_difference/mean": 0.015451844781637192, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 108.15625, + "completions/mean_terminated_length": 108.15625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.35494041442871094, + "epoch": 0.368141592920354, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.6888265767189843, + "kl": 0.002561464672908187, + "learning_rate": 9.159292035398229e-07, + "loss": 0.0946, + "num_tokens": 3254645.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9973541498184204, + "sampling/importance_sampling_ratio/min": 0.6656482815742493, + "sampling/sampling_logp_difference/max": 0.7017228603363037, + "sampling/sampling_logp_difference/mean": 0.015536784194409847, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 106.453125, + "completions/mean_terminated_length": 106.453125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.5013563632965088, + "epoch": 0.36991150442477877, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05157514700743391, + "kl": 0.0033573838882148266, + "learning_rate": 9.203539823008849e-07, + "loss": 0.0, + "num_tokens": 3270530.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6783130168914795, + "sampling/importance_sampling_ratio/mean": 0.9998264312744141, + "sampling/importance_sampling_ratio/min": 0.5489054918289185, + "sampling/sampling_logp_difference/max": 0.5998289585113525, + "sampling/sampling_logp_difference/mean": 0.018359629437327385, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 189.0, + "completions/max_terminated_length": 189.0, + "completions/mean_length": 44.453125, + "completions/mean_terminated_length": 44.453125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1675224006175995, + "epoch": 0.37168141592920356, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03374225208507583, + "kl": 0.001434197649359703, + "learning_rate": 9.247787610619469e-07, + "loss": 0.0, + "num_tokens": 3285727.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7428385019302368, + "sampling/importance_sampling_ratio/mean": 0.9988299608230591, + "sampling/importance_sampling_ratio/min": 0.5252789855003357, + "sampling/sampling_logp_difference/max": 0.6438257694244385, + "sampling/sampling_logp_difference/mean": 0.01490543782711029, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 165.0, + "completions/max_terminated_length": 165.0, + "completions/mean_length": 40.21875, + "completions/mean_terminated_length": 40.21875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.18517419695854187, + "epoch": 0.3734513274336283, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050757562647429574, + "kl": 0.002568627241998911, + "learning_rate": 9.292035398230088e-07, + "loss": 0.0, + "num_tokens": 3300285.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.381851315498352, + "sampling/importance_sampling_ratio/mean": 0.9986093044281006, + "sampling/importance_sampling_ratio/min": 0.609358549118042, + "sampling/sampling_logp_difference/max": 0.4953484535217285, + "sampling/sampling_logp_difference/mean": 0.012276686728000641, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 71.78125, + "completions/mean_terminated_length": 71.78125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2278621345758438, + "epoch": 0.3752212389380531, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03204199727428532, + "kl": 0.003042126540094614, + "learning_rate": 9.336283185840708e-07, + "loss": 0.0, + "num_tokens": 3316879.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.281581997871399, + "sampling/importance_sampling_ratio/mean": 0.9995130896568298, + "sampling/importance_sampling_ratio/min": 0.6369761228561401, + "sampling/sampling_logp_difference/max": 0.4510231018066406, + "sampling/sampling_logp_difference/mean": 0.014066744595766068, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 38.34375, + "completions/mean_terminated_length": 38.34375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.17596986889839172, + "epoch": 0.3769911504424779, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09589961445803016, + "kl": 0.004617048427462578, + "learning_rate": 9.380530973451328e-07, + "loss": 0.0, + "num_tokens": 3331989.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.341760516166687, + "sampling/importance_sampling_ratio/mean": 0.9994263648986816, + "sampling/importance_sampling_ratio/min": 0.7102124094963074, + "sampling/sampling_logp_difference/max": 0.342191219329834, + "sampling/sampling_logp_difference/mean": 0.010547621175646782, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 102.0, + "completions/mean_terminated_length": 102.0, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "entropy": 0.46252167224884033, + "epoch": 0.3787610619469027, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03145555998263438, + "kl": 0.003119957633316517, + "learning_rate": 9.424778761061947e-07, + "loss": 0.0, + "num_tokens": 3348373.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6221009492874146, + "sampling/importance_sampling_ratio/mean": 0.9998818039894104, + "sampling/importance_sampling_ratio/min": 0.20803289115428925, + "sampling/sampling_logp_difference/max": 1.570059061050415, + "sampling/sampling_logp_difference/mean": 0.01729355938732624, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 62.59375, + "completions/mean_terminated_length": 62.59375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2621881365776062, + "epoch": 0.3805309734513274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.056861081675358534, + "kl": 0.005555910989642143, + "learning_rate": 9.469026548672566e-07, + "loss": 0.0, + "num_tokens": 3362795.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7976553440093994, + "sampling/importance_sampling_ratio/mean": 1.002416729927063, + "sampling/importance_sampling_ratio/min": 0.5128350853919983, + "sampling/sampling_logp_difference/max": 0.667801022529602, + "sampling/sampling_logp_difference/mean": 0.01444577518850565, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 204.0, + "completions/max_terminated_length": 204.0, + "completions/mean_length": 59.53125, + "completions/mean_terminated_length": 59.53125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3704109191894531, + "epoch": 0.3823008849557522, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1242170837294012, + "kl": 0.005496586672961712, + "learning_rate": 9.513274336283185e-07, + "loss": 0.0001, + "num_tokens": 3378029.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5919092893600464, + "sampling/importance_sampling_ratio/mean": 1.0035631656646729, + "sampling/importance_sampling_ratio/min": 0.7009298801422119, + "sampling/sampling_logp_difference/max": 0.4649341106414795, + "sampling/sampling_logp_difference/mean": 0.020102720707654953, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 98.078125, + "completions/mean_terminated_length": 98.078125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3987380266189575, + "epoch": 0.384070796460177, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.045585197120377674, + "kl": 0.003970324993133545, + "learning_rate": 9.557522123893805e-07, + "loss": 0.0, + "num_tokens": 3394690.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998895525932312, + "sampling/importance_sampling_ratio/min": 0.5717587471008301, + "sampling/sampling_logp_difference/max": 0.921055793762207, + "sampling/sampling_logp_difference/mean": 0.01475785207003355, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 34.296875, + "completions/mean_terminated_length": 34.296875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.19475924968719482, + "epoch": 0.3858407079646018, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11738437086607027, + "kl": 0.010515974834561348, + "learning_rate": 9.601769911504426e-07, + "loss": 0.0001, + "num_tokens": 3407157.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5994064807891846, + "sampling/importance_sampling_ratio/mean": 0.9995347261428833, + "sampling/importance_sampling_ratio/min": 0.6554685831069946, + "sampling/sampling_logp_difference/max": 0.469632625579834, + "sampling/sampling_logp_difference/mean": 0.011333119124174118, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 180.0, + "completions/max_terminated_length": 180.0, + "completions/mean_length": 54.765625, + "completions/mean_terminated_length": 54.765625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.33620890974998474, + "epoch": 0.38761061946902653, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10464694120487733, + "kl": 0.004381977953016758, + "learning_rate": 9.646017699115042e-07, + "loss": 0.0001, + "num_tokens": 3421606.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5475236177444458, + "sampling/importance_sampling_ratio/mean": 1.0017409324645996, + "sampling/importance_sampling_ratio/min": 0.6921407580375671, + "sampling/sampling_logp_difference/max": 0.43665599822998047, + "sampling/sampling_logp_difference/mean": 0.022906150668859482, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 52.09375, + "completions/mean_terminated_length": 52.09375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.17895236611366272, + "epoch": 0.3893805309734513, + "frac_reward_zero_std": 0.75, + "grad_norm": 8.487661476567311, + "kl": 0.002079170662909746, + "learning_rate": 9.690265486725663e-07, + "loss": 0.184, + "num_tokens": 3435900.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4371217489242554, + "sampling/importance_sampling_ratio/mean": 0.9985246658325195, + "sampling/importance_sampling_ratio/min": 0.5759603381156921, + "sampling/sampling_logp_difference/max": 0.5517165660858154, + "sampling/sampling_logp_difference/mean": 0.017181089147925377, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 161.0, + "completions/max_terminated_length": 161.0, + "completions/mean_length": 41.65625, + "completions/mean_terminated_length": 41.65625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.19378438591957092, + "epoch": 0.3911504424778761, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057238480728486814, + "kl": 0.0032039829529821873, + "learning_rate": 9.734513274336282e-07, + "loss": 0.0, + "num_tokens": 3450294.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4914631843566895, + "sampling/importance_sampling_ratio/mean": 0.9990211129188538, + "sampling/importance_sampling_ratio/min": 0.6217343211174011, + "sampling/sampling_logp_difference/max": 0.47524237632751465, + "sampling/sampling_logp_difference/mean": 0.01241573877632618, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 62.703125, + "completions/mean_terminated_length": 62.703125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3654828667640686, + "epoch": 0.3929203539823009, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035383096163377675, + "kl": 0.0059638191014528275, + "learning_rate": 9.778761061946902e-07, + "loss": 0.0, + "num_tokens": 3465411.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5720010995864868, + "sampling/importance_sampling_ratio/mean": 0.9991636276245117, + "sampling/importance_sampling_ratio/min": 0.6438614726066589, + "sampling/sampling_logp_difference/max": 0.4523494243621826, + "sampling/sampling_logp_difference/mean": 0.01645643450319767, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 108.0, + "completions/max_terminated_length": 108.0, + "completions/mean_length": 32.265625, + "completions/mean_terminated_length": 32.265625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.12503893673419952, + "epoch": 0.39469026548672564, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029294659331895984, + "kl": 0.0009783116402104497, + "learning_rate": 9.82300884955752e-07, + "loss": 0.0, + "num_tokens": 3479988.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2597873210906982, + "sampling/importance_sampling_ratio/mean": 1.0013203620910645, + "sampling/importance_sampling_ratio/min": 0.7212660312652588, + "sampling/sampling_logp_difference/max": 0.32674723863601685, + "sampling/sampling_logp_difference/mean": 0.009833626449108124, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 57.65625, + "completions/mean_terminated_length": 57.65625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.242695152759552, + "epoch": 0.39646017699115044, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02587311644919685, + "kl": 0.0017175667453557253, + "learning_rate": 9.867256637168142e-07, + "loss": 0.0, + "num_tokens": 3495246.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.328568935394287, + "sampling/importance_sampling_ratio/mean": 1.0001909732818604, + "sampling/importance_sampling_ratio/min": 0.7182667851448059, + "sampling/sampling_logp_difference/max": 0.3309142589569092, + "sampling/sampling_logp_difference/mean": 0.008995315060019493, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 96.734375, + "completions/mean_terminated_length": 96.734375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.39507365226745605, + "epoch": 0.39823008849557523, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.7512447226554966, + "kl": 0.004321282729506493, + "learning_rate": 9.91150442477876e-07, + "loss": -0.0746, + "num_tokens": 3511581.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.593572974205017, + "sampling/importance_sampling_ratio/mean": 1.0000005960464478, + "sampling/importance_sampling_ratio/min": 0.6780757904052734, + "sampling/sampling_logp_difference/max": 0.46597862243652344, + "sampling/sampling_logp_difference/mean": 0.01681409776210785, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 143.0, + "completions/max_terminated_length": 143.0, + "completions/mean_length": 35.171875, + "completions/mean_terminated_length": 35.171875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.22788852453231812, + "epoch": 0.4, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05385673391397239, + "kl": 0.004239211790263653, + "learning_rate": 9.95575221238938e-07, + "loss": 0.0, + "num_tokens": 3523800.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0043925046920776, + "sampling/importance_sampling_ratio/min": 0.7580122351646423, + "sampling/sampling_logp_difference/max": 0.705294132232666, + "sampling/sampling_logp_difference/mean": 0.016978923231363297, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 43.015625, + "completions/mean_terminated_length": 43.015625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1527884155511856, + "epoch": 0.40176991150442476, + "frac_reward_zero_std": 0.75, + "grad_norm": 21.641425147437015, + "kl": 0.004469120409339666, + "learning_rate": 1e-06, + "loss": -0.1624, + "num_tokens": 3537321.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0010275840759277, + "sampling/importance_sampling_ratio/min": 0.4817606806755066, + "sampling/sampling_logp_difference/max": 0.7303078174591064, + "sampling/sampling_logp_difference/mean": 0.017776301130652428, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 30.71875, + "completions/mean_terminated_length": 30.71875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.19440095126628876, + "epoch": 0.40353982300884955, + "frac_reward_zero_std": 0.75, + "grad_norm": 36.71682744138212, + "kl": 0.010755006223917007, + "learning_rate": 9.999994035998135e-07, + "loss": 0.3461, + "num_tokens": 3550775.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.5073049068450928, + "sampling/importance_sampling_ratio/mean": 0.9993104934692383, + "sampling/importance_sampling_ratio/min": 0.6966415047645569, + "sampling/sampling_logp_difference/max": 0.41032326221466064, + "sampling/sampling_logp_difference/mean": 0.01622116006910801, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 107.890625, + "completions/mean_terminated_length": 107.890625, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.4642508029937744, + "epoch": 0.40530973451327434, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054119781405429675, + "kl": 0.0034284957218915224, + "learning_rate": 9.99997614400677e-07, + "loss": 0.0, + "num_tokens": 3567168.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001689195632935, + "sampling/importance_sampling_ratio/min": 0.6158477663993835, + "sampling/sampling_logp_difference/max": 0.7310385704040527, + "sampling/sampling_logp_difference/mean": 0.01859310269355774, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 52.40625, + "completions/mean_terminated_length": 52.40625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2456488460302353, + "epoch": 0.40707964601769914, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.062557665823258, + "kl": 0.002922304905951023, + "learning_rate": 9.999946324068587e-07, + "loss": 0.0, + "num_tokens": 3580618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.969736933708191, + "sampling/importance_sampling_ratio/mean": 0.9975132942199707, + "sampling/importance_sampling_ratio/min": 0.6378993988037109, + "sampling/sampling_logp_difference/max": 0.677899956703186, + "sampling/sampling_logp_difference/mean": 0.019860180094838142, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 39.734375, + "completions/mean_terminated_length": 39.734375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.18301242589950562, + "epoch": 0.4088495575221239, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04358872525227839, + "kl": 0.0017693135887384415, + "learning_rate": 9.999904576254724e-07, + "loss": 0.0, + "num_tokens": 3595081.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3643784523010254, + "sampling/importance_sampling_ratio/mean": 1.0005431175231934, + "sampling/importance_sampling_ratio/min": 0.6537138223648071, + "sampling/sampling_logp_difference/max": 0.4250856637954712, + "sampling/sampling_logp_difference/mean": 0.013198630884289742, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 207.0, + "completions/max_terminated_length": 207.0, + "completions/mean_length": 73.890625, + "completions/mean_terminated_length": 73.890625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2257222831249237, + "epoch": 0.41061946902654867, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08980511226821004, + "kl": 0.001283166348002851, + "learning_rate": 9.999850900664773e-07, + "loss": 0.0, + "num_tokens": 3611618.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2656625509262085, + "sampling/importance_sampling_ratio/mean": 0.9989427924156189, + "sampling/importance_sampling_ratio/min": 0.512908935546875, + "sampling/sampling_logp_difference/max": 0.6676568984985352, + "sampling/sampling_logp_difference/mean": 0.012095805257558823, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 104.0, + "completions/max_terminated_length": 104.0, + "completions/mean_length": 32.453125, + "completions/mean_terminated_length": 32.453125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.294692724943161, + "epoch": 0.41238938053097346, + "frac_reward_zero_std": 0.75, + "grad_norm": 21.452015476936687, + "kl": 0.023687120527029037, + "learning_rate": 9.999785297426788e-07, + "loss": -0.1992, + "num_tokens": 3623407.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.6505494117736816, + "sampling/importance_sampling_ratio/mean": 0.9979889988899231, + "sampling/importance_sampling_ratio/min": 0.6730009913444519, + "sampling/sampling_logp_difference/max": 0.5011081695556641, + "sampling/sampling_logp_difference/mean": 0.01914994977414608, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 37.53125, + "completions/mean_terminated_length": 37.53125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.23396769165992737, + "epoch": 0.41415929203539825, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04800585746953848, + "kl": 0.002836022526025772, + "learning_rate": 9.999707766697265e-07, + "loss": 0.0, + "num_tokens": 3637537.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4313642978668213, + "sampling/importance_sampling_ratio/mean": 1.0034034252166748, + "sampling/importance_sampling_ratio/min": 0.6590465307235718, + "sampling/sampling_logp_difference/max": 0.4169611930847168, + "sampling/sampling_logp_difference/mean": 0.019834768027067184, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 42.234375, + "completions/mean_terminated_length": 42.234375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.15960168838500977, + "epoch": 0.415929203539823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026987595816708707, + "kl": 0.0025480028707534075, + "learning_rate": 9.999618308661168e-07, + "loss": 0.0, + "num_tokens": 3650912.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5073378086090088, + "sampling/importance_sampling_ratio/mean": 0.9996541738510132, + "sampling/importance_sampling_ratio/min": 0.6348318457603455, + "sampling/sampling_logp_difference/max": 0.454395055770874, + "sampling/sampling_logp_difference/mean": 0.01629040017724037, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 90.703125, + "completions/mean_terminated_length": 90.703125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.38684922456741333, + "epoch": 0.4176991150442478, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.010045925952474, + "kl": 0.0022131020668894053, + "learning_rate": 9.999516923531906e-07, + "loss": 0.0773, + "num_tokens": 3668093.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.644107699394226, + "sampling/importance_sampling_ratio/mean": 1.0001626014709473, + "sampling/importance_sampling_ratio/min": 0.6234187483787537, + "sampling/sampling_logp_difference/max": 0.4971977472305298, + "sampling/sampling_logp_difference/mean": 0.017248239368200302, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 86.953125, + "completions/mean_terminated_length": 86.953125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.476330429315567, + "epoch": 0.4194690265486726, + "frac_reward_zero_std": 0.75, + "grad_norm": 7.72996716584878, + "kl": 0.0019192376639693975, + "learning_rate": 9.99940361155134e-07, + "loss": 0.0898, + "num_tokens": 3682634.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4441698789596558, + "sampling/importance_sampling_ratio/mean": 1.0006109476089478, + "sampling/importance_sampling_ratio/min": 0.6419512033462524, + "sampling/sampling_logp_difference/max": 0.44324302673339844, + "sampling/sampling_logp_difference/mean": 0.016820615157485008, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 79.15625, + "completions/mean_terminated_length": 79.15625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.27041104435920715, + "epoch": 0.42123893805309737, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019962387727511497, + "kl": 0.0013378332369029522, + "learning_rate": 9.99927837298979e-07, + "loss": 0.0, + "num_tokens": 3698772.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7013784646987915, + "sampling/importance_sampling_ratio/mean": 0.9985455274581909, + "sampling/importance_sampling_ratio/min": 0.6555176377296448, + "sampling/sampling_logp_difference/max": 0.5314388275146484, + "sampling/sampling_logp_difference/mean": 0.012149603106081486, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 54.84375, + "completions/mean_terminated_length": 54.84375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.20714148879051208, + "epoch": 0.4230088495575221, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04988395372967355, + "kl": 0.003713001497089863, + "learning_rate": 9.999141208146027e-07, + "loss": 0.0, + "num_tokens": 3712506.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3076030015945435, + "sampling/importance_sampling_ratio/mean": 0.9982582330703735, + "sampling/importance_sampling_ratio/min": 0.5380278825759888, + "sampling/sampling_logp_difference/max": 0.619844913482666, + "sampling/sampling_logp_difference/mean": 0.014127662405371666, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 154.0, + "completions/max_terminated_length": 154.0, + "completions/mean_length": 37.328125, + "completions/mean_terminated_length": 37.328125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1810479760169983, + "epoch": 0.4247787610619469, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14025310662941104, + "kl": 0.006155351176857948, + "learning_rate": 9.99899211734727e-07, + "loss": 0.0, + "num_tokens": 3727135.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4445059299468994, + "sampling/importance_sampling_ratio/mean": 1.0028483867645264, + "sampling/importance_sampling_ratio/min": 0.5484683513641357, + "sampling/sampling_logp_difference/max": 0.60062575340271, + "sampling/sampling_logp_difference/mean": 0.020090894773602486, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 13.234375, + "completions/mean_terminated_length": 13.234375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.06778696179389954, + "epoch": 0.4265486725663717, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.32877278340164795, + "kl": 0.003949641715735197, + "learning_rate": 9.998831100949186e-07, + "loss": 0.0, + "num_tokens": 3738222.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4646263122558594, + "sampling/importance_sampling_ratio/mean": 0.9988141059875488, + "sampling/importance_sampling_ratio/min": 0.781985878944397, + "sampling/sampling_logp_difference/max": 0.38160014152526855, + "sampling/sampling_logp_difference/mean": 0.009022011421620846, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 148.0, + "completions/max_terminated_length": 148.0, + "completions/mean_length": 36.21875, + "completions/mean_terminated_length": 36.21875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.14866621792316437, + "epoch": 0.4283185840707965, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030079433302675318, + "kl": 0.0009830165654420853, + "learning_rate": 9.998658159335901e-07, + "loss": 0.0, + "num_tokens": 3754716.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.250900387763977, + "sampling/importance_sampling_ratio/mean": 0.9993738532066345, + "sampling/importance_sampling_ratio/min": 0.7676602005958557, + "sampling/sampling_logp_difference/max": 0.2644081115722656, + "sampling/sampling_logp_difference/mean": 0.009230071678757668, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 85.328125, + "completions/mean_terminated_length": 85.328125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.38343095779418945, + "epoch": 0.4300884955752212, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.0171205491946504, + "kl": 0.0024617710150778294, + "learning_rate": 9.998473292919985e-07, + "loss": -0.0413, + "num_tokens": 3771553.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6167631149291992, + "sampling/importance_sampling_ratio/mean": 1.0004315376281738, + "sampling/importance_sampling_ratio/min": 0.3154713213443756, + "sampling/sampling_logp_difference/max": 1.1536874771118164, + "sampling/sampling_logp_difference/mean": 0.01735665649175644, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 37.078125, + "completions/mean_terminated_length": 37.078125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.18088294565677643, + "epoch": 0.431858407079646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12771523381725922, + "kl": 0.006010980345308781, + "learning_rate": 9.998276502142454e-07, + "loss": 0.0, + "num_tokens": 3788790.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2638511657714844, + "sampling/importance_sampling_ratio/mean": 0.999843180179596, + "sampling/importance_sampling_ratio/min": 0.6756933331489563, + "sampling/sampling_logp_difference/max": 0.3920159339904785, + "sampling/sampling_logp_difference/mean": 0.0102929025888443, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 176.0, + "completions/max_terminated_length": 176.0, + "completions/mean_length": 74.46875, + "completions/mean_terminated_length": 74.46875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3529541492462158, + "epoch": 0.4336283185840708, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03759443190383923, + "kl": 0.003588553052395582, + "learning_rate": 9.99806778747277e-07, + "loss": 0.0, + "num_tokens": 3802644.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.421198844909668, + "sampling/importance_sampling_ratio/mean": 0.9967384934425354, + "sampling/importance_sampling_ratio/min": 0.5428851842880249, + "sampling/sampling_logp_difference/max": 0.6108574867248535, + "sampling/sampling_logp_difference/mean": 0.024649962782859802, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 67.40625, + "completions/mean_terminated_length": 67.40625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3470374047756195, + "epoch": 0.4353982300884956, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051173374001041765, + "kl": 0.004685951862484217, + "learning_rate": 9.997847149408844e-07, + "loss": 0.0, + "num_tokens": 3817582.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.598049521446228, + "sampling/importance_sampling_ratio/mean": 0.999319314956665, + "sampling/importance_sampling_ratio/min": 0.5836055874824524, + "sampling/sampling_logp_difference/max": 0.5385298728942871, + "sampling/sampling_logp_difference/mean": 0.015922270715236664, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 62.484375, + "completions/mean_terminated_length": 62.484375, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.2849852740764618, + "epoch": 0.43716814159292033, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.050583000758105276, + "kl": 0.007301662117242813, + "learning_rate": 9.997614588477033e-07, + "loss": 0.0, + "num_tokens": 3831645.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3781495094299316, + "sampling/importance_sampling_ratio/mean": 1.0004756450653076, + "sampling/importance_sampling_ratio/min": 0.46459150314331055, + "sampling/sampling_logp_difference/max": 0.766596794128418, + "sampling/sampling_logp_difference/mean": 0.014835800975561142, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 72.390625, + "completions/mean_terminated_length": 72.390625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.25849053263664246, + "epoch": 0.4389380530973451, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.7871676009843105, + "kl": 0.0029903594404459, + "learning_rate": 9.997370105232132e-07, + "loss": -0.1178, + "num_tokens": 3847062.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6574006080627441, + "sampling/importance_sampling_ratio/mean": 1.0001531839370728, + "sampling/importance_sampling_ratio/min": 0.5268588662147522, + "sampling/sampling_logp_difference/max": 0.6408225297927856, + "sampling/sampling_logp_difference/mean": 0.012619627639651299, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 43.890625, + "completions/mean_terminated_length": 43.890625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.16508735716342926, + "epoch": 0.4407079646017699, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06435464819757303, + "kl": 0.003507372457534075, + "learning_rate": 9.99711370025738e-07, + "loss": 0.0, + "num_tokens": 3861119.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.8435330390930176, + "sampling/importance_sampling_ratio/mean": 1.0004816055297852, + "sampling/importance_sampling_ratio/min": 0.5295119285583496, + "sampling/sampling_logp_difference/max": 0.6357995271682739, + "sampling/sampling_logp_difference/mean": 0.0132959159091115, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 125.0, + "completions/max_terminated_length": 125.0, + "completions/mean_length": 36.046875, + "completions/mean_terminated_length": 36.046875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.15764935314655304, + "epoch": 0.4424778761061947, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05396769395261247, + "kl": 0.00200331280939281, + "learning_rate": 9.99684537416446e-07, + "loss": 0.0, + "num_tokens": 3879634.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4567588567733765, + "sampling/importance_sampling_ratio/mean": 1.0007212162017822, + "sampling/importance_sampling_ratio/min": 0.5999128818511963, + "sampling/sampling_logp_difference/max": 0.5109708309173584, + "sampling/sampling_logp_difference/mean": 0.008045634254813194, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 95.9375, + "completions/mean_terminated_length": 95.9375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3375559151172638, + "epoch": 0.44424778761061945, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022391294297959205, + "kl": 0.0018936173291876912, + "learning_rate": 9.996565127593489e-07, + "loss": 0.0, + "num_tokens": 3899038.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6154776811599731, + "sampling/importance_sampling_ratio/mean": 0.9995601773262024, + "sampling/importance_sampling_ratio/min": 0.6189666986465454, + "sampling/sampling_logp_difference/max": 0.4797039031982422, + "sampling/sampling_logp_difference/mean": 0.014130129478871822, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 170.0, + "completions/max_terminated_length": 170.0, + "completions/mean_length": 82.203125, + "completions/mean_terminated_length": 82.203125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.27879437804222107, + "epoch": 0.44601769911504424, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05656063312383896, + "kl": 0.0035494635812938213, + "learning_rate": 9.996272961213022e-07, + "loss": 0.0, + "num_tokens": 3915467.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5963265895843506, + "sampling/importance_sampling_ratio/mean": 0.9994061589241028, + "sampling/importance_sampling_ratio/min": 0.4906884729862213, + "sampling/sampling_logp_difference/max": 0.7119457721710205, + "sampling/sampling_logp_difference/mean": 0.01221383735537529, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 77.484375, + "completions/mean_terminated_length": 77.484375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.27256277203559875, + "epoch": 0.44778761061946903, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.370429715106762, + "kl": 0.004128012806177139, + "learning_rate": 9.995968875720051e-07, + "loss": 0.0854, + "num_tokens": 3931418.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5513712167739868, + "sampling/importance_sampling_ratio/mean": 0.9996453523635864, + "sampling/importance_sampling_ratio/min": 0.6445860862731934, + "sampling/sampling_logp_difference/max": 0.4391469955444336, + "sampling/sampling_logp_difference/mean": 0.013168378733098507, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 104.703125, + "completions/mean_terminated_length": 104.703125, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "entropy": 0.39987921714782715, + "epoch": 0.4495575221238938, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02226855702470426, + "kl": 0.002329135313630104, + "learning_rate": 9.995652871840006e-07, + "loss": 0.0, + "num_tokens": 3948695.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6002646684646606, + "sampling/importance_sampling_ratio/mean": 1.0006269216537476, + "sampling/importance_sampling_ratio/min": 0.6878425478935242, + "sampling/sampling_logp_difference/max": 0.4701690673828125, + "sampling/sampling_logp_difference/mean": 0.014145614579319954, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 42.234375, + "completions/mean_terminated_length": 42.234375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2794543504714966, + "epoch": 0.45132743362831856, + "frac_reward_zero_std": 0.75, + "grad_norm": 23.872798817189192, + "kl": 0.0077172527089715, + "learning_rate": 9.995324950326745e-07, + "loss": 0.2481, + "num_tokens": 3961158.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.440090298652649, + "sampling/importance_sampling_ratio/mean": 0.9973623752593994, + "sampling/importance_sampling_ratio/min": 0.5127883553504944, + "sampling/sampling_logp_difference/max": 0.6678920984268188, + "sampling/sampling_logp_difference/mean": 0.016313213855028152, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 210.0, + "completions/max_terminated_length": 210.0, + "completions/mean_length": 76.859375, + "completions/mean_terminated_length": 76.859375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.366608589887619, + "epoch": 0.45309734513274336, + "frac_reward_zero_std": 0.75, + "grad_norm": 6.752881151929045, + "kl": 0.0026165214367210865, + "learning_rate": 9.994985111962555e-07, + "loss": -0.0774, + "num_tokens": 3976269.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.617125153541565, + "sampling/importance_sampling_ratio/mean": 0.9991705417633057, + "sampling/importance_sampling_ratio/min": 0.5433637499809265, + "sampling/sampling_logp_difference/max": 0.6099762916564941, + "sampling/sampling_logp_difference/mean": 0.018060646951198578, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 39.828125, + "completions/mean_terminated_length": 39.828125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.13335466384887695, + "epoch": 0.45486725663716815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10586924061032081, + "kl": 0.03460051491856575, + "learning_rate": 9.994633357558158e-07, + "loss": 0.0001, + "num_tokens": 3988818.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.2890098094940186, + "sampling/importance_sampling_ratio/mean": 1.0003187656402588, + "sampling/importance_sampling_ratio/min": 0.530799150466919, + "sampling/sampling_logp_difference/max": 0.6333715915679932, + "sampling/sampling_logp_difference/mean": 0.012708578258752823, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 82.9375, + "completions/mean_terminated_length": 82.9375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4873947501182556, + "epoch": 0.45663716814159294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046335198779911754, + "kl": 0.004279019311070442, + "learning_rate": 9.994269687952698e-07, + "loss": 0.0, + "num_tokens": 4005070.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5861549377441406, + "sampling/importance_sampling_ratio/mean": 0.9983864426612854, + "sampling/importance_sampling_ratio/min": 0.583560585975647, + "sampling/sampling_logp_difference/max": 0.5386070013046265, + "sampling/sampling_logp_difference/mean": 0.019403012469410896, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 55.25, + "completions/mean_terminated_length": 55.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.28078144788742065, + "epoch": 0.4584070796460177, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051731198908428394, + "kl": 0.00406224001199007, + "learning_rate": 9.993894104013746e-07, + "loss": 0.0, + "num_tokens": 4031454.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4271976947784424, + "sampling/importance_sampling_ratio/mean": 0.9989052414894104, + "sampling/importance_sampling_ratio/min": 0.6807091236114502, + "sampling/sampling_logp_difference/max": 0.38462018966674805, + "sampling/sampling_logp_difference/mean": 0.012434137985110283, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 136.0, + "completions/max_terminated_length": 136.0, + "completions/mean_length": 38.859375, + "completions/mean_terminated_length": 38.859375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.17825648188591003, + "epoch": 0.46017699115044247, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09457567520972249, + "kl": 0.004964154213666916, + "learning_rate": 9.993506606637296e-07, + "loss": 0.0001, + "num_tokens": 4045093.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.430456519126892, + "sampling/importance_sampling_ratio/mean": 0.9988031387329102, + "sampling/importance_sampling_ratio/min": 0.662390947341919, + "sampling/sampling_logp_difference/max": 0.4118993282318115, + "sampling/sampling_logp_difference/mean": 0.012801151722669601, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 63.53125, + "completions/mean_terminated_length": 63.53125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.24471867084503174, + "epoch": 0.46194690265486726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07442819078368382, + "kl": 0.014636065810918808, + "learning_rate": 9.993107196747758e-07, + "loss": 0.0001, + "num_tokens": 4059527.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4217251539230347, + "sampling/importance_sampling_ratio/mean": 0.9983421564102173, + "sampling/importance_sampling_ratio/min": 0.6172206997871399, + "sampling/sampling_logp_difference/max": 0.4825286865234375, + "sampling/sampling_logp_difference/mean": 0.014853391796350479, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 76.0625, + "completions/mean_terminated_length": 76.0625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2663308382034302, + "epoch": 0.46371681415929206, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03699252494847363, + "kl": 0.002792743733152747, + "learning_rate": 9.99269587529797e-07, + "loss": 0.0, + "num_tokens": 4076491.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4453238248825073, + "sampling/importance_sampling_ratio/mean": 0.9987292289733887, + "sampling/importance_sampling_ratio/min": 0.31267639994621277, + "sampling/sampling_logp_difference/max": 1.1625864505767822, + "sampling/sampling_logp_difference/mean": 0.010977178812026978, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 25.109375, + "completions/mean_terminated_length": 25.109375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.22926849126815796, + "epoch": 0.4654867256637168, + "frac_reward_zero_std": 0.75, + "grad_norm": 31.72844486467987, + "kl": 0.02525678463280201, + "learning_rate": 9.99227264326918e-07, + "loss": 0.4846, + "num_tokens": 4090498.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.595948576927185, + "sampling/importance_sampling_ratio/mean": 0.9998789429664612, + "sampling/importance_sampling_ratio/min": 0.6622171401977539, + "sampling/sampling_logp_difference/max": 0.46746826171875, + "sampling/sampling_logp_difference/mean": 0.016610635444521904, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 15.875, + "completions/mean_terminated_length": 15.875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.0870765745639801, + "epoch": 0.4672566371681416, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24809152797829495, + "kl": 0.005051221698522568, + "learning_rate": 9.991837501671048e-07, + "loss": 0.0, + "num_tokens": 4103130.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4061976671218872, + "sampling/importance_sampling_ratio/mean": 1.0006778240203857, + "sampling/importance_sampling_ratio/min": 0.730045735836029, + "sampling/sampling_logp_difference/max": 0.3408893346786499, + "sampling/sampling_logp_difference/mean": 0.02010222151875496, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 189.0, + "completions/max_terminated_length": 189.0, + "completions/mean_length": 81.34375, + "completions/mean_terminated_length": 81.34375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3357410430908203, + "epoch": 0.4690265486725664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02930875122187649, + "kl": 0.0031524254009127617, + "learning_rate": 9.991390451541648e-07, + "loss": 0.0, + "num_tokens": 4118880.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4591114521026611, + "sampling/importance_sampling_ratio/mean": 0.9978489279747009, + "sampling/importance_sampling_ratio/min": 0.5481137037277222, + "sampling/sampling_logp_difference/max": 0.6012725830078125, + "sampling/sampling_logp_difference/mean": 0.01842043176293373, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 76.6875, + "completions/mean_terminated_length": 76.6875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.39841485023498535, + "epoch": 0.47079646017699117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09000805900037295, + "kl": 0.09546725451946259, + "learning_rate": 9.990931493947465e-07, + "loss": 0.0002, + "num_tokens": 4133548.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.588159203529358, + "sampling/importance_sampling_ratio/mean": 0.9959626197814941, + "sampling/importance_sampling_ratio/min": 0.4857686758041382, + "sampling/sampling_logp_difference/max": 0.7220227718353271, + "sampling/sampling_logp_difference/mean": 0.0178472138941288, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 245.0, + "completions/max_terminated_length": 245.0, + "completions/mean_length": 47.296875, + "completions/mean_terminated_length": 47.296875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.15155848860740662, + "epoch": 0.4725663716814159, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025041931999307744, + "kl": 0.006949643604457378, + "learning_rate": 9.990460629983388e-07, + "loss": 0.0, + "num_tokens": 4147439.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.2860136032104492, + "sampling/importance_sampling_ratio/mean": 0.9995124936103821, + "sampling/importance_sampling_ratio/min": 0.6633288264274597, + "sampling/sampling_logp_difference/max": 0.4104844331741333, + "sampling/sampling_logp_difference/mean": 0.006444099824875593, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.25, + "completions/mean_terminated_length": 15.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.07502971589565277, + "epoch": 0.4743362831858407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06736313727533422, + "kl": 0.0005349120474420488, + "learning_rate": 9.98997786077271e-07, + "loss": 0.0, + "num_tokens": 4161759.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3041719198226929, + "sampling/importance_sampling_ratio/mean": 1.00235915184021, + "sampling/importance_sampling_ratio/min": 0.8297764658927917, + "sampling/sampling_logp_difference/max": 0.26556825637817383, + "sampling/sampling_logp_difference/mean": 0.0092760119587183, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 61.265625, + "completions/mean_terminated_length": 61.265625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.331500768661499, + "epoch": 0.4761061946902655, + "frac_reward_zero_std": 0.25, + "grad_norm": 25.38060991543653, + "kl": 0.06742088496685028, + "learning_rate": 9.989483187467125e-07, + "loss": -0.0358, + "num_tokens": 4178736.0, + "reward": 0.5, + "reward_std": 0.5501632690429688, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.854530930519104, + "sampling/importance_sampling_ratio/mean": 0.9997930526733398, + "sampling/importance_sampling_ratio/min": 0.4544433653354645, + "sampling/sampling_logp_difference/max": 0.7886819839477539, + "sampling/sampling_logp_difference/mean": 0.018123114481568336, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 33.328125, + "completions/mean_terminated_length": 33.328125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.13617177307605743, + "epoch": 0.4778761061946903, + "frac_reward_zero_std": 0.75, + "grad_norm": 29.547501296153417, + "kl": 0.09512509405612946, + "learning_rate": 9.988976611246728e-07, + "loss": -0.7683, + "num_tokens": 4190165.0, + "reward": -0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": -0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.278420090675354, + "sampling/importance_sampling_ratio/mean": 1.0012259483337402, + "sampling/importance_sampling_ratio/min": 0.7977098822593689, + "sampling/sampling_logp_difference/max": 0.24562501907348633, + "sampling/sampling_logp_difference/mean": 0.00955257099121809, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 84.5, + "completions/mean_terminated_length": 84.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3737594485282898, + "epoch": 0.479646017699115, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.198167042441208, + "kl": 0.002854880876839161, + "learning_rate": 9.988458133320008e-07, + "loss": -0.0762, + "num_tokens": 4205285.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.346764087677002, + "sampling/importance_sampling_ratio/mean": 0.9988805651664734, + "sampling/importance_sampling_ratio/min": 0.5224089622497559, + "sampling/sampling_logp_difference/max": 0.6493045091629028, + "sampling/sampling_logp_difference/mean": 0.01678304933011532, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 82.8125, + "completions/mean_terminated_length": 82.8125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.43366530537605286, + "epoch": 0.4814159292035398, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031160634170031478, + "kl": 0.004372789058834314, + "learning_rate": 9.987927754923843e-07, + "loss": 0.0, + "num_tokens": 4224985.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6253509521484375, + "sampling/importance_sampling_ratio/mean": 1.000022530555725, + "sampling/importance_sampling_ratio/min": 0.417615681886673, + "sampling/sampling_logp_difference/max": 0.8731937408447266, + "sampling/sampling_logp_difference/mean": 0.017850469797849655, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 93.890625, + "completions/mean_terminated_length": 93.890625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.30273085832595825, + "epoch": 0.4831858407079646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03724891110348457, + "kl": 0.003369373269379139, + "learning_rate": 9.987385477323506e-07, + "loss": 0.0, + "num_tokens": 4242722.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4414142370224, + "sampling/importance_sampling_ratio/mean": 1.0018047094345093, + "sampling/importance_sampling_ratio/min": 0.5525964498519897, + "sampling/sampling_logp_difference/max": 0.5931272506713867, + "sampling/sampling_logp_difference/mean": 0.020217468962073326, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 103.703125, + "completions/mean_terminated_length": 103.703125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3254879415035248, + "epoch": 0.4849557522123894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028170574077244014, + "kl": 0.004011090844869614, + "learning_rate": 9.986831301812655e-07, + "loss": 0.0, + "num_tokens": 4260655.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3963828086853027, + "sampling/importance_sampling_ratio/mean": 0.9998421669006348, + "sampling/importance_sampling_ratio/min": 0.6422039270401001, + "sampling/sampling_logp_difference/max": 0.44284939765930176, + "sampling/sampling_logp_difference/mean": 0.012491639703512192, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 168.0, + "completions/max_terminated_length": 168.0, + "completions/mean_length": 103.25, + "completions/mean_terminated_length": 103.25, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.4250674843788147, + "epoch": 0.48672566371681414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024067047052095888, + "kl": 0.002488392870873213, + "learning_rate": 9.98626522971333e-07, + "loss": 0.0, + "num_tokens": 4277503.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6527782678604126, + "sampling/importance_sampling_ratio/mean": 0.9999735355377197, + "sampling/importance_sampling_ratio/min": 0.5634852647781372, + "sampling/sampling_logp_difference/max": 0.5736141204833984, + "sampling/sampling_logp_difference/mean": 0.016514642164111137, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 57.203125, + "completions/mean_terminated_length": 57.203125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.30962735414505005, + "epoch": 0.48849557522123893, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08756911954230388, + "kl": 0.010065382346510887, + "learning_rate": 9.985687262375956e-07, + "loss": 0.0, + "num_tokens": 4300284.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5002374649047852, + "sampling/importance_sampling_ratio/mean": 1.0002117156982422, + "sampling/importance_sampling_ratio/min": 0.7399517297744751, + "sampling/sampling_logp_difference/max": 0.4056234359741211, + "sampling/sampling_logp_difference/mean": 0.014174298383295536, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 104.296875, + "completions/mean_terminated_length": 104.296875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.3918093144893646, + "epoch": 0.4902654867256637, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026797512712986273, + "kl": 0.0023234847467392683, + "learning_rate": 9.985097401179333e-07, + "loss": 0.0, + "num_tokens": 4315727.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5206025838851929, + "sampling/importance_sampling_ratio/mean": 1.0002238750457764, + "sampling/importance_sampling_ratio/min": 0.6069608926773071, + "sampling/sampling_logp_difference/max": 0.49929094314575195, + "sampling/sampling_logp_difference/mean": 0.014753839001059532, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 56.5, + "completions/mean_terminated_length": 56.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2540507912635803, + "epoch": 0.4920353982300885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09931648670257845, + "kl": 0.012060517445206642, + "learning_rate": 9.98449564753063e-07, + "loss": 0.0001, + "num_tokens": 4330927.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4970253705978394, + "sampling/importance_sampling_ratio/mean": 0.999140202999115, + "sampling/importance_sampling_ratio/min": 0.6079336404800415, + "sampling/sampling_logp_difference/max": 0.49768948554992676, + "sampling/sampling_logp_difference/mean": 0.012635363265872002, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 95.515625, + "completions/mean_terminated_length": 95.515625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.29506152868270874, + "epoch": 0.49380530973451325, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02588733266619697, + "kl": 0.005474093370139599, + "learning_rate": 9.98388200286539e-07, + "loss": 0.0, + "num_tokens": 4346944.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7718281745910645, + "sampling/importance_sampling_ratio/mean": 0.9993443489074707, + "sampling/importance_sampling_ratio/min": 0.6424316167831421, + "sampling/sampling_logp_difference/max": 0.5720119476318359, + "sampling/sampling_logp_difference/mean": 0.015047137625515461, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 120.96875, + "completions/mean_terminated_length": 120.96875, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "entropy": 0.43145015835762024, + "epoch": 0.49557522123893805, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15511521376513152, + "kl": 0.007133021019399166, + "learning_rate": 9.98325646864753e-07, + "loss": 0.0001, + "num_tokens": 4363742.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.998978853225708, + "sampling/importance_sampling_ratio/min": 0.41871336102485657, + "sampling/sampling_logp_difference/max": 0.8705687522888184, + "sampling/sampling_logp_difference/mean": 0.01744624227285385, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 59.71875, + "completions/mean_terminated_length": 59.71875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.33847692608833313, + "epoch": 0.49734513274336284, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5322890577873676, + "kl": 0.009394319728016853, + "learning_rate": 9.98261904636932e-07, + "loss": -0.0016, + "num_tokens": 4377820.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.586092233657837, + "sampling/importance_sampling_ratio/mean": 1.0006672143936157, + "sampling/importance_sampling_ratio/min": 0.6987010836601257, + "sampling/sampling_logp_difference/max": 0.461273193359375, + "sampling/sampling_logp_difference/mean": 0.015468766912817955, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 30.8125, + "completions/mean_terminated_length": 30.8125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.22856585681438446, + "epoch": 0.49911504424778763, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20919807997035103, + "kl": 0.013904878869652748, + "learning_rate": 9.9819697375514e-07, + "loss": 0.0001, + "num_tokens": 4399776.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3514093160629272, + "sampling/importance_sampling_ratio/mean": 1.0007905960083008, + "sampling/importance_sampling_ratio/min": 0.7770977020263672, + "sampling/sampling_logp_difference/max": 0.3011479377746582, + "sampling/sampling_logp_difference/mean": 0.011299017816781998, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 41.3125, + "completions/mean_terminated_length": 41.3125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.17480722069740295, + "epoch": 0.5008849557522124, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033366070697314384, + "kl": 0.0021748305298388004, + "learning_rate": 9.981308543742756e-07, + "loss": 0.0, + "num_tokens": 4416100.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.2915865182876587, + "sampling/importance_sampling_ratio/mean": 0.9992952346801758, + "sampling/importance_sampling_ratio/min": 0.737482488155365, + "sampling/sampling_logp_difference/max": 0.30451297760009766, + "sampling/sampling_logp_difference/mean": 0.01027563028037548, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 68.890625, + "completions/mean_terminated_length": 68.890625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.24031007289886475, + "epoch": 0.5026548672566372, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.015676680037355565, + "kl": 0.001302052871324122, + "learning_rate": 9.980635466520736e-07, + "loss": 0.0, + "num_tokens": 4432957.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3076136112213135, + "sampling/importance_sampling_ratio/mean": 1.0002350807189941, + "sampling/importance_sampling_ratio/min": 0.7194289565086365, + "sampling/sampling_logp_difference/max": 0.3292975425720215, + "sampling/sampling_logp_difference/mean": 0.009223762899637222, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 72.375, + "completions/mean_terminated_length": 72.375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3059983253479004, + "epoch": 0.504424778761062, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03358499227826061, + "kl": 0.004222136922180653, + "learning_rate": 9.979950507491033e-07, + "loss": 0.0, + "num_tokens": 4453269.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9277665615081787, + "sampling/importance_sampling_ratio/mean": 0.999069333076477, + "sampling/importance_sampling_ratio/min": 0.7011988759040833, + "sampling/sampling_logp_difference/max": 0.6563620567321777, + "sampling/sampling_logp_difference/mean": 0.014185376465320587, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 100.25, + "completions/mean_terminated_length": 100.25, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.46278679370880127, + "epoch": 0.5061946902654867, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.349333059405631, + "kl": 0.00824448186904192, + "learning_rate": 9.979253668287685e-07, + "loss": 0.0758, + "num_tokens": 4470949.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.6193196773529053, + "sampling/importance_sampling_ratio/mean": 1.000375747680664, + "sampling/importance_sampling_ratio/min": 0.6411598324775696, + "sampling/sampling_logp_difference/max": 0.4820060729980469, + "sampling/sampling_logp_difference/mean": 0.017631113529205322, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 823.0, + "completions/max_terminated_length": 823.0, + "completions/mean_length": 140.109375, + "completions/mean_terminated_length": 140.109375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.3196713924407959, + "epoch": 0.5079646017699115, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.4901874515071363, + "kl": 0.005851144902408123, + "learning_rate": 9.978544950573073e-07, + "loss": -0.0837, + "num_tokens": 4489404.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9992465376853943, + "sampling/importance_sampling_ratio/min": 0.6388933062553406, + "sampling/sampling_logp_difference/max": 0.8145382404327393, + "sampling/sampling_logp_difference/mean": 0.01222514733672142, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 81.03125, + "completions/mean_terminated_length": 81.03125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2995682954788208, + "epoch": 0.5097345132743363, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036957311092894976, + "kl": 0.006040679756551981, + "learning_rate": 9.977824356037915e-07, + "loss": 0.0, + "num_tokens": 4504238.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.493669033050537, + "sampling/importance_sampling_ratio/mean": 1.0006897449493408, + "sampling/importance_sampling_ratio/min": 0.6473813652992249, + "sampling/sampling_logp_difference/max": 0.43481969833374023, + "sampling/sampling_logp_difference/mean": 0.013412510976195335, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 75.078125, + "completions/mean_terminated_length": 75.078125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4171299338340759, + "epoch": 0.511504424778761, + "frac_reward_zero_std": 0.5, + "grad_norm": 15.761554871688615, + "kl": 0.018951520323753357, + "learning_rate": 9.97709188640126e-07, + "loss": 0.1355, + "num_tokens": 4521539.0, + "reward": 0.625, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.690384030342102, + "sampling/importance_sampling_ratio/mean": 0.9999475479125977, + "sampling/importance_sampling_ratio/min": 0.5168097615242004, + "sampling/sampling_logp_difference/max": 0.6600804328918457, + "sampling/sampling_logp_difference/mean": 0.01776520349085331, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 150.0, + "completions/max_terminated_length": 150.0, + "completions/mean_length": 56.796875, + "completions/mean_terminated_length": 56.796875, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.3325667381286621, + "epoch": 0.5132743362831859, + "frac_reward_zero_std": 0.5, + "grad_norm": 12.906319531110897, + "kl": 0.01129073090851307, + "learning_rate": 9.976347543410486e-07, + "loss": -0.0576, + "num_tokens": 4542166.0, + "reward": 0.71875, + "reward_std": 0.38319888710975647, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0011260509490967, + "sampling/importance_sampling_ratio/min": 0.6028571724891663, + "sampling/sampling_logp_difference/max": 0.7271323204040527, + "sampling/sampling_logp_difference/mean": 0.014359983615577221, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 91.40625, + "completions/mean_terminated_length": 91.40625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.27161622047424316, + "epoch": 0.5150442477876106, + "frac_reward_zero_std": 0.5, + "grad_norm": 6.2989239760333975, + "kl": 0.022020166739821434, + "learning_rate": 9.975591328841304e-07, + "loss": 0.1212, + "num_tokens": 4559424.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0005266666412354, + "sampling/importance_sampling_ratio/min": 0.4944953918457031, + "sampling/sampling_logp_difference/max": 0.7871992588043213, + "sampling/sampling_logp_difference/mean": 0.013765428215265274, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 32.109375, + "completions/mean_terminated_length": 32.109375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.20906458795070648, + "epoch": 0.5168141592920354, + "frac_reward_zero_std": 0.5, + "grad_norm": 30.28759677520763, + "kl": 0.030800532549619675, + "learning_rate": 9.974823244497737e-07, + "loss": 0.1248, + "num_tokens": 4573079.0, + "reward": 0.46875, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9987730979919434, + "sampling/importance_sampling_ratio/min": 0.42857232689857483, + "sampling/sampling_logp_difference/max": 0.8472957611083984, + "sampling/sampling_logp_difference/mean": 0.016848154366016388, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 62.265625, + "completions/mean_terminated_length": 62.265625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.275778591632843, + "epoch": 0.5185840707964602, + "frac_reward_zero_std": 0.5, + "grad_norm": 11.562227028990126, + "kl": 0.030032262206077576, + "learning_rate": 9.974043292212127e-07, + "loss": 0.2261, + "num_tokens": 4587704.0, + "reward": 0.65625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.631435751914978, + "sampling/importance_sampling_ratio/mean": 0.9993401765823364, + "sampling/importance_sampling_ratio/min": 0.1340518444776535, + "sampling/sampling_logp_difference/max": 2.009528636932373, + "sampling/sampling_logp_difference/mean": 0.016510870307683945, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.359375, + "completions/mean_terminated_length": 17.359375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.049905285239219666, + "epoch": 0.5203539823008849, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.29504253071905645, + "kl": 0.008438438177108765, + "learning_rate": 9.97325147384513e-07, + "loss": 0.0001, + "num_tokens": 4622127.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2581522464752197, + "sampling/importance_sampling_ratio/mean": 0.9986610412597656, + "sampling/importance_sampling_ratio/min": 0.8183937072753906, + "sampling/sampling_logp_difference/max": 0.22964417934417725, + "sampling/sampling_logp_difference/mean": 0.0056549059227108955, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 111.421875, + "completions/mean_terminated_length": 111.421875, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "entropy": 0.5299802422523499, + "epoch": 0.5221238938053098, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.4250467422790787, + "kl": 0.0054356371983885765, + "learning_rate": 9.97244779128571e-07, + "loss": -0.0261, + "num_tokens": 4638570.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.3723515272140503, + "sampling/importance_sampling_ratio/mean": 0.9999358654022217, + "sampling/importance_sampling_ratio/min": 0.6040018200874329, + "sampling/sampling_logp_difference/max": 0.5041780471801758, + "sampling/sampling_logp_difference/mean": 0.01785511150956154, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 60.34375, + "completions/mean_terminated_length": 60.34375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.18186187744140625, + "epoch": 0.5238938053097345, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0889240132782423, + "kl": 0.009175324812531471, + "learning_rate": 9.971632246451127e-07, + "loss": 0.0001, + "num_tokens": 4652832.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.874014139175415, + "sampling/importance_sampling_ratio/mean": 1.0008121728897095, + "sampling/importance_sampling_ratio/min": 0.4280461370944977, + "sampling/sampling_logp_difference/max": 0.8485243320465088, + "sampling/sampling_logp_difference/mean": 0.008807593025267124, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 69.3125, + "completions/mean_terminated_length": 69.3125, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.29337161779403687, + "epoch": 0.5256637168141592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2845039152124611, + "kl": 0.01000886783003807, + "learning_rate": 9.970804841286953e-07, + "loss": 0.0001, + "num_tokens": 4668708.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.484485387802124, + "sampling/importance_sampling_ratio/mean": 0.9988928437232971, + "sampling/importance_sampling_ratio/min": 0.5810959935188293, + "sampling/sampling_logp_difference/max": 0.5428392887115479, + "sampling/sampling_logp_difference/mean": 0.012892980128526688, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 248.0, + "completions/max_terminated_length": 248.0, + "completions/mean_length": 106.4375, + "completions/mean_terminated_length": 106.4375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.44031625986099243, + "epoch": 0.5274336283185841, + "frac_reward_zero_std": 0.75, + "grad_norm": 6.971494631759584, + "kl": 0.006387860979884863, + "learning_rate": 9.96996557776704e-07, + "loss": -0.0907, + "num_tokens": 4684560.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.8611406087875366, + "sampling/importance_sampling_ratio/mean": 0.9997141361236572, + "sampling/importance_sampling_ratio/min": 0.6056256890296936, + "sampling/sampling_logp_difference/max": 0.6211895942687988, + "sampling/sampling_logp_difference/mean": 0.01681620627641678, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 196.0, + "completions/max_terminated_length": 196.0, + "completions/mean_length": 45.78125, + "completions/mean_terminated_length": 45.78125, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "entropy": 0.20236334204673767, + "epoch": 0.5292035398230088, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0530022444996644, + "kl": 0.008030351251363754, + "learning_rate": 9.969114457893539e-07, + "loss": 0.0001, + "num_tokens": 4700482.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6515015363693237, + "sampling/importance_sampling_ratio/mean": 0.9991205334663391, + "sampling/importance_sampling_ratio/min": 0.5748081207275391, + "sampling/sampling_logp_difference/max": 0.5537190437316895, + "sampling/sampling_logp_difference/mean": 0.009639833122491837, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 53.359375, + "completions/mean_terminated_length": 53.359375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.14972491562366486, + "epoch": 0.5309734513274337, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2657145861144557, + "kl": 0.02420363575220108, + "learning_rate": 9.96825148369688e-07, + "loss": 0.0001, + "num_tokens": 4718153.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6632829904556274, + "sampling/importance_sampling_ratio/mean": 0.9984308481216431, + "sampling/importance_sampling_ratio/min": 0.6867097616195679, + "sampling/sampling_logp_difference/max": 0.5087933540344238, + "sampling/sampling_logp_difference/mean": 0.011596485041081905, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 86.640625, + "completions/mean_terminated_length": 86.640625, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.4426685571670532, + "epoch": 0.5327433628318584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05573498068166531, + "kl": 0.0059372428804636, + "learning_rate": 9.967376657235778e-07, + "loss": 0.0001, + "num_tokens": 4736258.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.352237582206726, + "sampling/importance_sampling_ratio/mean": 1.0006613731384277, + "sampling/importance_sampling_ratio/min": 0.5942684412002563, + "sampling/sampling_logp_difference/max": 0.5204241275787354, + "sampling/sampling_logp_difference/mean": 0.017739780247211456, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 527.0, + "completions/max_terminated_length": 527.0, + "completions/mean_length": 75.59375, + "completions/mean_terminated_length": 75.59375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3071618676185608, + "epoch": 0.5345132743362832, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4380444782477624, + "kl": 0.05999153107404709, + "learning_rate": 9.966489980597217e-07, + "loss": 0.0002, + "num_tokens": 4753320.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0008987188339233, + "sampling/importance_sampling_ratio/min": 0.41841116547584534, + "sampling/sampling_logp_difference/max": 0.8712906837463379, + "sampling/sampling_logp_difference/mean": 0.01640094444155693, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 68.9375, + "completions/mean_terminated_length": 68.9375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.3281650245189667, + "epoch": 0.536283185840708, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07769537274799407, + "kl": 0.007740527857095003, + "learning_rate": 9.965591455896455e-07, + "loss": 0.0001, + "num_tokens": 4770196.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6265430450439453, + "sampling/importance_sampling_ratio/mean": 0.9986387491226196, + "sampling/importance_sampling_ratio/min": 0.7393641471862793, + "sampling/sampling_logp_difference/max": 0.48645687103271484, + "sampling/sampling_logp_difference/mean": 0.017250174656510353, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 104.25, + "completions/mean_terminated_length": 104.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3975277245044708, + "epoch": 0.5380530973451327, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2827678240213584, + "kl": 0.022672219201922417, + "learning_rate": 9.964681085277011e-07, + "loss": 0.0001, + "num_tokens": 4786692.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.001333475112915, + "sampling/importance_sampling_ratio/min": 0.592957079410553, + "sampling/sampling_logp_difference/max": 1.0330731868743896, + "sampling/sampling_logp_difference/mean": 0.017484024167060852, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 158.265625, + "completions/mean_terminated_length": 158.265625, + "completions/min_length": 40.0, + "completions/min_terminated_length": 40.0, + "entropy": 0.5590561628341675, + "epoch": 0.5398230088495575, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.08652906915803, + "kl": 0.005347556434571743, + "learning_rate": 9.96375887091067e-07, + "loss": 0.0594, + "num_tokens": 4807045.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.7717801332473755, + "sampling/importance_sampling_ratio/mean": 0.9994161128997803, + "sampling/importance_sampling_ratio/min": 0.5527122616767883, + "sampling/sampling_logp_difference/max": 0.5929176807403564, + "sampling/sampling_logp_difference/mean": 0.01890292763710022, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 142.390625, + "completions/mean_terminated_length": 142.390625, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.36418986320495605, + "epoch": 0.5415929203539823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03143902386920403, + "kl": 0.004250286612659693, + "learning_rate": 9.962824814997464e-07, + "loss": 0.0, + "num_tokens": 4827262.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0011675357818604, + "sampling/importance_sampling_ratio/min": 0.6306570172309875, + "sampling/sampling_logp_difference/max": 0.7148250341415405, + "sampling/sampling_logp_difference/mean": 0.01493034791201353, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 60.234375, + "completions/mean_terminated_length": 60.234375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.29062414169311523, + "epoch": 0.5433628318584071, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07331000426106707, + "kl": 0.011067867279052734, + "learning_rate": 9.961878919765677e-07, + "loss": 0.0001, + "num_tokens": 4841629.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4982765913009644, + "sampling/importance_sampling_ratio/mean": 0.9982985258102417, + "sampling/importance_sampling_ratio/min": 0.5298459529876709, + "sampling/sampling_logp_difference/max": 0.6351690292358398, + "sampling/sampling_logp_difference/mean": 0.013999614864587784, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 79.125, + "completions/mean_terminated_length": 79.125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2869476079940796, + "epoch": 0.5451327433628319, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0780303316547964, + "kl": 0.014650448225438595, + "learning_rate": 9.96092118747184e-07, + "loss": 0.0001, + "num_tokens": 4857221.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994796514511108, + "sampling/importance_sampling_ratio/min": 0.49689343571662903, + "sampling/sampling_logp_difference/max": 0.7066882848739624, + "sampling/sampling_logp_difference/mean": 0.018858671188354492, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 72.328125, + "completions/mean_terminated_length": 72.328125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.25819018483161926, + "epoch": 0.5469026548672566, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04189031557716253, + "kl": 0.0059643518179655075, + "learning_rate": 9.959951620400718e-07, + "loss": 0.0001, + "num_tokens": 4874138.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4584249258041382, + "sampling/importance_sampling_ratio/mean": 0.9993717074394226, + "sampling/importance_sampling_ratio/min": 0.6622897386550903, + "sampling/sampling_logp_difference/max": 0.4120521545410156, + "sampling/sampling_logp_difference/mean": 0.012498611584305763, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 68.3125, + "completions/mean_terminated_length": 68.3125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.401637464761734, + "epoch": 0.5486725663716814, + "frac_reward_zero_std": 0.75, + "grad_norm": 8.403602923707776, + "kl": 0.013075219467282295, + "learning_rate": 9.95897022086531e-07, + "loss": 0.1173, + "num_tokens": 4888862.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.3958466053009033, + "sampling/importance_sampling_ratio/mean": 0.9982624650001526, + "sampling/importance_sampling_ratio/min": 0.6836552619934082, + "sampling/sampling_logp_difference/max": 0.38030147552490234, + "sampling/sampling_logp_difference/mean": 0.016478929668664932, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 49.859375, + "completions/mean_terminated_length": 49.859375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3399659991264343, + "epoch": 0.5504424778761062, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15924851242976146, + "kl": 0.015980035066604614, + "learning_rate": 9.957976991206845e-07, + "loss": 0.0001, + "num_tokens": 4904533.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4301775693893433, + "sampling/importance_sampling_ratio/mean": 1.001241683959961, + "sampling/importance_sampling_ratio/min": 0.6126782298088074, + "sampling/sampling_logp_difference/max": 0.4899153709411621, + "sampling/sampling_logp_difference/mean": 0.01730850338935852, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 83.75, + "completions/mean_terminated_length": 83.75, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.41513192653656006, + "epoch": 0.552212389380531, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04073152091838604, + "kl": 0.006671360228210688, + "learning_rate": 9.956971933794773e-07, + "loss": 0.0001, + "num_tokens": 4920229.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.8718671798706055, + "sampling/importance_sampling_ratio/mean": 1.0000343322753906, + "sampling/importance_sampling_ratio/min": 0.5747321844100952, + "sampling/sampling_logp_difference/max": 0.6269364356994629, + "sampling/sampling_logp_difference/mean": 0.01486043632030487, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 135.65625, + "completions/mean_terminated_length": 135.65625, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "entropy": 0.4970802664756775, + "epoch": 0.5539823008849557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021734362861208452, + "kl": 0.0033400128595530987, + "learning_rate": 9.955955051026758e-07, + "loss": 0.0, + "num_tokens": 4939343.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5506399869918823, + "sampling/importance_sampling_ratio/mean": 1.0004029273986816, + "sampling/importance_sampling_ratio/min": 0.710355281829834, + "sampling/sampling_logp_difference/max": 0.43866777420043945, + "sampling/sampling_logp_difference/mean": 0.01776507869362831, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 52.359375, + "completions/mean_terminated_length": 52.359375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2804589867591858, + "epoch": 0.5557522123893806, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06747453302786525, + "kl": 0.007154546212404966, + "learning_rate": 9.954926345328678e-07, + "loss": 0.0001, + "num_tokens": 4954166.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5472042560577393, + "sampling/importance_sampling_ratio/mean": 1.0000067949295044, + "sampling/importance_sampling_ratio/min": 0.6417877674102783, + "sampling/sampling_logp_difference/max": 0.4434976577758789, + "sampling/sampling_logp_difference/mean": 0.014697583392262459, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 14.84375, + "completions/mean_terminated_length": 14.84375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.07681912183761597, + "epoch": 0.5575221238938053, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7951193652292655, + "kl": 0.022772781550884247, + "learning_rate": 9.953885819154614e-07, + "loss": 0.0002, + "num_tokens": 4975244.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5425488948822021, + "sampling/importance_sampling_ratio/mean": 0.9988242983818054, + "sampling/importance_sampling_ratio/min": 0.5694617629051208, + "sampling/sampling_logp_difference/max": 0.5630636215209961, + "sampling/sampling_logp_difference/mean": 0.018749669194221497, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 106.96875, + "completions/mean_terminated_length": 106.96875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4102153778076172, + "epoch": 0.5592920353982301, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.188538697896643, + "kl": 0.003499663434922695, + "learning_rate": 9.952833474986846e-07, + "loss": 0.04, + "num_tokens": 4992490.0, + "reward": -0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.5456247329711914, + "sampling/importance_sampling_ratio/mean": 0.9988653063774109, + "sampling/importance_sampling_ratio/min": 0.6146138310432434, + "sampling/sampling_logp_difference/max": 0.48676109313964844, + "sampling/sampling_logp_difference/mean": 0.020490184426307678, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 119.140625, + "completions/mean_terminated_length": 119.140625, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "entropy": 0.6081708669662476, + "epoch": 0.5610619469026549, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.7009228075577634, + "kl": 0.0045472802594304085, + "learning_rate": 9.951769315335843e-07, + "loss": 0.0164, + "num_tokens": 5009779.0, + "reward": -0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.9710369110107422, + "sampling/importance_sampling_ratio/mean": 1.0008913278579712, + "sampling/importance_sampling_ratio/min": 0.5404453873634338, + "sampling/sampling_logp_difference/max": 0.6785597801208496, + "sampling/sampling_logp_difference/mean": 0.02135409787297249, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 726.0, + "completions/max_terminated_length": 726.0, + "completions/mean_length": 79.640625, + "completions/mean_terminated_length": 79.640625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4556015729904175, + "epoch": 0.5628318584070796, + "frac_reward_zero_std": 0.75, + "grad_norm": 11.94893909065176, + "kl": 0.004789283964782953, + "learning_rate": 9.95069334274027e-07, + "loss": 0.2709, + "num_tokens": 5026012.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.5947656631469727, + "sampling/importance_sampling_ratio/mean": 1.0000662803649902, + "sampling/importance_sampling_ratio/min": 0.6957854628562927, + "sampling/sampling_logp_difference/max": 0.46672677993774414, + "sampling/sampling_logp_difference/mean": 0.02015022188425064, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 41.609375, + "completions/mean_terminated_length": 41.609375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.30406153202056885, + "epoch": 0.5646017699115045, + "frac_reward_zero_std": 0.75, + "grad_norm": 12.714616664961063, + "kl": 0.01399939227849245, + "learning_rate": 9.949605559766967e-07, + "loss": -0.1305, + "num_tokens": 5041763.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.4089725017547607, + "sampling/importance_sampling_ratio/mean": 0.9992440938949585, + "sampling/importance_sampling_ratio/min": 0.6184074282646179, + "sampling/sampling_logp_difference/max": 0.4806077480316162, + "sampling/sampling_logp_difference/mean": 0.012585079297423363, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 41.546875, + "completions/mean_terminated_length": 41.546875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.17763113975524902, + "epoch": 0.5663716814159292, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14469667542750955, + "kl": 0.017358124256134033, + "learning_rate": 9.94850596901095e-07, + "loss": 0.0001, + "num_tokens": 5056518.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4221454858779907, + "sampling/importance_sampling_ratio/mean": 1.0001862049102783, + "sampling/importance_sampling_ratio/min": 0.586487889289856, + "sampling/sampling_logp_difference/max": 0.5336031913757324, + "sampling/sampling_logp_difference/mean": 0.01157684437930584, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 76.484375, + "completions/mean_terminated_length": 76.484375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3155892789363861, + "epoch": 0.5681415929203539, + "frac_reward_zero_std": 0.75, + "grad_norm": 7.021400392096172, + "kl": 0.010425843298435211, + "learning_rate": 9.947394573095402e-07, + "loss": 0.2305, + "num_tokens": 5075909.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.4295427799224854, + "sampling/importance_sampling_ratio/mean": 0.9993923902511597, + "sampling/importance_sampling_ratio/min": 0.6148527264595032, + "sampling/sampling_logp_difference/max": 0.4863724708557129, + "sampling/sampling_logp_difference/mean": 0.01577897183597088, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 86.734375, + "completions/mean_terminated_length": 86.734375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3866017162799835, + "epoch": 0.5699115044247788, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040077809721334394, + "kl": 0.007752751465886831, + "learning_rate": 9.94627137467167e-07, + "loss": 0.0001, + "num_tokens": 5090708.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6048550605773926, + "sampling/importance_sampling_ratio/mean": 0.9996352195739746, + "sampling/importance_sampling_ratio/min": 0.5155983567237854, + "sampling/sampling_logp_difference/max": 0.6624271869659424, + "sampling/sampling_logp_difference/mean": 0.014700477942824364, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 162.0, + "completions/max_terminated_length": 162.0, + "completions/mean_length": 35.953125, + "completions/mean_terminated_length": 35.953125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1371205598115921, + "epoch": 0.5716814159292035, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2096955708072268, + "kl": 0.019187739118933678, + "learning_rate": 9.945136376419258e-07, + "loss": 0.0001, + "num_tokens": 5105873.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3945482969284058, + "sampling/importance_sampling_ratio/mean": 0.9994967579841614, + "sampling/importance_sampling_ratio/min": 0.47429314255714417, + "sampling/sampling_logp_difference/max": 0.7459297180175781, + "sampling/sampling_logp_difference/mean": 0.007329988293349743, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 130.0, + "completions/max_terminated_length": 130.0, + "completions/mean_length": 33.78125, + "completions/mean_terminated_length": 33.78125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.12136248499155045, + "epoch": 0.5734513274336284, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20048100584716425, + "kl": 0.018594108521938324, + "learning_rate": 9.943989581045819e-07, + "loss": 0.0001, + "num_tokens": 5122099.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.266669511795044, + "sampling/importance_sampling_ratio/mean": 0.9998998641967773, + "sampling/importance_sampling_ratio/min": 0.7039791941642761, + "sampling/sampling_logp_difference/max": 0.35100650787353516, + "sampling/sampling_logp_difference/mean": 0.005552534945309162, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 55.84375, + "completions/mean_terminated_length": 55.84375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3695143461227417, + "epoch": 0.5752212389380531, + "frac_reward_zero_std": 0.75, + "grad_norm": 12.740230354540556, + "kl": 0.00976649858057499, + "learning_rate": 9.942830991287149e-07, + "loss": 0.0784, + "num_tokens": 5135977.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.5773446559906006, + "sampling/importance_sampling_ratio/mean": 0.9974552989006042, + "sampling/importance_sampling_ratio/min": 0.6371076703071594, + "sampling/sampling_logp_difference/max": 0.45574283599853516, + "sampling/sampling_logp_difference/mean": 0.021933196112513542, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 97.875, + "completions/mean_terminated_length": 97.875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.3761260509490967, + "epoch": 0.5769911504424778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0306591512793739, + "kl": 0.003819515462964773, + "learning_rate": 9.94166060990718e-07, + "loss": 0.0, + "num_tokens": 5151105.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6236847639083862, + "sampling/importance_sampling_ratio/mean": 1.000258207321167, + "sampling/importance_sampling_ratio/min": 0.5582404732704163, + "sampling/sampling_logp_difference/max": 0.5829653739929199, + "sampling/sampling_logp_difference/mean": 0.01616375334560871, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 40.4375, + "completions/mean_terminated_length": 40.4375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.21063324809074402, + "epoch": 0.5787610619469027, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18340809261246058, + "kl": 0.03675929456949234, + "learning_rate": 9.940478439697972e-07, + "loss": 0.0002, + "num_tokens": 5165245.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6573799848556519, + "sampling/importance_sampling_ratio/mean": 0.9981979727745056, + "sampling/importance_sampling_ratio/min": 0.47290316224098206, + "sampling/sampling_logp_difference/max": 0.7488646507263184, + "sampling/sampling_logp_difference/mean": 0.012335125356912613, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 54.9375, + "completions/mean_terminated_length": 54.9375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.35798150300979614, + "epoch": 0.5805309734513274, + "frac_reward_zero_std": 0.5, + "grad_norm": 7.141605430777651, + "kl": 0.05895555764436722, + "learning_rate": 9.939284483479715e-07, + "loss": 0.0999, + "num_tokens": 5178681.0, + "reward": 0.59375, + "reward_std": 0.497555673122406, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.5644574165344238, + "sampling/importance_sampling_ratio/mean": 1.0009150505065918, + "sampling/importance_sampling_ratio/min": 0.5880892872810364, + "sampling/sampling_logp_difference/max": 0.5308763980865479, + "sampling/sampling_logp_difference/mean": 0.02186429128050804, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 14.609375, + "completions/mean_terminated_length": 14.609375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.05067729949951172, + "epoch": 0.5823008849557522, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5404423741608648, + "kl": 0.02486887201666832, + "learning_rate": 9.93807874410071e-07, + "loss": 0.0002, + "num_tokens": 5191696.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.265113115310669, + "sampling/importance_sampling_ratio/mean": 0.9971482753753662, + "sampling/importance_sampling_ratio/min": 0.7563673257827759, + "sampling/sampling_logp_difference/max": 0.279228150844574, + "sampling/sampling_logp_difference/mean": 0.009500461630523205, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 85.0, + "completions/max_terminated_length": 85.0, + "completions/mean_length": 15.828125, + "completions/mean_terminated_length": 15.828125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1415385901927948, + "epoch": 0.584070796460177, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.30440635525570386, + "kl": 0.1242285966873169, + "learning_rate": 9.936861224437372e-07, + "loss": 0.0013, + "num_tokens": 5205861.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5973737239837646, + "sampling/importance_sampling_ratio/mean": 1.0021167993545532, + "sampling/importance_sampling_ratio/min": 0.7834390997886658, + "sampling/sampling_logp_difference/max": 0.46836090087890625, + "sampling/sampling_logp_difference/mean": 0.008564083836972713, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 13.65625, + "completions/mean_terminated_length": 13.65625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.062073707580566406, + "epoch": 0.5858407079646017, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08148900210167391, + "kl": 0.1352459043264389, + "learning_rate": 9.935631927394214e-07, + "loss": 0.0014, + "num_tokens": 5218223.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6912966966629028, + "sampling/importance_sampling_ratio/mean": 0.9999576807022095, + "sampling/importance_sampling_ratio/min": 0.6084062457084656, + "sampling/sampling_logp_difference/max": 0.5254955291748047, + "sampling/sampling_logp_difference/mean": 0.011843099258840084, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 18.0, + "completions/max_terminated_length": 18.0, + "completions/mean_length": 14.96875, + "completions/mean_terminated_length": 14.96875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.07408523559570312, + "epoch": 0.5876106194690266, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.37575585050410487, + "kl": 0.12586016952991486, + "learning_rate": 9.934390855903852e-07, + "loss": 0.0011, + "num_tokens": 5233069.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.1157681941986084, + "sampling/importance_sampling_ratio/mean": 0.9991582632064819, + "sampling/importance_sampling_ratio/min": 0.8067606091499329, + "sampling/sampling_logp_difference/max": 0.21472835540771484, + "sampling/sampling_logp_difference/mean": 0.005963312461972237, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 98.0, + "completions/max_terminated_length": 98.0, + "completions/mean_length": 14.796875, + "completions/mean_terminated_length": 14.796875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.20496223866939545, + "epoch": 0.5893805309734513, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2765369592767717, + "kl": 0.2802959978580475, + "learning_rate": 9.93313801292698e-07, + "loss": 0.003, + "num_tokens": 5243936.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.646456241607666, + "sampling/importance_sampling_ratio/mean": 1.0015816688537598, + "sampling/importance_sampling_ratio/min": 0.5725432634353638, + "sampling/sampling_logp_difference/max": 0.5576670169830322, + "sampling/sampling_logp_difference/mean": 0.008417494595050812, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 146.0, + "completions/max_terminated_length": 146.0, + "completions/mean_length": 19.640625, + "completions/mean_terminated_length": 19.640625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1658213883638382, + "epoch": 0.5911504424778761, + "frac_reward_zero_std": 0.75, + "grad_norm": 38.715214411963814, + "kl": 0.1244959682226181, + "learning_rate": 9.93187340145239e-07, + "loss": -0.506, + "num_tokens": 5256345.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.4853169918060303, + "sampling/importance_sampling_ratio/mean": 0.99802565574646, + "sampling/importance_sampling_ratio/min": 0.6318823099136353, + "sampling/sampling_logp_difference/max": 0.45905208587646484, + "sampling/sampling_logp_difference/mean": 0.017435431480407715, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 83.0, + "completions/max_terminated_length": 83.0, + "completions/mean_length": 14.25, + "completions/mean_terminated_length": 14.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.16205374896526337, + "epoch": 0.5929203539823009, + "frac_reward_zero_std": 0.75, + "grad_norm": 45.914292343197125, + "kl": 0.5228198170661926, + "learning_rate": 9.93059702449693e-07, + "loss": -0.1383, + "num_tokens": 5267081.0, + "reward": -0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.230634093284607, + "sampling/importance_sampling_ratio/mean": 1.0003182888031006, + "sampling/importance_sampling_ratio/min": 0.8197073936462402, + "sampling/sampling_logp_difference/max": 0.20752954483032227, + "sampling/sampling_logp_difference/mean": 0.0072051300667226315, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 133.0, + "completions/max_terminated_length": 133.0, + "completions/mean_length": 15.390625, + "completions/mean_terminated_length": 15.390625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1279073804616928, + "epoch": 0.5946902654867257, + "frac_reward_zero_std": 0.75, + "grad_norm": 31.76875433357429, + "kl": 0.5445804595947266, + "learning_rate": 9.929308885105534e-07, + "loss": -0.2191, + "num_tokens": 5279250.0, + "reward": -0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.1882487535476685, + "sampling/importance_sampling_ratio/mean": 1.0010457038879395, + "sampling/importance_sampling_ratio/min": 0.7639153599739075, + "sampling/sampling_logp_difference/max": 0.2692983150482178, + "sampling/sampling_logp_difference/mean": 0.00909061636775732, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 98.0, + "completions/max_terminated_length": 98.0, + "completions/mean_length": 16.4375, + "completions/mean_terminated_length": 16.4375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.12667866051197052, + "epoch": 0.5964601769911504, + "frac_reward_zero_std": 0.75, + "grad_norm": 50.528642905457026, + "kl": 0.21811896562576294, + "learning_rate": 9.928008986351186e-07, + "loss": -0.4598, + "num_tokens": 5290814.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.3493925333023071, + "sampling/importance_sampling_ratio/mean": 1.000293493270874, + "sampling/importance_sampling_ratio/min": 0.7212910056114197, + "sampling/sampling_logp_difference/max": 0.32671260833740234, + "sampling/sampling_logp_difference/mean": 0.007266349159181118, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 78.5625, + "completions/mean_terminated_length": 78.5625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.40841299295425415, + "epoch": 0.5982300884955752, + "frac_reward_zero_std": 0.0, + "grad_norm": 22.944560152841685, + "kl": 0.12588217854499817, + "learning_rate": 9.926697331334924e-07, + "loss": -0.3419, + "num_tokens": 5306290.0, + "reward": 0.15625, + "reward_std": 0.7297805547714233, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.8606716394424438, + "sampling/importance_sampling_ratio/mean": 1.0010006427764893, + "sampling/importance_sampling_ratio/min": 0.4402218163013458, + "sampling/sampling_logp_difference/max": 0.8204765319824219, + "sampling/sampling_logp_difference/mean": 0.02122463285923004, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 168.0, + "completions/max_terminated_length": 168.0, + "completions/mean_length": 69.5, + "completions/mean_terminated_length": 69.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3494517505168915, + "epoch": 0.6, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03945891901988461, + "kl": 0.004555393010377884, + "learning_rate": 9.925373923185834e-07, + "loss": 0.0001, + "num_tokens": 5320178.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2857304811477661, + "sampling/importance_sampling_ratio/mean": 0.9992601871490479, + "sampling/importance_sampling_ratio/min": 0.6282122731208801, + "sampling/sampling_logp_difference/max": 0.4648771286010742, + "sampling/sampling_logp_difference/mean": 0.012271528132259846, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 183.0, + "completions/max_terminated_length": 183.0, + "completions/mean_length": 86.203125, + "completions/mean_terminated_length": 86.203125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.544769287109375, + "epoch": 0.6017699115044248, + "frac_reward_zero_std": 0.75, + "grad_norm": 10.513692113628164, + "kl": 0.012212522327899933, + "learning_rate": 9.92403876506104e-07, + "loss": -0.1197, + "num_tokens": 5336207.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.5996742248535156, + "sampling/importance_sampling_ratio/mean": 1.00087571144104, + "sampling/importance_sampling_ratio/min": 0.7218353748321533, + "sampling/sampling_logp_difference/max": 0.4697999954223633, + "sampling/sampling_logp_difference/mean": 0.018725665286183357, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 50.984375, + "completions/mean_terminated_length": 50.984375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.25952285528182983, + "epoch": 0.6035398230088496, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13402526271688212, + "kl": 0.025582045316696167, + "learning_rate": 9.922691860145696e-07, + "loss": 0.0001, + "num_tokens": 5350654.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.56446373462677, + "sampling/importance_sampling_ratio/mean": 1.0010714530944824, + "sampling/importance_sampling_ratio/min": 0.6776019930839539, + "sampling/sampling_logp_difference/max": 0.4475431442260742, + "sampling/sampling_logp_difference/mean": 0.015527051873505116, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 51.3125, + "completions/mean_terminated_length": 51.3125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.30644503235816956, + "epoch": 0.6053097345132743, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.28312367214821127, + "kl": 0.05292034149169922, + "learning_rate": 9.921333211652977e-07, + "loss": 0.0002, + "num_tokens": 5365426.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.364556074142456, + "sampling/importance_sampling_ratio/mean": 1.0007081031799316, + "sampling/importance_sampling_ratio/min": 0.559749960899353, + "sampling/sampling_logp_difference/max": 0.5802650451660156, + "sampling/sampling_logp_difference/mean": 0.014037737622857094, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 97.203125, + "completions/mean_terminated_length": 97.203125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3141934275627136, + "epoch": 0.6070796460176991, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040306133956537274, + "kl": 0.008490835316479206, + "learning_rate": 9.919962822824083e-07, + "loss": 0.0001, + "num_tokens": 5381807.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.52422034740448, + "sampling/importance_sampling_ratio/mean": 1.0011824369430542, + "sampling/importance_sampling_ratio/min": 0.6413918733596802, + "sampling/sampling_logp_difference/max": 0.44411468505859375, + "sampling/sampling_logp_difference/mean": 0.011814025230705738, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 112.453125, + "completions/mean_terminated_length": 112.453125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3980545401573181, + "epoch": 0.6088495575221239, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03816208034133788, + "kl": 0.014165371656417847, + "learning_rate": 9.918580696928205e-07, + "loss": 0.0001, + "num_tokens": 5400892.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8213403224945068, + "sampling/importance_sampling_ratio/mean": 0.999814510345459, + "sampling/importance_sampling_ratio/min": 0.5459651947021484, + "sampling/sampling_logp_difference/max": 0.6052000522613525, + "sampling/sampling_logp_difference/mean": 0.022808555513620377, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 65.78125, + "completions/mean_terminated_length": 65.78125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3655658960342407, + "epoch": 0.6106194690265486, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08708536054248218, + "kl": 0.019657455384731293, + "learning_rate": 9.91718683726255e-07, + "loss": 0.0001, + "num_tokens": 5417822.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4601846933364868, + "sampling/importance_sampling_ratio/mean": 1.0012413263320923, + "sampling/importance_sampling_ratio/min": 0.6977089643478394, + "sampling/sampling_logp_difference/max": 0.37856292724609375, + "sampling/sampling_logp_difference/mean": 0.016577202826738358, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 128.0, + "completions/max_terminated_length": 128.0, + "completions/mean_length": 35.328125, + "completions/mean_terminated_length": 35.328125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.22649946808815002, + "epoch": 0.6123893805309735, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2680127798221453, + "kl": 0.037691764533519745, + "learning_rate": 9.915781247152308e-07, + "loss": 0.0002, + "num_tokens": 5431411.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6613584756851196, + "sampling/importance_sampling_ratio/mean": 1.000566005706787, + "sampling/importance_sampling_ratio/min": 0.6990428566932678, + "sampling/sampling_logp_difference/max": 0.5076355934143066, + "sampling/sampling_logp_difference/mean": 0.012610787525773048, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 80.25, + "completions/mean_terminated_length": 80.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.38664960861206055, + "epoch": 0.6141592920353982, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07839437680152708, + "kl": 0.013993092812597752, + "learning_rate": 9.914363929950657e-07, + "loss": 0.0001, + "num_tokens": 5447107.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0008046627044678, + "sampling/importance_sampling_ratio/min": 0.5363089442253113, + "sampling/sampling_logp_difference/max": 1.106311321258545, + "sampling/sampling_logp_difference/mean": 0.018526988103985786, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 88.0, + "completions/mean_terminated_length": 88.0, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.35303232073783875, + "epoch": 0.6159292035398231, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1290413396036071, + "kl": 0.03378347307443619, + "learning_rate": 9.91293488903875e-07, + "loss": 0.0002, + "num_tokens": 5464083.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5305931568145752, + "sampling/importance_sampling_ratio/mean": 1.0008561611175537, + "sampling/importance_sampling_ratio/min": 0.6892523765563965, + "sampling/sampling_logp_difference/max": 0.4256553649902344, + "sampling/sampling_logp_difference/mean": 0.012805117294192314, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 101.84375, + "completions/mean_terminated_length": 101.84375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.4014196991920471, + "epoch": 0.6176991150442478, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05282522173640956, + "kl": 0.012553641572594643, + "learning_rate": 9.91149412782571e-07, + "loss": 0.0001, + "num_tokens": 5481881.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4656864404678345, + "sampling/importance_sampling_ratio/mean": 0.9996223449707031, + "sampling/importance_sampling_ratio/min": 0.6784743070602417, + "sampling/sampling_logp_difference/max": 0.3879086971282959, + "sampling/sampling_logp_difference/mean": 0.015167336910963058, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 121.859375, + "completions/mean_terminated_length": 121.859375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.42137038707733154, + "epoch": 0.6194690265486725, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17263349465215008, + "kl": 0.021598845720291138, + "learning_rate": 9.910041649748612e-07, + "loss": 0.0002, + "num_tokens": 5498240.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998193979263306, + "sampling/importance_sampling_ratio/min": 0.42447274923324585, + "sampling/sampling_logp_difference/max": 0.8569074869155884, + "sampling/sampling_logp_difference/mean": 0.015872254967689514, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 120.828125, + "completions/mean_terminated_length": 120.828125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.5624607801437378, + "epoch": 0.6212389380530974, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07907780312988107, + "kl": 0.04852885752916336, + "learning_rate": 9.908577458272495e-07, + "loss": 0.0005, + "num_tokens": 5517605.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.870368242263794, + "sampling/importance_sampling_ratio/mean": 1.0003533363342285, + "sampling/importance_sampling_ratio/min": 0.5663659572601318, + "sampling/sampling_logp_difference/max": 0.6261353492736816, + "sampling/sampling_logp_difference/mean": 0.021383430808782578, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 130.515625, + "completions/mean_terminated_length": 130.515625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.510712742805481, + "epoch": 0.6230088495575221, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14652883275587417, + "kl": 0.037225618958473206, + "learning_rate": 9.907101556890331e-07, + "loss": 0.0004, + "num_tokens": 5535334.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000260829925537, + "sampling/importance_sampling_ratio/min": 0.6686792373657227, + "sampling/sampling_logp_difference/max": 0.7186784744262695, + "sampling/sampling_logp_difference/mean": 0.017669696360826492, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 134.375, + "completions/mean_terminated_length": 134.375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.4618055820465088, + "epoch": 0.6247787610619469, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0578213304417604, + "kl": 0.034855760633945465, + "learning_rate": 9.905613949123034e-07, + "loss": 0.0004, + "num_tokens": 5553918.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6922149658203125, + "sampling/importance_sampling_ratio/mean": 0.9999423027038574, + "sampling/importance_sampling_ratio/min": 0.6325134038925171, + "sampling/sampling_logp_difference/max": 0.5260382890701294, + "sampling/sampling_logp_difference/mean": 0.01687721721827984, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 122.09375, + "completions/mean_terminated_length": 122.09375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.6637468338012695, + "epoch": 0.6265486725663717, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05067320244235177, + "kl": 0.02216402254998684, + "learning_rate": 9.904114638519443e-07, + "loss": 0.0003, + "num_tokens": 5573492.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.7573411464691162, + "sampling/importance_sampling_ratio/mean": 0.9993027448654175, + "sampling/importance_sampling_ratio/min": 0.566085159778595, + "sampling/sampling_logp_difference/max": 0.5690107345581055, + "sampling/sampling_logp_difference/mean": 0.02189033478498459, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 108.984375, + "completions/mean_terminated_length": 108.984375, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "entropy": 0.5064927339553833, + "epoch": 0.6283185840707964, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022769992272464255, + "kl": 0.003910430707037449, + "learning_rate": 9.902603628656311e-07, + "loss": 0.0, + "num_tokens": 5590211.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4734493494033813, + "sampling/importance_sampling_ratio/mean": 1.0005618333816528, + "sampling/importance_sampling_ratio/min": 0.6874695420265198, + "sampling/sampling_logp_difference/max": 0.387606143951416, + "sampling/sampling_logp_difference/mean": 0.017741823568940163, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 146.703125, + "completions/mean_terminated_length": 146.703125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.5128886103630066, + "epoch": 0.6300884955752213, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05910555125404528, + "kl": 0.06985120475292206, + "learning_rate": 9.901080923138308e-07, + "loss": 0.0006, + "num_tokens": 5622528.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000289797782898, + "sampling/importance_sampling_ratio/min": 0.6196466088294983, + "sampling/sampling_logp_difference/max": 0.8414640426635742, + "sampling/sampling_logp_difference/mean": 0.01770433411002159, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 868.0, + "completions/max_terminated_length": 868.0, + "completions/mean_length": 178.484375, + "completions/mean_terminated_length": 178.484375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.4746066629886627, + "epoch": 0.631858407079646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05989841681630618, + "kl": 0.027999769896268845, + "learning_rate": 9.899546525597997e-07, + "loss": 0.0003, + "num_tokens": 5644367.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.527292013168335, + "sampling/importance_sampling_ratio/mean": 0.9995212554931641, + "sampling/importance_sampling_ratio/min": 0.44542253017425537, + "sampling/sampling_logp_difference/max": 0.8087320327758789, + "sampling/sampling_logp_difference/mean": 0.017029207199811935, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 112.890625, + "completions/mean_terminated_length": 112.890625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.620051383972168, + "epoch": 0.6336283185840708, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026391207446395157, + "kl": 0.005611338187009096, + "learning_rate": 9.898000439695843e-07, + "loss": 0.0001, + "num_tokens": 5661352.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9880595207214355, + "sampling/importance_sampling_ratio/mean": 1.0005333423614502, + "sampling/importance_sampling_ratio/min": 0.7001907825469971, + "sampling/sampling_logp_difference/max": 0.6871590614318848, + "sampling/sampling_logp_difference/mean": 0.02070365846157074, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 122.03125, + "completions/mean_terminated_length": 122.03125, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "entropy": 0.5711897611618042, + "epoch": 0.6353982300884956, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.36483039142888785, + "kl": 0.0318559929728508, + "learning_rate": 9.896442669120187e-07, + "loss": 0.0003, + "num_tokens": 5679354.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.430940866470337, + "sampling/importance_sampling_ratio/mean": 0.9996042847633362, + "sampling/importance_sampling_ratio/min": 0.4420183300971985, + "sampling/sampling_logp_difference/max": 0.816403865814209, + "sampling/sampling_logp_difference/mean": 0.019348707050085068, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 77.421875, + "completions/mean_terminated_length": 77.421875, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.31168049573898315, + "epoch": 0.6371681415929203, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03541548343871878, + "kl": 0.009070035070180893, + "learning_rate": 9.894873217587245e-07, + "loss": 0.0001, + "num_tokens": 5697653.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5912870168685913, + "sampling/importance_sampling_ratio/mean": 1.0003658533096313, + "sampling/importance_sampling_ratio/min": 0.7069886326789856, + "sampling/sampling_logp_difference/max": 0.4645431637763977, + "sampling/sampling_logp_difference/mean": 0.01266229897737503, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 158.796875, + "completions/mean_terminated_length": 158.796875, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.5861371755599976, + "epoch": 0.6389380530973451, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.910836183478518, + "kl": 0.04382926970720291, + "learning_rate": 9.893292088841108e-07, + "loss": 0.046, + "num_tokens": 5719336.0, + "reward": -0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6420366764068604, + "sampling/importance_sampling_ratio/mean": 1.0004842281341553, + "sampling/importance_sampling_ratio/min": 0.48981279134750366, + "sampling/sampling_logp_difference/max": 0.7137320041656494, + "sampling/sampling_logp_difference/mean": 0.01918848231434822, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 110.828125, + "completions/mean_terminated_length": 110.828125, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.431569904088974, + "epoch": 0.6407079646017699, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.212237165655271, + "kl": 0.0400247722864151, + "learning_rate": 9.891699286653712e-07, + "loss": -0.0838, + "num_tokens": 5740701.0, + "reward": -0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.780729055404663, + "sampling/importance_sampling_ratio/mean": 1.0001611709594727, + "sampling/importance_sampling_ratio/min": 0.5716253519058228, + "sampling/sampling_logp_difference/max": 0.5770227909088135, + "sampling/sampling_logp_difference/mean": 0.01780252903699875, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.0, + "completions/max_terminated_length": 620.0, + "completions/mean_length": 185.890625, + "completions/mean_terminated_length": 185.890625, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "entropy": 0.48092374205589294, + "epoch": 0.6424778761061947, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.68364450863825, + "kl": 0.029903601855039597, + "learning_rate": 9.890094814824852e-07, + "loss": -0.0082, + "num_tokens": 5764518.0, + "reward": 0.15625, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.4594154357910156, + "sampling/importance_sampling_ratio/mean": 1.0001177787780762, + "sampling/importance_sampling_ratio/min": 0.4550616145133972, + "sampling/sampling_logp_difference/max": 0.7873225212097168, + "sampling/sampling_logp_difference/mean": 0.017083294689655304, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 98.375, + "completions/mean_terminated_length": 98.375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4051584005355835, + "epoch": 0.6442477876106195, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034731977046040044, + "kl": 0.013031707145273685, + "learning_rate": 9.888478677182154e-07, + "loss": 0.0001, + "num_tokens": 5782318.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3868625164031982, + "sampling/importance_sampling_ratio/mean": 1.0004628896713257, + "sampling/importance_sampling_ratio/min": 0.6924977898597717, + "sampling/sampling_logp_difference/max": 0.3674502372741699, + "sampling/sampling_logp_difference/mean": 0.013802760280668736, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 710.0, + "completions/max_terminated_length": 710.0, + "completions/mean_length": 104.421875, + "completions/mean_terminated_length": 104.421875, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.4693533480167389, + "epoch": 0.6460176991150443, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.0831511432804044, + "kl": 0.024606188759207726, + "learning_rate": 9.886850877581078e-07, + "loss": -0.1668, + "num_tokens": 5801369.0, + "reward": -0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.9070464372634888, + "sampling/importance_sampling_ratio/mean": 1.00021493434906, + "sampling/importance_sampling_ratio/min": 0.4210282862186432, + "sampling/sampling_logp_difference/max": 0.8650553226470947, + "sampling/sampling_logp_difference/mean": 0.016472456976771355, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 86.09375, + "completions/mean_terminated_length": 86.09375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.3831039369106293, + "epoch": 0.647787610619469, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05046837503924455, + "kl": 0.008188454434275627, + "learning_rate": 9.885211419904903e-07, + "loss": 0.0001, + "num_tokens": 5816703.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4862701892852783, + "sampling/importance_sampling_ratio/mean": 0.9999388456344604, + "sampling/importance_sampling_ratio/min": 0.7058389782905579, + "sampling/sampling_logp_difference/max": 0.3962697982788086, + "sampling/sampling_logp_difference/mean": 0.018592428416013718, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 82.09375, + "completions/mean_terminated_length": 82.09375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.25472283363342285, + "epoch": 0.6495575221238938, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04646732187995595, + "kl": 0.018464451655745506, + "learning_rate": 9.883560308064722e-07, + "loss": 0.0001, + "num_tokens": 5834549.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5658870935440063, + "sampling/importance_sampling_ratio/mean": 0.9994906187057495, + "sampling/importance_sampling_ratio/min": 0.7226534485816956, + "sampling/sampling_logp_difference/max": 0.4484524726867676, + "sampling/sampling_logp_difference/mean": 0.013782997615635395, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 105.5625, + "completions/mean_terminated_length": 105.5625, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "entropy": 0.3836281895637512, + "epoch": 0.6513274336283186, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05150506273323067, + "kl": 0.016738034784793854, + "learning_rate": 9.881897545999429e-07, + "loss": 0.0002, + "num_tokens": 5852649.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4687012434005737, + "sampling/importance_sampling_ratio/mean": 1.0002315044403076, + "sampling/importance_sampling_ratio/min": 0.6318727731704712, + "sampling/sampling_logp_difference/max": 0.4590672254562378, + "sampling/sampling_logp_difference/mean": 0.013846302404999733, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 119.6875, + "completions/mean_terminated_length": 119.6875, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "entropy": 0.4400315582752228, + "epoch": 0.6530973451327433, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04148350669275565, + "kl": 0.017446907237172127, + "learning_rate": 9.880223137675707e-07, + "loss": 0.0002, + "num_tokens": 5871701.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.461326003074646, + "sampling/importance_sampling_ratio/mean": 1.0014960765838623, + "sampling/importance_sampling_ratio/min": 0.6738423109054565, + "sampling/sampling_logp_difference/max": 0.3947591781616211, + "sampling/sampling_logp_difference/mean": 0.017619699239730835, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 77.9375, + "completions/mean_terminated_length": 77.9375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.16520312428474426, + "epoch": 0.6548672566371682, + "frac_reward_zero_std": 0.75, + "grad_norm": 6.952300200899005, + "kl": 0.03090791031718254, + "learning_rate": 9.87853708708803e-07, + "loss": 0.1817, + "num_tokens": 5892465.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.707594871520996, + "sampling/importance_sampling_ratio/mean": 0.9993278980255127, + "sampling/importance_sampling_ratio/min": 0.6782475709915161, + "sampling/sampling_logp_difference/max": 0.535085916519165, + "sampling/sampling_logp_difference/mean": 0.012654388323426247, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 113.5625, + "completions/mean_terminated_length": 113.5625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3643713891506195, + "epoch": 0.6566371681415929, + "frac_reward_zero_std": 0.75, + "grad_norm": 7.294525677618649, + "kl": 0.0199178084731102, + "learning_rate": 9.876839398258639e-07, + "loss": 0.1817, + "num_tokens": 5911253.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.5951619148254395, + "sampling/importance_sampling_ratio/mean": 1.0007092952728271, + "sampling/importance_sampling_ratio/min": 0.39645707607269287, + "sampling/sampling_logp_difference/max": 0.9251875877380371, + "sampling/sampling_logp_difference/mean": 0.014422750100493431, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 106.671875, + "completions/mean_terminated_length": 106.671875, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.39586547017097473, + "epoch": 0.6584070796460177, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02170165706755283, + "kl": 0.00442938506603241, + "learning_rate": 9.875130075237543e-07, + "loss": 0.0, + "num_tokens": 5928768.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6555458307266235, + "sampling/importance_sampling_ratio/mean": 1.0000089406967163, + "sampling/importance_sampling_ratio/min": 0.7011512517929077, + "sampling/sampling_logp_difference/max": 0.5041307210922241, + "sampling/sampling_logp_difference/mean": 0.014346872456371784, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 71.5, + "completions/mean_terminated_length": 71.5, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.11664820462465286, + "epoch": 0.6601769911504425, + "frac_reward_zero_std": 0.75, + "grad_norm": 6.413113126385533, + "kl": 0.0214911587536335, + "learning_rate": 9.873409122102503e-07, + "loss": 0.185, + "num_tokens": 5946384.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.7430909872055054, + "sampling/importance_sampling_ratio/mean": 0.9998141527175903, + "sampling/importance_sampling_ratio/min": 0.7246072292327881, + "sampling/sampling_logp_difference/max": 0.5556598901748657, + "sampling/sampling_logp_difference/mean": 0.007141374982893467, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 99.90625, + "completions/mean_terminated_length": 99.90625, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.49403318762779236, + "epoch": 0.6619469026548672, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.065293362113313, + "kl": 0.009674089029431343, + "learning_rate": 9.87167654295903e-07, + "loss": 0.083, + "num_tokens": 5965722.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5598499774932861, + "sampling/importance_sampling_ratio/mean": 0.9995803833007812, + "sampling/importance_sampling_ratio/min": 0.5161864161491394, + "sampling/sampling_logp_difference/max": 0.6612873077392578, + "sampling/sampling_logp_difference/mean": 0.018438229337334633, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 49.8125, + "completions/mean_terminated_length": 49.8125, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.1185794249176979, + "epoch": 0.6637168141592921, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0955250497511546, + "kl": 0.015874072909355164, + "learning_rate": 9.869932341940358e-07, + "loss": 0.0001, + "num_tokens": 5981086.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0023404359817505, + "sampling/importance_sampling_ratio/min": 0.5194261074066162, + "sampling/sampling_logp_difference/max": 0.9786150455474854, + "sampling/sampling_logp_difference/mean": 0.023584650829434395, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 42.40625, + "completions/mean_terminated_length": 42.40625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.18201380968093872, + "epoch": 0.6654867256637168, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18587050428707652, + "kl": 0.0745650976896286, + "learning_rate": 9.868176523207463e-07, + "loss": 0.0004, + "num_tokens": 5994552.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3351178169250488, + "sampling/importance_sampling_ratio/mean": 0.9991710186004639, + "sampling/importance_sampling_ratio/min": 0.5561547875404358, + "sampling/sampling_logp_difference/max": 0.586708664894104, + "sampling/sampling_logp_difference/mean": 0.014691246673464775, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 218.0, + "completions/max_terminated_length": 218.0, + "completions/mean_length": 74.796875, + "completions/mean_terminated_length": 74.796875, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.30755698680877686, + "epoch": 0.6672566371681415, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10144206392078267, + "kl": 0.0294176135212183, + "learning_rate": 9.86640909094902e-07, + "loss": 0.0001, + "num_tokens": 6008331.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3973352909088135, + "sampling/importance_sampling_ratio/mean": 0.9999659657478333, + "sampling/importance_sampling_ratio/min": 0.40352514386177063, + "sampling/sampling_logp_difference/max": 0.9075164794921875, + "sampling/sampling_logp_difference/mean": 0.014053567312657833, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 66.953125, + "completions/mean_terminated_length": 66.953125, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.25754937529563904, + "epoch": 0.6690265486725664, + "frac_reward_zero_std": 0.75, + "grad_norm": 5.230792858420169, + "kl": 0.024930693209171295, + "learning_rate": 9.864630049381424e-07, + "loss": 0.0888, + "num_tokens": 6022776.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.4337999820709229, + "sampling/importance_sampling_ratio/mean": 0.9992684721946716, + "sampling/importance_sampling_ratio/min": 0.6020495295524597, + "sampling/sampling_logp_difference/max": 0.5074155330657959, + "sampling/sampling_logp_difference/mean": 0.009889904409646988, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 43.34375, + "completions/mean_terminated_length": 43.34375, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.17238153517246246, + "epoch": 0.6707964601769911, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1524987210184169, + "kl": 0.1290230005979538, + "learning_rate": 9.862839402748753e-07, + "loss": 0.0006, + "num_tokens": 6036398.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5416227579116821, + "sampling/importance_sampling_ratio/mean": 1.0005288124084473, + "sampling/importance_sampling_ratio/min": 0.6389191746711731, + "sampling/sampling_logp_difference/max": 0.44797730445861816, + "sampling/sampling_logp_difference/mean": 0.010033435188233852, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 35.71875, + "completions/mean_terminated_length": 35.71875, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.1937960535287857, + "epoch": 0.672566371681416, + "frac_reward_zero_std": 0.75, + "grad_norm": 8.898301167597465, + "kl": 0.202432781457901, + "learning_rate": 9.861037155322776e-07, + "loss": -0.178, + "num_tokens": 6049132.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.343531608581543, + "sampling/importance_sampling_ratio/mean": 1.001887321472168, + "sampling/importance_sampling_ratio/min": 0.6436159610748291, + "sampling/sampling_logp_difference/max": 0.44065308570861816, + "sampling/sampling_logp_difference/mean": 0.013763444498181343, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 195.0, + "completions/max_terminated_length": 195.0, + "completions/mean_length": 38.84375, + "completions/mean_terminated_length": 38.84375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.26760339736938477, + "epoch": 0.6743362831858407, + "frac_reward_zero_std": 0.5, + "grad_norm": 30.52928589488721, + "kl": 0.09609369188547134, + "learning_rate": 9.859223311402936e-07, + "loss": -0.4911, + "num_tokens": 6062402.0, + "reward": -0.03125, + "reward_std": 0.42516323924064636, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.7370104789733887, + "sampling/importance_sampling_ratio/mean": 1.0004743337631226, + "sampling/importance_sampling_ratio/min": 0.7212648987770081, + "sampling/sampling_logp_difference/max": 0.5521655082702637, + "sampling/sampling_logp_difference/mean": 0.014504313468933105, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 116.0, + "completions/max_terminated_length": 116.0, + "completions/mean_length": 22.125, + "completions/mean_terminated_length": 22.125, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.1711660772562027, + "epoch": 0.6761061946902654, + "frac_reward_zero_std": 0.75, + "grad_norm": 24.017188586007887, + "kl": 0.3548486530780792, + "learning_rate": 9.85739787531634e-07, + "loss": -0.3051, + "num_tokens": 6074586.0, + "reward": -0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": -0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.6289836168289185, + "sampling/importance_sampling_ratio/mean": 1.0003910064697266, + "sampling/importance_sampling_ratio/min": 0.5489668846130371, + "sampling/sampling_logp_difference/max": 0.5997171401977539, + "sampling/sampling_logp_difference/mean": 0.013700541108846664, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1171.0, + "completions/max_terminated_length": 1171.0, + "completions/mean_length": 59.65625, + "completions/mean_terminated_length": 59.65625, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.17006395757198334, + "epoch": 0.6778761061946903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06476926527288644, + "kl": 0.22285808622837067, + "learning_rate": 9.85556085141775e-07, + "loss": 0.0008, + "num_tokens": 6088740.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5189270973205566, + "sampling/importance_sampling_ratio/mean": 0.9982098340988159, + "sampling/importance_sampling_ratio/min": 0.7894731760025024, + "sampling/sampling_logp_difference/max": 0.41800427436828613, + "sampling/sampling_logp_difference/mean": 0.009094467386603355, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 112.0, + "completions/max_terminated_length": 112.0, + "completions/mean_length": 20.15625, + "completions/mean_terminated_length": 20.15625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.15394248068332672, + "epoch": 0.679646017699115, + "frac_reward_zero_std": 0.75, + "grad_norm": 21.141641074846657, + "kl": 0.3241913914680481, + "learning_rate": 9.853712244089572e-07, + "loss": 0.2209, + "num_tokens": 6101086.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.425696611404419, + "sampling/importance_sampling_ratio/mean": 1.0021278858184814, + "sampling/importance_sampling_ratio/min": 0.6257392168045044, + "sampling/sampling_logp_difference/max": 0.46882152557373047, + "sampling/sampling_logp_difference/mean": 0.010466893203556538, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.1875, + "completions/mean_terminated_length": 17.1875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.06128706410527229, + "epoch": 0.6814159292035398, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4302172309995353, + "kl": 0.2754594683647156, + "learning_rate": 9.851852057741844e-07, + "loss": 0.0027, + "num_tokens": 6114986.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.1785469055175781, + "sampling/importance_sampling_ratio/mean": 0.998342752456665, + "sampling/importance_sampling_ratio/min": 0.8395922780036926, + "sampling/sampling_logp_difference/max": 0.17483890056610107, + "sampling/sampling_logp_difference/mean": 0.0034622247330844402, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.09375, + "completions/mean_terminated_length": 17.09375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.07266160845756531, + "epoch": 0.6831858407079646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.32445920449965693, + "kl": 0.22553634643554688, + "learning_rate": 9.849980296812231e-07, + "loss": 0.0022, + "num_tokens": 6126784.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3831980228424072, + "sampling/importance_sampling_ratio/mean": 0.9980471134185791, + "sampling/importance_sampling_ratio/min": 0.7396202683448792, + "sampling/sampling_logp_difference/max": 0.3243982791900635, + "sampling/sampling_logp_difference/mean": 0.01246151514351368, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1225.0, + "completions/max_terminated_length": 1225.0, + "completions/mean_length": 50.0625, + "completions/mean_terminated_length": 50.0625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.36946654319763184, + "epoch": 0.6849557522123894, + "frac_reward_zero_std": 0.75, + "grad_norm": 10.48257294738574, + "kl": 0.0802491158246994, + "learning_rate": 9.848096965766002e-07, + "loss": -0.1169, + "num_tokens": 6141316.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.3859869241714478, + "sampling/importance_sampling_ratio/mean": 0.9985344409942627, + "sampling/importance_sampling_ratio/min": 0.561350405216217, + "sampling/sampling_logp_difference/max": 0.5774099826812744, + "sampling/sampling_logp_difference/mean": 0.017032448202371597, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 61.625, + "completions/mean_terminated_length": 61.625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.19366797804832458, + "epoch": 0.6867256637168142, + "frac_reward_zero_std": 0.5, + "grad_norm": 10.91991659893057, + "kl": 0.08208681643009186, + "learning_rate": 9.846202069096038e-07, + "loss": -0.2283, + "num_tokens": 6157004.0, + "reward": 0.40625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.6599446535110474, + "sampling/importance_sampling_ratio/mean": 1.0001928806304932, + "sampling/importance_sampling_ratio/min": 0.2644866704940796, + "sampling/sampling_logp_difference/max": 1.3299643993377686, + "sampling/sampling_logp_difference/mean": 0.010109896771609783, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1380.0, + "completions/max_terminated_length": 1380.0, + "completions/mean_length": 45.34375, + "completions/mean_terminated_length": 45.34375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.24957646429538727, + "epoch": 0.6884955752212389, + "frac_reward_zero_std": 0.75, + "grad_norm": 7.634748076840416, + "kl": 0.20094981789588928, + "learning_rate": 9.844295611322803e-07, + "loss": -0.1671, + "num_tokens": 6174466.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.3965059518814087, + "sampling/importance_sampling_ratio/mean": 0.9998543858528137, + "sampling/importance_sampling_ratio/min": 0.7887519001960754, + "sampling/sampling_logp_difference/max": 0.33397340774536133, + "sampling/sampling_logp_difference/mean": 0.0095017459243536, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1056.0, + "completions/max_terminated_length": 1056.0, + "completions/mean_length": 112.140625, + "completions/mean_terminated_length": 112.140625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2890416085720062, + "epoch": 0.6902654867256637, + "frac_reward_zero_std": 0.25, + "grad_norm": 7.721867954809243, + "kl": 0.016003964468836784, + "learning_rate": 9.842377596994344e-07, + "loss": 0.9181, + "num_tokens": 6194203.0, + "reward": 0.875, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.59376060962677, + "sampling/importance_sampling_ratio/mean": 1.0002254247665405, + "sampling/importance_sampling_ratio/min": 0.6421988010406494, + "sampling/sampling_logp_difference/max": 0.4660964012145996, + "sampling/sampling_logp_difference/mean": 0.010162144899368286, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 61.90625, + "completions/mean_terminated_length": 61.90625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.39551448822021484, + "epoch": 0.6920353982300885, + "frac_reward_zero_std": 0.75, + "grad_norm": 9.681739496403345, + "kl": 0.016538335010409355, + "learning_rate": 9.84044803068628e-07, + "loss": -0.1438, + "num_tokens": 6208901.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.596825122833252, + "sampling/importance_sampling_ratio/mean": 1.0015642642974854, + "sampling/importance_sampling_ratio/min": 0.6489729285240173, + "sampling/sampling_logp_difference/max": 0.4680173397064209, + "sampling/sampling_logp_difference/mean": 0.018785130232572556, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 60.109375, + "completions/mean_terminated_length": 60.109375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3214569389820099, + "epoch": 0.6938053097345133, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.057294202411554675, + "kl": 0.011079341173171997, + "learning_rate": 9.838506917001784e-07, + "loss": 0.0001, + "num_tokens": 6223164.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.2777345180511475, + "sampling/importance_sampling_ratio/mean": 1.0013971328735352, + "sampling/importance_sampling_ratio/min": 0.6055299639701843, + "sampling/sampling_logp_difference/max": 0.5016512870788574, + "sampling/sampling_logp_difference/mean": 0.013057949021458626, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 712.0, + "completions/max_terminated_length": 712.0, + "completions/mean_length": 140.515625, + "completions/mean_terminated_length": 140.515625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4342554211616516, + "epoch": 0.695575221238938, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.046959634621339, + "kl": 0.013974161818623543, + "learning_rate": 9.836554260571577e-07, + "loss": -0.0494, + "num_tokens": 6242509.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.4464900493621826, + "sampling/importance_sampling_ratio/mean": 1.0002140998840332, + "sampling/importance_sampling_ratio/min": 0.4558376967906952, + "sampling/sampling_logp_difference/max": 0.7856185436248779, + "sampling/sampling_logp_difference/mean": 0.016138717532157898, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1088.0, + "completions/max_terminated_length": 1088.0, + "completions/mean_length": 133.3125, + "completions/mean_terminated_length": 133.3125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4057089686393738, + "epoch": 0.6973451327433628, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.5408018723691908, + "kl": 0.008893290534615517, + "learning_rate": 9.834590066053917e-07, + "loss": 0.4581, + "num_tokens": 6262081.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6015470027923584, + "sampling/importance_sampling_ratio/mean": 0.9998269081115723, + "sampling/importance_sampling_ratio/min": 0.540658175945282, + "sampling/sampling_logp_difference/max": 0.6149680614471436, + "sampling/sampling_logp_difference/mean": 0.013680309988558292, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 88.953125, + "completions/mean_terminated_length": 88.953125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.446719229221344, + "epoch": 0.6991150442477876, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0329129135848926, + "kl": 0.007592624984681606, + "learning_rate": 9.832614338134595e-07, + "loss": 0.0001, + "num_tokens": 6276606.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4540531635284424, + "sampling/importance_sampling_ratio/mean": 1.0001907348632812, + "sampling/importance_sampling_ratio/min": 0.6069669127464294, + "sampling/sampling_logp_difference/max": 0.4992809295654297, + "sampling/sampling_logp_difference/mean": 0.01567765697836876, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 203.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 108.09375, + "completions/mean_terminated_length": 108.09375, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "entropy": 0.41068732738494873, + "epoch": 0.7008849557522124, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03007958844071075, + "kl": 0.007306972984224558, + "learning_rate": 9.8306270815269e-07, + "loss": 0.0001, + "num_tokens": 6293428.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7739129066467285, + "sampling/importance_sampling_ratio/mean": 1.0002840757369995, + "sampling/importance_sampling_ratio/min": 0.5545526742935181, + "sampling/sampling_logp_difference/max": 0.5895934104919434, + "sampling/sampling_logp_difference/mean": 0.016644949093461037, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 96.875, + "completions/mean_terminated_length": 96.875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.4401080906391144, + "epoch": 0.7026548672566372, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.4586871026462416, + "kl": 0.010626133531332016, + "learning_rate": 9.828628300971638e-07, + "loss": -0.0859, + "num_tokens": 6310460.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5749874114990234, + "sampling/importance_sampling_ratio/mean": 1.0006617307662964, + "sampling/importance_sampling_ratio/min": 0.5700552463531494, + "sampling/sampling_logp_difference/max": 0.5620219707489014, + "sampling/sampling_logp_difference/mean": 0.018485158681869507, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 179.0, + "completions/max_terminated_length": 179.0, + "completions/mean_length": 48.0625, + "completions/mean_terminated_length": 48.0625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3010576069355011, + "epoch": 0.7044247787610619, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047274504860197054, + "kl": 0.009327048435807228, + "learning_rate": 9.826618001237099e-07, + "loss": 0.0001, + "num_tokens": 6324384.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6009161472320557, + "sampling/importance_sampling_ratio/mean": 0.9997097253799438, + "sampling/importance_sampling_ratio/min": 0.7120568752288818, + "sampling/sampling_logp_difference/max": 0.47057604789733887, + "sampling/sampling_logp_difference/mean": 0.011248650029301643, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 65.53125, + "completions/mean_terminated_length": 65.53125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3446110188961029, + "epoch": 0.7061946902654868, + "frac_reward_zero_std": 0.75, + "grad_norm": 13.730077407054516, + "kl": 0.015162178315222263, + "learning_rate": 9.82459618711906e-07, + "loss": 0.275, + "num_tokens": 6339490.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998742341995239, + "sampling/importance_sampling_ratio/min": 0.5365955829620361, + "sampling/sampling_logp_difference/max": 0.877489447593689, + "sampling/sampling_logp_difference/mean": 0.014317413792014122, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 61.59375, + "completions/mean_terminated_length": 61.59375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.3405458927154541, + "epoch": 0.7079646017699115, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11617255689871578, + "kl": 0.023211687803268433, + "learning_rate": 9.822562863440755e-07, + "loss": 0.0002, + "num_tokens": 6357672.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3631829023361206, + "sampling/importance_sampling_ratio/mean": 1.0004823207855225, + "sampling/importance_sampling_ratio/min": 0.6919498443603516, + "sampling/sampling_logp_difference/max": 0.3682417869567871, + "sampling/sampling_logp_difference/mean": 0.01271877158433199, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 91.984375, + "completions/mean_terminated_length": 91.984375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3686937987804413, + "epoch": 0.7097345132743362, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05523325223140905, + "kl": 0.01296952087432146, + "learning_rate": 9.820518035052889e-07, + "loss": 0.0001, + "num_tokens": 6374407.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4117615222930908, + "sampling/importance_sampling_ratio/mean": 1.0009119510650635, + "sampling/importance_sampling_ratio/min": 0.6341397166252136, + "sampling/sampling_logp_difference/max": 0.4554860591888428, + "sampling/sampling_logp_difference/mean": 0.013343364000320435, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 171.0, + "completions/max_terminated_length": 171.0, + "completions/mean_length": 69.921875, + "completions/mean_terminated_length": 69.921875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.32802262902259827, + "epoch": 0.7115044247787611, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08481747708423726, + "kl": 0.017437882721424103, + "learning_rate": 9.818461706833602e-07, + "loss": 0.0001, + "num_tokens": 6388386.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3902863264083862, + "sampling/importance_sampling_ratio/mean": 0.9997358322143555, + "sampling/importance_sampling_ratio/min": 0.6693275570869446, + "sampling/sampling_logp_difference/max": 0.4014817476272583, + "sampling/sampling_logp_difference/mean": 0.013436712324619293, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 71.171875, + "completions/mean_terminated_length": 71.171875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3199332654476166, + "epoch": 0.7132743362831858, + "frac_reward_zero_std": 0.75, + "grad_norm": 11.113427158524635, + "kl": 0.01687893643975258, + "learning_rate": 9.816393883688475e-07, + "loss": -0.3956, + "num_tokens": 6403565.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.577026605606079, + "sampling/importance_sampling_ratio/mean": 0.9993441700935364, + "sampling/importance_sampling_ratio/min": 0.5030588507652283, + "sampling/sampling_logp_difference/max": 0.6870481967926025, + "sampling/sampling_logp_difference/mean": 0.013590632006525993, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 221.0, + "completions/max_terminated_length": 221.0, + "completions/mean_length": 82.15625, + "completions/mean_terminated_length": 82.15625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.34127122163772583, + "epoch": 0.7150442477876107, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.6826531843832764, + "kl": 0.020201334729790688, + "learning_rate": 9.814314570550505e-07, + "loss": -0.0153, + "num_tokens": 6424359.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.437758445739746, + "sampling/importance_sampling_ratio/mean": 0.9995735883712769, + "sampling/importance_sampling_ratio/min": 0.4791787564754486, + "sampling/sampling_logp_difference/max": 0.7356815338134766, + "sampling/sampling_logp_difference/mean": 0.013350830413401127, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 234.0, + "completions/max_terminated_length": 234.0, + "completions/mean_length": 59.859375, + "completions/mean_terminated_length": 59.859375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.3270696699619293, + "epoch": 0.7168141592920354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06489359238192514, + "kl": 0.022430576384067535, + "learning_rate": 9.812223772380105e-07, + "loss": 0.0002, + "num_tokens": 6443438.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5900388956069946, + "sampling/importance_sampling_ratio/mean": 0.9994983077049255, + "sampling/importance_sampling_ratio/min": 0.645928144454956, + "sampling/sampling_logp_difference/max": 0.4637584686279297, + "sampling/sampling_logp_difference/mean": 0.014710272662341595, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 82.078125, + "completions/mean_terminated_length": 82.078125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.41037648916244507, + "epoch": 0.7185840707964601, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04683548653083853, + "kl": 0.02197330817580223, + "learning_rate": 9.810121494165087e-07, + "loss": 0.0001, + "num_tokens": 6459123.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4226871728897095, + "sampling/importance_sampling_ratio/mean": 0.9984689354896545, + "sampling/importance_sampling_ratio/min": 0.6845190525054932, + "sampling/sampling_logp_difference/max": 0.37903881072998047, + "sampling/sampling_logp_difference/mean": 0.018058765679597855, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 155.0, + "completions/max_terminated_length": 155.0, + "completions/mean_length": 53.25, + "completions/mean_terminated_length": 53.25, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.29711028933525085, + "epoch": 0.720353982300885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07698501154185808, + "kl": 0.024243097752332687, + "learning_rate": 9.808007740920645e-07, + "loss": 0.0001, + "num_tokens": 6472723.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.2674616575241089, + "sampling/importance_sampling_ratio/mean": 1.0002200603485107, + "sampling/importance_sampling_ratio/min": 0.5505961775779724, + "sampling/sampling_logp_difference/max": 0.5967535972595215, + "sampling/sampling_logp_difference/mean": 0.012171144597232342, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 121.03125, + "completions/mean_terminated_length": 121.03125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4948740005493164, + "epoch": 0.7221238938053097, + "frac_reward_zero_std": 0.5, + "grad_norm": 7.680780315466812, + "kl": 0.015606466680765152, + "learning_rate": 9.80588251768935e-07, + "loss": 0.1649, + "num_tokens": 6491157.0, + "reward": 0.6875, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.3771916627883911, + "sampling/importance_sampling_ratio/mean": 0.999821126461029, + "sampling/importance_sampling_ratio/min": 0.7080368399620056, + "sampling/sampling_logp_difference/max": 0.3452591896057129, + "sampling/sampling_logp_difference/mean": 0.017087643966078758, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 67.546875, + "completions/mean_terminated_length": 67.546875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2259364277124405, + "epoch": 0.7238938053097345, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.059875820545522006, + "kl": 0.023707356303930283, + "learning_rate": 9.803745829541137e-07, + "loss": 0.0001, + "num_tokens": 6507720.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2575604915618896, + "sampling/importance_sampling_ratio/mean": 0.998282790184021, + "sampling/importance_sampling_ratio/min": 0.3100183606147766, + "sampling/sampling_logp_difference/max": 1.171123743057251, + "sampling/sampling_logp_difference/mean": 0.011251486837863922, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 111.15625, + "completions/mean_terminated_length": 111.15625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.4915921688079834, + "epoch": 0.7256637168141593, + "frac_reward_zero_std": 0.75, + "grad_norm": 9.162438830181475, + "kl": 0.011929306201636791, + "learning_rate": 9.801597681573289e-07, + "loss": -0.306, + "num_tokens": 6524306.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002862215042114, + "sampling/importance_sampling_ratio/min": 0.619353711605072, + "sampling/sampling_logp_difference/max": 0.8039919137954712, + "sampling/sampling_logp_difference/mean": 0.019170641899108887, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 94.578125, + "completions/mean_terminated_length": 94.578125, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.373543918132782, + "epoch": 0.727433628318584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05289658409649595, + "kl": 0.017881829291582108, + "learning_rate": 9.799438078910432e-07, + "loss": 0.0001, + "num_tokens": 6539895.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3026437759399414, + "sampling/importance_sampling_ratio/mean": 1.0004127025604248, + "sampling/importance_sampling_ratio/min": 0.6380023956298828, + "sampling/sampling_logp_difference/max": 0.4494132995605469, + "sampling/sampling_logp_difference/mean": 0.014169419184327126, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 94.734375, + "completions/mean_terminated_length": 94.734375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.25864890217781067, + "epoch": 0.7292035398230089, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03554165373600353, + "kl": 0.011999612674117088, + "learning_rate": 9.797267026704514e-07, + "loss": 0.0001, + "num_tokens": 6558230.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3425296545028687, + "sampling/importance_sampling_ratio/mean": 0.9993278980255127, + "sampling/importance_sampling_ratio/min": 0.6613734364509583, + "sampling/sampling_logp_difference/max": 0.4134366512298584, + "sampling/sampling_logp_difference/mean": 0.011325197294354439, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 96.0625, + "completions/mean_terminated_length": 96.0625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.38927680253982544, + "epoch": 0.7309734513274336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03206544035807514, + "kl": 0.009823622182011604, + "learning_rate": 9.7950845301348e-07, + "loss": 0.0001, + "num_tokens": 6577162.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4312158823013306, + "sampling/importance_sampling_ratio/mean": 1.0001397132873535, + "sampling/importance_sampling_ratio/min": 0.41807880997657776, + "sampling/sampling_logp_difference/max": 0.8720853328704834, + "sampling/sampling_logp_difference/mean": 0.015506758354604244, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 95.359375, + "completions/mean_terminated_length": 95.359375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2924111783504486, + "epoch": 0.7327433628318584, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.3881119270730435, + "kl": 0.015888947993516922, + "learning_rate": 9.792890594407855e-07, + "loss": -0.0539, + "num_tokens": 6601089.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.449302315711975, + "sampling/importance_sampling_ratio/mean": 1.0000194311141968, + "sampling/importance_sampling_ratio/min": 0.5437875986099243, + "sampling/sampling_logp_difference/max": 0.6091965436935425, + "sampling/sampling_logp_difference/mean": 0.011562496423721313, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 96.40625, + "completions/mean_terminated_length": 96.40625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.29087594151496887, + "epoch": 0.7345132743362832, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032274670059917146, + "kl": 0.00947969127446413, + "learning_rate": 9.790685224757532e-07, + "loss": 0.0001, + "num_tokens": 6623995.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4449760913848877, + "sampling/importance_sampling_ratio/mean": 0.9994337558746338, + "sampling/importance_sampling_ratio/min": 0.5143414735794067, + "sampling/sampling_logp_difference/max": 0.6648678779602051, + "sampling/sampling_logp_difference/mean": 0.016016721725463867, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 65.4375, + "completions/mean_terminated_length": 65.4375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2695866525173187, + "epoch": 0.736283185840708, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10092692024078374, + "kl": 0.025945590808987617, + "learning_rate": 9.788468426444967e-07, + "loss": 0.0001, + "num_tokens": 6640055.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.255564570426941, + "sampling/importance_sampling_ratio/mean": 1.000176191329956, + "sampling/importance_sampling_ratio/min": 0.4881514608860016, + "sampling/sampling_logp_difference/max": 0.7171295881271362, + "sampling/sampling_logp_difference/mean": 0.01213115081191063, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 224.0, + "completions/max_terminated_length": 224.0, + "completions/mean_length": 59.21875, + "completions/mean_terminated_length": 59.21875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.27273327112197876, + "epoch": 0.7380530973451327, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.046918830851836506, + "kl": 0.011121374554932117, + "learning_rate": 9.786240204758552e-07, + "loss": 0.0001, + "num_tokens": 6655957.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3171130418777466, + "sampling/importance_sampling_ratio/mean": 0.9994080662727356, + "sampling/importance_sampling_ratio/min": 0.5929533839225769, + "sampling/sampling_logp_difference/max": 0.5226395130157471, + "sampling/sampling_logp_difference/mean": 0.012166490778326988, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 722.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 119.375, + "completions/mean_terminated_length": 119.375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4039311110973358, + "epoch": 0.7398230088495575, + "frac_reward_zero_std": 0.5, + "grad_norm": 4.238070768225619, + "kl": 0.03022146411240101, + "learning_rate": 9.784000565013933e-07, + "loss": -0.0596, + "num_tokens": 6675757.0, + "reward": 0.15625, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.4661486148834229, + "sampling/importance_sampling_ratio/mean": 1.0014400482177734, + "sampling/importance_sampling_ratio/min": 0.40373674035072327, + "sampling/sampling_logp_difference/max": 0.9069922566413879, + "sampling/sampling_logp_difference/mean": 0.016016965731978416, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 91.609375, + "completions/mean_terminated_length": 91.609375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2584455907344818, + "epoch": 0.7415929203539823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038183061521502, + "kl": 0.012106215581297874, + "learning_rate": 9.781749512553998e-07, + "loss": 0.0001, + "num_tokens": 6694612.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4827467203140259, + "sampling/importance_sampling_ratio/mean": 0.9982657432556152, + "sampling/importance_sampling_ratio/min": 0.6861007213592529, + "sampling/sampling_logp_difference/max": 0.393896222114563, + "sampling/sampling_logp_difference/mean": 0.015866706147789955, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 91.4375, + "completions/mean_terminated_length": 91.4375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.4179188907146454, + "epoch": 0.7433628318584071, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02882059358884815, + "kl": 0.006676926743239164, + "learning_rate": 9.779487052748863e-07, + "loss": 0.0001, + "num_tokens": 6709984.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3979723453521729, + "sampling/importance_sampling_ratio/mean": 0.9994974732398987, + "sampling/importance_sampling_ratio/min": 0.6180513501167297, + "sampling/sampling_logp_difference/max": 0.4811837673187256, + "sampling/sampling_logp_difference/mean": 0.01743045449256897, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 191.0, + "completions/max_terminated_length": 191.0, + "completions/mean_length": 92.171875, + "completions/mean_terminated_length": 92.171875, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "entropy": 0.46604567766189575, + "epoch": 0.7451327433628319, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029793712984509103, + "kl": 0.008214936591684818, + "learning_rate": 9.777213190995847e-07, + "loss": 0.0001, + "num_tokens": 6725323.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.729860782623291, + "sampling/importance_sampling_ratio/mean": 1.0001142024993896, + "sampling/importance_sampling_ratio/min": 0.6954634189605713, + "sampling/sampling_logp_difference/max": 0.5480408668518066, + "sampling/sampling_logp_difference/mean": 0.017748814076185226, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 135.203125, + "completions/mean_terminated_length": 135.203125, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "entropy": 0.5472180843353271, + "epoch": 0.7469026548672566, + "frac_reward_zero_std": 0.25, + "grad_norm": 3.990843197760746, + "kl": 0.013447487726807594, + "learning_rate": 9.774927932719482e-07, + "loss": 0.0167, + "num_tokens": 6743560.0, + "reward": -0.03125, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002206563949585, + "sampling/importance_sampling_ratio/min": 0.6289763450622559, + "sampling/sampling_logp_difference/max": 1.2453601360321045, + "sampling/sampling_logp_difference/mean": 0.018746210262179375, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.0, + "completions/max_terminated_length": 624.0, + "completions/mean_length": 137.109375, + "completions/mean_terminated_length": 137.109375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.5125194191932678, + "epoch": 0.7486725663716814, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.2868626079139616, + "kl": 0.010398737154901028, + "learning_rate": 9.77263128337148e-07, + "loss": -0.098, + "num_tokens": 6761343.0, + "reward": -0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.3858596086502075, + "sampling/importance_sampling_ratio/mean": 1.0002697706222534, + "sampling/importance_sampling_ratio/min": 0.5133931636810303, + "sampling/sampling_logp_difference/max": 0.6667132377624512, + "sampling/sampling_logp_difference/mean": 0.017657287418842316, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 719.0, + "completions/max_terminated_length": 719.0, + "completions/mean_length": 116.3125, + "completions/mean_terminated_length": 116.3125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2789403796195984, + "epoch": 0.7504424778761062, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.833428871020576, + "kl": 0.010706520639359951, + "learning_rate": 9.770323248430727e-07, + "loss": -0.2037, + "num_tokens": 6778371.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.3047248125076294, + "sampling/importance_sampling_ratio/mean": 0.9989539980888367, + "sampling/importance_sampling_ratio/min": 0.6344852447509766, + "sampling/sampling_logp_difference/max": 0.4549412727355957, + "sampling/sampling_logp_difference/mean": 0.011665822938084602, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 112.546875, + "completions/mean_terminated_length": 112.546875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.528927206993103, + "epoch": 0.7522123893805309, + "frac_reward_zero_std": 0.75, + "grad_norm": 7.167222849854241, + "kl": 0.0181064922362566, + "learning_rate": 9.768003833403276e-07, + "loss": -0.0977, + "num_tokens": 6796278.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.610059142112732, + "sampling/importance_sampling_ratio/mean": 0.9996190667152405, + "sampling/importance_sampling_ratio/min": 0.6272081136703491, + "sampling/sampling_logp_difference/max": 0.4762709140777588, + "sampling/sampling_logp_difference/mean": 0.01853884942829609, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 47.734375, + "completions/mean_terminated_length": 47.734375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3304079473018646, + "epoch": 0.7539823008849558, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05113450168112818, + "kl": 0.01000053621828556, + "learning_rate": 9.765673043822324e-07, + "loss": 0.0001, + "num_tokens": 6813573.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999748468399048, + "sampling/importance_sampling_ratio/min": 0.7175012230873108, + "sampling/sampling_logp_difference/max": 0.739088773727417, + "sampling/sampling_logp_difference/mean": 0.014557043090462685, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 137.03125, + "completions/mean_terminated_length": 137.03125, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "entropy": 0.4420037567615509, + "epoch": 0.7557522123893805, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02800507514263248, + "kl": 0.009223876520991325, + "learning_rate": 9.763330885248204e-07, + "loss": 0.0001, + "num_tokens": 6831543.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4312554597854614, + "sampling/importance_sampling_ratio/mean": 0.9998476505279541, + "sampling/importance_sampling_ratio/min": 0.6370866894721985, + "sampling/sampling_logp_difference/max": 0.4508495330810547, + "sampling/sampling_logp_difference/mean": 0.015897046774625778, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 50.671875, + "completions/mean_terminated_length": 50.671875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1680561751127243, + "epoch": 0.7575221238938054, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04842198314396375, + "kl": 0.007218526676297188, + "learning_rate": 9.760977363268373e-07, + "loss": 0.0001, + "num_tokens": 6846066.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.2974542379379272, + "sampling/importance_sampling_ratio/mean": 1.0001842975616455, + "sampling/importance_sampling_ratio/min": 0.6183997988700867, + "sampling/sampling_logp_difference/max": 0.4806201457977295, + "sampling/sampling_logp_difference/mean": 0.011776283383369446, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 98.734375, + "completions/mean_terminated_length": 98.734375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.21390385925769806, + "epoch": 0.7592920353982301, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.798618793037985, + "kl": 0.010727117769420147, + "learning_rate": 9.758612483497394e-07, + "loss": 0.0569, + "num_tokens": 6863473.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.3427993059158325, + "sampling/importance_sampling_ratio/mean": 0.999305248260498, + "sampling/importance_sampling_ratio/min": 0.6970505714416504, + "sampling/sampling_logp_difference/max": 0.3608973026275635, + "sampling/sampling_logp_difference/mean": 0.010015236213803291, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 142.0, + "completions/max_terminated_length": 142.0, + "completions/mean_length": 34.109375, + "completions/mean_terminated_length": 34.109375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2121407836675644, + "epoch": 0.7610619469026548, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04642061517941747, + "kl": 0.008114354684948921, + "learning_rate": 9.756236251576924e-07, + "loss": 0.0001, + "num_tokens": 6878456.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.271593451499939, + "sampling/importance_sampling_ratio/mean": 1.000221848487854, + "sampling/importance_sampling_ratio/min": 0.6872886419296265, + "sampling/sampling_logp_difference/max": 0.3750009536743164, + "sampling/sampling_logp_difference/mean": 0.008882062509655952, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.0, + "completions/max_terminated_length": 647.0, + "completions/mean_length": 185.09375, + "completions/mean_terminated_length": 185.09375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.44268298149108887, + "epoch": 0.7628318584070797, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.191665702774492, + "kl": 0.04039476066827774, + "learning_rate": 9.753848673175707e-07, + "loss": -0.1335, + "num_tokens": 6899774.0, + "reward": 0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.6913667917251587, + "sampling/importance_sampling_ratio/mean": 0.9997708797454834, + "sampling/importance_sampling_ratio/min": 0.46763890981674194, + "sampling/sampling_logp_difference/max": 0.7600588798522949, + "sampling/sampling_logp_difference/mean": 0.01565675251185894, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 198.0, + "completions/max_terminated_length": 198.0, + "completions/mean_length": 43.234375, + "completions/mean_terminated_length": 43.234375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1949003040790558, + "epoch": 0.7646017699115044, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06551818397017477, + "kl": 0.008975997567176819, + "learning_rate": 9.751449753989546e-07, + "loss": 0.0001, + "num_tokens": 6915357.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4351768493652344, + "sampling/importance_sampling_ratio/mean": 1.0001879930496216, + "sampling/importance_sampling_ratio/min": 0.5978617668151855, + "sampling/sampling_logp_difference/max": 0.5143957138061523, + "sampling/sampling_logp_difference/mean": 0.011728991754353046, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.0, + "completions/max_terminated_length": 601.0, + "completions/mean_length": 109.34375, + "completions/mean_terminated_length": 109.34375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3204788267612457, + "epoch": 0.7663716814159292, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.4514060945140144, + "kl": 0.016035258769989014, + "learning_rate": 9.74903949974131e-07, + "loss": 0.174, + "num_tokens": 6933891.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.3704556226730347, + "sampling/importance_sampling_ratio/mean": 1.000302791595459, + "sampling/importance_sampling_ratio/min": 0.370818167924881, + "sampling/sampling_logp_difference/max": 0.9920434951782227, + "sampling/sampling_logp_difference/mean": 0.012358264066278934, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 122.0, + "completions/max_terminated_length": 122.0, + "completions/mean_length": 51.78125, + "completions/mean_terminated_length": 51.78125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2988964915275574, + "epoch": 0.768141592920354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0646780144526037, + "kl": 0.008315995335578918, + "learning_rate": 9.746617916180905e-07, + "loss": 0.0001, + "num_tokens": 6948405.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2864738702774048, + "sampling/importance_sampling_ratio/mean": 0.9995741844177246, + "sampling/importance_sampling_ratio/min": 0.682759165763855, + "sampling/sampling_logp_difference/max": 0.3816131353378296, + "sampling/sampling_logp_difference/mean": 0.010760385543107986, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 149.9375, + "completions/mean_terminated_length": 149.9375, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "entropy": 0.46958667039871216, + "epoch": 0.7699115044247787, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.2892048962797666, + "kl": 0.014349989593029022, + "learning_rate": 9.744185009085256e-07, + "loss": 0.058, + "num_tokens": 6967809.0, + "reward": 0.5, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.470845341682434, + "sampling/importance_sampling_ratio/mean": 1.0003101825714111, + "sampling/importance_sampling_ratio/min": 0.6346219778060913, + "sampling/sampling_logp_difference/max": 0.4547257423400879, + "sampling/sampling_logp_difference/mean": 0.01665390096604824, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 66.078125, + "completions/mean_terminated_length": 66.078125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1759306788444519, + "epoch": 0.7716814159292036, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.144032453870604, + "kl": 0.005398609209805727, + "learning_rate": 9.741740784258311e-07, + "loss": 0.189, + "num_tokens": 6984070.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.762777328491211, + "sampling/importance_sampling_ratio/mean": 0.998619794845581, + "sampling/importance_sampling_ratio/min": 0.5538696050643921, + "sampling/sampling_logp_difference/max": 0.5908260345458984, + "sampling/sampling_logp_difference/mean": 0.01123537216335535, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 123.640625, + "completions/mean_terminated_length": 123.640625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.5412547588348389, + "epoch": 0.7734513274336283, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.8359115425008254, + "kl": 0.013453925028443336, + "learning_rate": 9.739285247531017e-07, + "loss": 0.0471, + "num_tokens": 7002815.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000159502029419, + "sampling/importance_sampling_ratio/min": 0.44841375946998596, + "sampling/sampling_logp_difference/max": 1.0421335697174072, + "sampling/sampling_logp_difference/mean": 0.019044142216444016, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 108.671875, + "completions/mean_terminated_length": 108.671875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4487525224685669, + "epoch": 0.7752212389380531, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03290813086570887, + "kl": 0.011676624417304993, + "learning_rate": 9.736818404761302e-07, + "loss": 0.0001, + "num_tokens": 7020698.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5356125831604004, + "sampling/importance_sampling_ratio/mean": 0.9999417662620544, + "sampling/importance_sampling_ratio/min": 0.6842086911201477, + "sampling/sampling_logp_difference/max": 0.42892932891845703, + "sampling/sampling_logp_difference/mean": 0.018000802025198936, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 142.734375, + "completions/mean_terminated_length": 142.734375, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "entropy": 0.558229386806488, + "epoch": 0.7769911504424779, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.561573745299718, + "kl": 0.01746143400669098, + "learning_rate": 9.734340261834066e-07, + "loss": -0.0123, + "num_tokens": 7040329.0, + "reward": -0.09375, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": -0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.3913923501968384, + "sampling/importance_sampling_ratio/mean": 1.0002951622009277, + "sampling/importance_sampling_ratio/min": 0.2626253366470337, + "sampling/sampling_logp_difference/max": 1.337026834487915, + "sampling/sampling_logp_difference/mean": 0.018860086798667908, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 84.484375, + "completions/mean_terminated_length": 84.484375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.43578213453292847, + "epoch": 0.7787610619469026, + "frac_reward_zero_std": 0.5, + "grad_norm": 8.487202414597359, + "kl": 0.011191412806510925, + "learning_rate": 9.73185082466117e-07, + "loss": -0.1363, + "num_tokens": 7054808.0, + "reward": -0.1875, + "reward_std": 0.3811737596988678, + "rewards/decision_reward_func/mean": -0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.597430944442749, + "sampling/importance_sampling_ratio/mean": 1.0004340410232544, + "sampling/importance_sampling_ratio/min": 0.5151932239532471, + "sampling/sampling_logp_difference/max": 0.6632132530212402, + "sampling/sampling_logp_difference/mean": 0.015007372945547104, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 180.0, + "completions/max_terminated_length": 180.0, + "completions/mean_length": 68.921875, + "completions/mean_terminated_length": 68.921875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.22991178929805756, + "epoch": 0.7805309734513274, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.0598608952177795, + "kl": 0.021865490823984146, + "learning_rate": 9.729350099181419e-07, + "loss": -0.0892, + "num_tokens": 7070659.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0008248090744019, + "sampling/importance_sampling_ratio/min": 0.4950145483016968, + "sampling/sampling_logp_difference/max": 0.9080419540405273, + "sampling/sampling_logp_difference/mean": 0.013804643414914608, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 105.8125, + "completions/mean_terminated_length": 105.8125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.49717026948928833, + "epoch": 0.7823008849557522, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04886234557938152, + "kl": 0.01464010775089264, + "learning_rate": 9.726838091360545e-07, + "loss": 0.0001, + "num_tokens": 7087271.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.9791940450668335, + "sampling/importance_sampling_ratio/mean": 0.9998542666435242, + "sampling/importance_sampling_ratio/min": 0.69510418176651, + "sampling/sampling_logp_difference/max": 0.6826896667480469, + "sampling/sampling_logp_difference/mean": 0.017835965380072594, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 173.484375, + "completions/mean_terminated_length": 173.484375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.5604503154754639, + "epoch": 0.784070796460177, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.387832890411488, + "kl": 0.013209588825702667, + "learning_rate": 9.724314807191196e-07, + "loss": 0.1046, + "num_tokens": 7108870.0, + "reward": -0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.5783559083938599, + "sampling/importance_sampling_ratio/mean": 0.9999769926071167, + "sampling/importance_sampling_ratio/min": 0.5818358659744263, + "sampling/sampling_logp_difference/max": 0.5415668487548828, + "sampling/sampling_logp_difference/mean": 0.018716178834438324, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 101.421875, + "completions/mean_terminated_length": 101.421875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.42693156003952026, + "epoch": 0.7858407079646018, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.142223704463168, + "kl": 0.01894482970237732, + "learning_rate": 9.721780252692917e-07, + "loss": -0.0856, + "num_tokens": 7125889.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.4481490850448608, + "sampling/importance_sampling_ratio/mean": 1.0003788471221924, + "sampling/importance_sampling_ratio/min": 0.7009619474411011, + "sampling/sampling_logp_difference/max": 0.370286226272583, + "sampling/sampling_logp_difference/mean": 0.01637919247150421, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 69.28125, + "completions/mean_terminated_length": 69.28125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.376160204410553, + "epoch": 0.7876106194690266, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.813332613168112, + "kl": 0.01751999743282795, + "learning_rate": 9.719234433912146e-07, + "loss": 0.1039, + "num_tokens": 7140835.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.3959933519363403, + "sampling/importance_sampling_ratio/mean": 1.0019288063049316, + "sampling/importance_sampling_ratio/min": 0.7030871510505676, + "sampling/sampling_logp_difference/max": 0.35227441787719727, + "sampling/sampling_logp_difference/mean": 0.01676108129322529, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 104.9375, + "completions/mean_terminated_length": 104.9375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3870048522949219, + "epoch": 0.7893805309734513, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0487475996446023, + "kl": 0.016693545505404472, + "learning_rate": 9.716677356922192e-07, + "loss": 0.0001, + "num_tokens": 7156863.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3446028232574463, + "sampling/importance_sampling_ratio/mean": 0.999738335609436, + "sampling/importance_sampling_ratio/min": 0.6871829628944397, + "sampling/sampling_logp_difference/max": 0.3751547336578369, + "sampling/sampling_logp_difference/mean": 0.01365131326019764, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 103.28125, + "completions/mean_terminated_length": 103.28125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.19822871685028076, + "epoch": 0.7911504424778761, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04623919445889071, + "kl": 0.016442574560642242, + "learning_rate": 9.714109027823216e-07, + "loss": 0.0001, + "num_tokens": 7174289.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3470251560211182, + "sampling/importance_sampling_ratio/mean": 0.9989169836044312, + "sampling/importance_sampling_ratio/min": 0.6370871663093567, + "sampling/sampling_logp_difference/max": 0.4508488178253174, + "sampling/sampling_logp_difference/mean": 0.007454845122992992, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 80.140625, + "completions/mean_terminated_length": 80.140625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2729114592075348, + "epoch": 0.7929203539823009, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.5695436591573864, + "kl": 0.021001402288675308, + "learning_rate": 9.711529452742229e-07, + "loss": 0.0173, + "num_tokens": 7190314.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5101569890975952, + "sampling/importance_sampling_ratio/mean": 0.9994341135025024, + "sampling/importance_sampling_ratio/min": 0.6054678559303284, + "sampling/sampling_logp_difference/max": 0.5017538070678711, + "sampling/sampling_logp_difference/mean": 0.010070646181702614, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 197.0, + "completions/max_terminated_length": 197.0, + "completions/mean_length": 91.5625, + "completions/mean_terminated_length": 91.5625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.36996370553970337, + "epoch": 0.7946902654867256, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053565450688631694, + "kl": 0.03582077473402023, + "learning_rate": 9.708938637833064e-07, + "loss": 0.0002, + "num_tokens": 7208782.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4157929420471191, + "sampling/importance_sampling_ratio/mean": 0.9996511936187744, + "sampling/importance_sampling_ratio/min": 0.5518012642860413, + "sampling/sampling_logp_difference/max": 0.5945672988891602, + "sampling/sampling_logp_difference/mean": 0.015002947300672531, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 101.875, + "completions/mean_terminated_length": 101.875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.5070844888687134, + "epoch": 0.7964601769911505, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.6667280847715173, + "kl": 0.0295796487480402, + "learning_rate": 9.706336589276374e-07, + "loss": -0.0625, + "num_tokens": 7226870.0, + "reward": -0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.3591194152832031, + "sampling/importance_sampling_ratio/mean": 1.0007846355438232, + "sampling/importance_sampling_ratio/min": 0.6882472634315491, + "sampling/sampling_logp_difference/max": 0.37360715866088867, + "sampling/sampling_logp_difference/mean": 0.016192808747291565, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 77.359375, + "completions/mean_terminated_length": 77.359375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.27710431814193726, + "epoch": 0.7982300884955752, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.356774751619394, + "kl": 0.03306623920798302, + "learning_rate": 9.703723313279605e-07, + "loss": 0.1667, + "num_tokens": 7243117.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5509188175201416, + "sampling/importance_sampling_ratio/mean": 1.0007847547531128, + "sampling/importance_sampling_ratio/min": 0.6969705820083618, + "sampling/sampling_logp_difference/max": 0.43884754180908203, + "sampling/sampling_logp_difference/mean": 0.012798595242202282, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 60.84375, + "completions/mean_terminated_length": 60.84375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3076930642127991, + "epoch": 0.8, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09004043696286056, + "kl": 0.025182174518704414, + "learning_rate": 9.701098816076995e-07, + "loss": 0.0002, + "num_tokens": 7259971.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.2837449312210083, + "sampling/importance_sampling_ratio/mean": 1.0000991821289062, + "sampling/importance_sampling_ratio/min": 0.4642953872680664, + "sampling/sampling_logp_difference/max": 0.7672343254089355, + "sampling/sampling_logp_difference/mean": 0.01526334322988987, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 70.25, + "completions/mean_terminated_length": 70.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3211529552936554, + "epoch": 0.8017699115044248, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08582681747073204, + "kl": 0.03589726239442825, + "learning_rate": 9.698463103929541e-07, + "loss": 0.0002, + "num_tokens": 7275139.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8302972316741943, + "sampling/importance_sampling_ratio/mean": 1.0005204677581787, + "sampling/importance_sampling_ratio/min": 0.6988446116447449, + "sampling/sampling_logp_difference/max": 0.6044783592224121, + "sampling/sampling_logp_difference/mean": 0.011284945532679558, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 213.0, + "completions/max_terminated_length": 213.0, + "completions/mean_length": 38.78125, + "completions/mean_terminated_length": 38.78125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.19094860553741455, + "epoch": 0.8035398230088495, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11639956999007596, + "kl": 0.0388774499297142, + "learning_rate": 9.695816183125003e-07, + "loss": 0.0003, + "num_tokens": 7288709.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.43803071975708, + "sampling/importance_sampling_ratio/mean": 0.9972746968269348, + "sampling/importance_sampling_ratio/min": 0.6347212195396423, + "sampling/sampling_logp_difference/max": 0.45456933975219727, + "sampling/sampling_logp_difference/mean": 0.011178547516465187, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 104.15625, + "completions/mean_terminated_length": 104.15625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.427107036113739, + "epoch": 0.8053097345132744, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06527203937308146, + "kl": 0.026374639943242073, + "learning_rate": 9.693158059977877e-07, + "loss": 0.0002, + "num_tokens": 7306815.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5272523164749146, + "sampling/importance_sampling_ratio/mean": 0.9991812705993652, + "sampling/importance_sampling_ratio/min": 0.5485890507698059, + "sampling/sampling_logp_difference/max": 0.6004055738449097, + "sampling/sampling_logp_difference/mean": 0.016266994178295135, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 98.703125, + "completions/mean_terminated_length": 98.703125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.31570273637771606, + "epoch": 0.8070796460176991, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07527148416420061, + "kl": 0.029133901000022888, + "learning_rate": 9.690488740829383e-07, + "loss": 0.0002, + "num_tokens": 7324908.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3732151985168457, + "sampling/importance_sampling_ratio/mean": 1.0015473365783691, + "sampling/importance_sampling_ratio/min": 0.6976590752601624, + "sampling/sampling_logp_difference/max": 0.36002469062805176, + "sampling/sampling_logp_difference/mean": 0.014895064756274223, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 105.59375, + "completions/mean_terminated_length": 105.59375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.38262939453125, + "epoch": 0.8088495575221238, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06269113840649056, + "kl": 0.021498549729585648, + "learning_rate": 9.68780823204745e-07, + "loss": 0.0002, + "num_tokens": 7341666.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5470013618469238, + "sampling/importance_sampling_ratio/mean": 0.9993369579315186, + "sampling/importance_sampling_ratio/min": 0.685339093208313, + "sampling/sampling_logp_difference/max": 0.43631839752197266, + "sampling/sampling_logp_difference/mean": 0.014647518284618855, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 101.40625, + "completions/mean_terminated_length": 101.40625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.29837697744369507, + "epoch": 0.8106194690265487, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06559806490152259, + "kl": 0.018434328958392143, + "learning_rate": 9.685116540026701e-07, + "loss": 0.0002, + "num_tokens": 7358172.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.436000108718872, + "sampling/importance_sampling_ratio/mean": 1.0009186267852783, + "sampling/importance_sampling_ratio/min": 0.6362788081169128, + "sampling/sampling_logp_difference/max": 0.4521183967590332, + "sampling/sampling_logp_difference/mean": 0.011897927150130272, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 792.0, + "completions/max_terminated_length": 792.0, + "completions/mean_length": 137.171875, + "completions/mean_terminated_length": 137.171875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.5502569675445557, + "epoch": 0.8123893805309734, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.735182949968868, + "kl": 0.01877054199576378, + "learning_rate": 9.682413671188444e-07, + "loss": 0.3218, + "num_tokens": 7379463.0, + "reward": -0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.593226432800293, + "sampling/importance_sampling_ratio/mean": 1.0002869367599487, + "sampling/importance_sampling_ratio/min": 0.36082524061203003, + "sampling/sampling_logp_difference/max": 1.0193616151809692, + "sampling/sampling_logp_difference/mean": 0.017945023253560066, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 179.0, + "completions/max_terminated_length": 179.0, + "completions/mean_length": 36.671875, + "completions/mean_terminated_length": 36.671875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.14537887275218964, + "epoch": 0.8141592920353983, + "frac_reward_zero_std": 0.75, + "grad_norm": 5.268007704313631, + "kl": 0.01473191287368536, + "learning_rate": 9.679699631980637e-07, + "loss": -0.0339, + "num_tokens": 7391026.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.2735956907272339, + "sampling/importance_sampling_ratio/mean": 0.9991209506988525, + "sampling/importance_sampling_ratio/min": 0.68517005443573, + "sampling/sampling_logp_difference/max": 0.37808823585510254, + "sampling/sampling_logp_difference/mean": 0.00664309598505497, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 59.921875, + "completions/mean_terminated_length": 59.921875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.257783442735672, + "epoch": 0.815929203539823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06853751772765107, + "kl": 0.012552602216601372, + "learning_rate": 9.6769744288779e-07, + "loss": 0.0002, + "num_tokens": 7404413.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5852875709533691, + "sampling/importance_sampling_ratio/mean": 0.9996552467346191, + "sampling/importance_sampling_ratio/min": 0.6521279215812683, + "sampling/sampling_logp_difference/max": 0.4607658386230469, + "sampling/sampling_logp_difference/mean": 0.01270623505115509, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 166.0, + "completions/max_terminated_length": 166.0, + "completions/mean_length": 16.140625, + "completions/mean_terminated_length": 16.140625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.18410514295101166, + "epoch": 0.8176991150442477, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17198806886123844, + "kl": 0.02371949329972267, + "learning_rate": 9.674238068381478e-07, + "loss": 0.0003, + "num_tokens": 7415350.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.637800693511963, + "sampling/importance_sampling_ratio/mean": 0.9997628927230835, + "sampling/importance_sampling_ratio/min": 0.42162972688674927, + "sampling/sampling_logp_difference/max": 0.8636277914047241, + "sampling/sampling_logp_difference/mean": 0.009763983078300953, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 110.875, + "completions/mean_terminated_length": 110.875, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.4821784496307373, + "epoch": 0.8194690265486726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07166451040935833, + "kl": 0.017150908708572388, + "learning_rate": 9.671490557019233e-07, + "loss": 0.0002, + "num_tokens": 7434286.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.764185905456543, + "sampling/importance_sampling_ratio/mean": 1.0012612342834473, + "sampling/importance_sampling_ratio/min": 0.6241204142570496, + "sampling/sampling_logp_difference/max": 0.5676894187927246, + "sampling/sampling_logp_difference/mean": 0.018654324114322662, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 125.703125, + "completions/mean_terminated_length": 125.703125, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 0.5399432182312012, + "epoch": 0.8212389380530973, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07115499287856507, + "kl": 0.020138971507549286, + "learning_rate": 9.668731901345632e-07, + "loss": 0.0002, + "num_tokens": 7451675.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3973848819732666, + "sampling/importance_sampling_ratio/mean": 1.000847578048706, + "sampling/importance_sampling_ratio/min": 0.6998101472854614, + "sampling/sampling_logp_difference/max": 0.3569462299346924, + "sampling/sampling_logp_difference/mean": 0.01800524815917015, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 215.0, + "completions/max_terminated_length": 215.0, + "completions/mean_length": 61.75, + "completions/mean_terminated_length": 61.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.46671062707901, + "epoch": 0.8230088495575221, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08836532913771333, + "kl": 0.02342853508889675, + "learning_rate": 9.665962107941724e-07, + "loss": 0.0003, + "num_tokens": 7468491.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6156078577041626, + "sampling/importance_sampling_ratio/mean": 1.0004723072052002, + "sampling/importance_sampling_ratio/min": 0.6833137273788452, + "sampling/sampling_logp_difference/max": 0.47971129417419434, + "sampling/sampling_logp_difference/mean": 0.017525987699627876, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 86.828125, + "completions/mean_terminated_length": 86.828125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4605414867401123, + "epoch": 0.8247787610619469, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06902940022709253, + "kl": 0.015131618827581406, + "learning_rate": 9.663181183415131e-07, + "loss": 0.0002, + "num_tokens": 7485584.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6584858894348145, + "sampling/importance_sampling_ratio/mean": 0.9997375011444092, + "sampling/importance_sampling_ratio/min": 0.5581821799278259, + "sampling/sampling_logp_difference/max": 0.5830698013305664, + "sampling/sampling_logp_difference/mean": 0.017058638855814934, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 77.765625, + "completions/mean_terminated_length": 77.765625, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "entropy": 0.45054739713668823, + "epoch": 0.8265486725663717, + "frac_reward_zero_std": 0.75, + "grad_norm": 11.732709836704123, + "kl": 0.028658758848905563, + "learning_rate": 9.660389134400033e-07, + "loss": 0.0982, + "num_tokens": 7503057.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000295639038086, + "sampling/importance_sampling_ratio/min": 0.6674122214317322, + "sampling/sampling_logp_difference/max": 1.2609937191009521, + "sampling/sampling_logp_difference/mean": 0.021381115540862083, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 109.78125, + "completions/mean_terminated_length": 109.78125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3166119456291199, + "epoch": 0.8283185840707965, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.17233483628358817, + "kl": 0.018055278807878494, + "learning_rate": 9.657585967557138e-07, + "loss": 0.0002, + "num_tokens": 7520851.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999561071395874, + "sampling/importance_sampling_ratio/min": 0.29186415672302246, + "sampling/sampling_logp_difference/max": 1.2314667701721191, + "sampling/sampling_logp_difference/mean": 0.012147579342126846, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 108.1875, + "completions/mean_terminated_length": 108.1875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.22439920902252197, + "epoch": 0.8300884955752212, + "frac_reward_zero_std": 0.5, + "grad_norm": 5.521774332190136, + "kl": 0.0083629060536623, + "learning_rate": 9.654771689573684e-07, + "loss": -0.1336, + "num_tokens": 7537055.0, + "reward": 0.8125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.7783924341201782, + "sampling/importance_sampling_ratio/mean": 1.00020170211792, + "sampling/importance_sampling_ratio/min": 0.6649647355079651, + "sampling/sampling_logp_difference/max": 0.5757098197937012, + "sampling/sampling_logp_difference/mean": 0.010277085937559605, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 178.0, + "completions/max_terminated_length": 178.0, + "completions/mean_length": 38.5, + "completions/mean_terminated_length": 38.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1761656403541565, + "epoch": 0.831858407079646, + "frac_reward_zero_std": 0.75, + "grad_norm": 5.292711007458864, + "kl": 0.0075834584422409534, + "learning_rate": 9.651946307163416e-07, + "loss": 0.0957, + "num_tokens": 7550127.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.3057875633239746, + "sampling/importance_sampling_ratio/mean": 1.001237154006958, + "sampling/importance_sampling_ratio/min": 0.5409621000289917, + "sampling/sampling_logp_difference/max": 0.6144061088562012, + "sampling/sampling_logp_difference/mean": 0.013811590149998665, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 96.671875, + "completions/mean_terminated_length": 96.671875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.5126727819442749, + "epoch": 0.8336283185840708, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1124198074532935, + "kl": 0.027021564543247223, + "learning_rate": 9.64910982706657e-07, + "loss": 0.0002, + "num_tokens": 7573178.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5440689325332642, + "sampling/importance_sampling_ratio/mean": 0.9985802173614502, + "sampling/importance_sampling_ratio/min": 0.6255274415016174, + "sampling/sampling_logp_difference/max": 0.4691600799560547, + "sampling/sampling_logp_difference/mean": 0.019156504422426224, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 174.0, + "completions/max_terminated_length": 174.0, + "completions/mean_length": 79.421875, + "completions/mean_terminated_length": 79.421875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3693407475948334, + "epoch": 0.8353982300884956, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06026925116566931, + "kl": 0.029721589758992195, + "learning_rate": 9.646262256049852e-07, + "loss": 0.0002, + "num_tokens": 7586709.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3058912754058838, + "sampling/importance_sampling_ratio/mean": 0.9994974136352539, + "sampling/importance_sampling_ratio/min": 0.5555564761161804, + "sampling/sampling_logp_difference/max": 0.587785005569458, + "sampling/sampling_logp_difference/mean": 0.01478197705000639, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 59.390625, + "completions/mean_terminated_length": 59.390625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.21782368421554565, + "epoch": 0.8371681415929203, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06599991621803777, + "kl": 0.012956816703081131, + "learning_rate": 9.643403600906432e-07, + "loss": 0.0002, + "num_tokens": 7600206.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5796705484390259, + "sampling/importance_sampling_ratio/mean": 0.9997380375862122, + "sampling/importance_sampling_ratio/min": 0.6913318037986755, + "sampling/sampling_logp_difference/max": 0.4572162628173828, + "sampling/sampling_logp_difference/mean": 0.013016052544116974, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 72.265625, + "completions/mean_terminated_length": 72.265625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.27213335037231445, + "epoch": 0.8389380530973451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06018047672571673, + "kl": 0.010599360801279545, + "learning_rate": 9.640533868455918e-07, + "loss": 0.0002, + "num_tokens": 7618191.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4041589498519897, + "sampling/importance_sampling_ratio/mean": 0.9997031092643738, + "sampling/importance_sampling_ratio/min": 0.6552351117134094, + "sampling/sampling_logp_difference/max": 0.4227612018585205, + "sampling/sampling_logp_difference/mean": 0.010823149234056473, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 68.75, + "completions/mean_terminated_length": 68.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.37680357694625854, + "epoch": 0.8407079646017699, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07623883161439017, + "kl": 0.020373038947582245, + "learning_rate": 9.637653065544349e-07, + "loss": 0.0002, + "num_tokens": 7634287.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6568858623504639, + "sampling/importance_sampling_ratio/mean": 0.9996323585510254, + "sampling/importance_sampling_ratio/min": 0.7048661112785339, + "sampling/sampling_logp_difference/max": 0.5049397945404053, + "sampling/sampling_logp_difference/mean": 0.015485817566514015, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 93.6875, + "completions/mean_terminated_length": 93.6875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4273343086242676, + "epoch": 0.8424778761061947, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08576728790420027, + "kl": 0.02765519730746746, + "learning_rate": 9.634761199044165e-07, + "loss": 0.0002, + "num_tokens": 7650235.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5675156116485596, + "sampling/importance_sampling_ratio/mean": 1.0012264251708984, + "sampling/importance_sampling_ratio/min": 0.28429707884788513, + "sampling/sampling_logp_difference/max": 1.2577356100082397, + "sampling/sampling_logp_difference/mean": 0.01669345051050186, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 57.390625, + "completions/mean_terminated_length": 57.390625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1557103544473648, + "epoch": 0.8442477876106195, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08774264129861918, + "kl": 0.01975005678832531, + "learning_rate": 9.63185827585421e-07, + "loss": 0.0002, + "num_tokens": 7666228.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.931877851486206, + "sampling/importance_sampling_ratio/mean": 0.9998209476470947, + "sampling/importance_sampling_ratio/min": 0.6257392168045044, + "sampling/sampling_logp_difference/max": 0.6584925651550293, + "sampling/sampling_logp_difference/mean": 0.010466136038303375, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 522.0, + "completions/max_terminated_length": 522.0, + "completions/mean_length": 75.609375, + "completions/mean_terminated_length": 75.609375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3157695531845093, + "epoch": 0.8460176991150442, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06225484644973683, + "kl": 0.022523541003465652, + "learning_rate": 9.628944302899695e-07, + "loss": 0.0002, + "num_tokens": 7682395.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5499114990234375, + "sampling/importance_sampling_ratio/mean": 1.0018589496612549, + "sampling/importance_sampling_ratio/min": 0.7029415965080261, + "sampling/sampling_logp_difference/max": 0.4381978511810303, + "sampling/sampling_logp_difference/mean": 0.015560553409159184, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 21.0, + "completions/max_terminated_length": 21.0, + "completions/mean_length": 17.0, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.06959155946969986, + "epoch": 0.8477876106194691, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.27239551020489455, + "kl": 0.013529862277209759, + "learning_rate": 9.6260192871322e-07, + "loss": 0.0001, + "num_tokens": 7695019.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7910892963409424, + "sampling/importance_sampling_ratio/mean": 1.000964879989624, + "sampling/importance_sampling_ratio/min": 0.7308072447776794, + "sampling/sampling_logp_difference/max": 0.5828239917755127, + "sampling/sampling_logp_difference/mean": 0.0050699952989816666, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 60.28125, + "completions/mean_terminated_length": 60.28125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2767450213432312, + "epoch": 0.8495575221238938, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05418953873731226, + "kl": 0.009159858338534832, + "learning_rate": 9.623083235529646e-07, + "loss": 0.0001, + "num_tokens": 7710733.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4479974508285522, + "sampling/importance_sampling_ratio/mean": 0.9996775388717651, + "sampling/importance_sampling_ratio/min": 0.6703086495399475, + "sampling/sampling_logp_difference/max": 0.4000170826911926, + "sampling/sampling_logp_difference/mean": 0.010580535978078842, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 67.609375, + "completions/mean_terminated_length": 67.609375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.26749616861343384, + "epoch": 0.8513274336283185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041986870827581534, + "kl": 0.007706194184720516, + "learning_rate": 9.620136155096275e-07, + "loss": 0.0001, + "num_tokens": 7724564.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5395063161849976, + "sampling/importance_sampling_ratio/mean": 0.9995875954627991, + "sampling/importance_sampling_ratio/min": 0.5570306777954102, + "sampling/sampling_logp_difference/max": 0.5851349830627441, + "sampling/sampling_logp_difference/mean": 0.0122038209810853, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 62.09375, + "completions/mean_terminated_length": 62.09375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.2679656147956848, + "epoch": 0.8530973451327434, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06334004054578615, + "kl": 0.013048840686678886, + "learning_rate": 9.617178052862649e-07, + "loss": 0.0001, + "num_tokens": 7738714.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4340221881866455, + "sampling/importance_sampling_ratio/mean": 1.0003671646118164, + "sampling/importance_sampling_ratio/min": 0.6112040877342224, + "sampling/sampling_logp_difference/max": 0.4923243522644043, + "sampling/sampling_logp_difference/mean": 0.012869300320744514, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 79.59375, + "completions/mean_terminated_length": 79.59375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3289504051208496, + "epoch": 0.8548672566371681, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04462738289464286, + "kl": 0.014411314390599728, + "learning_rate": 9.614208935885614e-07, + "loss": 0.0001, + "num_tokens": 7754512.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3118646144866943, + "sampling/importance_sampling_ratio/mean": 1.0010559558868408, + "sampling/importance_sampling_ratio/min": 0.6392340064048767, + "sampling/sampling_logp_difference/max": 0.44748473167419434, + "sampling/sampling_logp_difference/mean": 0.01603720150887966, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 78.796875, + "completions/mean_terminated_length": 78.796875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.28768643736839294, + "epoch": 0.856637168141593, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040511614927051826, + "kl": 0.007257409393787384, + "learning_rate": 9.6112288112483e-07, + "loss": 0.0001, + "num_tokens": 7771427.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4626340866088867, + "sampling/importance_sampling_ratio/mean": 0.9994089603424072, + "sampling/importance_sampling_ratio/min": 0.6803320646286011, + "sampling/sampling_logp_difference/max": 0.3851742744445801, + "sampling/sampling_logp_difference/mean": 0.010980907827615738, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 110.84375, + "completions/mean_terminated_length": 110.84375, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "entropy": 0.4556252956390381, + "epoch": 0.8584070796460177, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.054893842974183746, + "kl": 0.013409667648375034, + "learning_rate": 9.608237686060097e-07, + "loss": 0.0001, + "num_tokens": 7788601.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3639452457427979, + "sampling/importance_sampling_ratio/mean": 1.0000271797180176, + "sampling/importance_sampling_ratio/min": 0.561255931854248, + "sampling/sampling_logp_difference/max": 0.5775783061981201, + "sampling/sampling_logp_difference/mean": 0.0168587788939476, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 62.03125, + "completions/mean_terminated_length": 62.03125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3430402874946594, + "epoch": 0.8601769911504424, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05995714263974936, + "kl": 0.013722425326704979, + "learning_rate": 9.605235567456635e-07, + "loss": 0.0001, + "num_tokens": 7803531.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6505627632141113, + "sampling/importance_sampling_ratio/mean": 0.999412477016449, + "sampling/importance_sampling_ratio/min": 0.6296095848083496, + "sampling/sampling_logp_difference/max": 0.5011162757873535, + "sampling/sampling_logp_difference/mean": 0.014488639310002327, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 100.546875, + "completions/mean_terminated_length": 100.546875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3184402883052826, + "epoch": 0.8619469026548673, + "frac_reward_zero_std": 0.75, + "grad_norm": 6.789551644989066, + "kl": 0.014047389850020409, + "learning_rate": 9.602222462599766e-07, + "loss": 0.1634, + "num_tokens": 7819838.0, + "reward": -0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4881186485290527, + "sampling/importance_sampling_ratio/mean": 1.001170039176941, + "sampling/importance_sampling_ratio/min": 0.6323857307434082, + "sampling/sampling_logp_difference/max": 0.4582557678222656, + "sampling/sampling_logp_difference/mean": 0.013673178851604462, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 54.6875, + "completions/mean_terminated_length": 54.6875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.36572539806365967, + "epoch": 0.863716814159292, + "frac_reward_zero_std": 0.5, + "grad_norm": 37.659219580786726, + "kl": 0.07570485770702362, + "learning_rate": 9.599198378677558e-07, + "loss": 0.7396, + "num_tokens": 7837898.0, + "reward": 0.625, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0011978149414062, + "sampling/importance_sampling_ratio/min": 0.7209191918373108, + "sampling/sampling_logp_difference/max": 0.7518572807312012, + "sampling/sampling_logp_difference/mean": 0.013867147266864777, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.046875, + "completions/mean_terminated_length": 15.046875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.06888234615325928, + "epoch": 0.8654867256637168, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2142080578369344, + "kl": 0.024057459086179733, + "learning_rate": 9.596163322904269e-07, + "loss": 0.0002, + "num_tokens": 7851069.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3726508617401123, + "sampling/importance_sampling_ratio/mean": 1.0013298988342285, + "sampling/importance_sampling_ratio/min": 0.8065332174301147, + "sampling/sampling_logp_difference/max": 0.3167438507080078, + "sampling/sampling_logp_difference/mean": 0.006785606499761343, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 88.53125, + "completions/mean_terminated_length": 88.53125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.35719120502471924, + "epoch": 0.8672566371681416, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0314355509412585, + "kl": 0.011010359972715378, + "learning_rate": 9.593117302520328e-07, + "loss": 0.0001, + "num_tokens": 7866143.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.511204719543457, + "sampling/importance_sampling_ratio/mean": 0.9987002015113831, + "sampling/importance_sampling_ratio/min": 0.6565293073654175, + "sampling/sampling_logp_difference/max": 0.4207879304885864, + "sampling/sampling_logp_difference/mean": 0.014091964811086655, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 92.375, + "completions/mean_terminated_length": 92.375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4985180199146271, + "epoch": 0.8690265486725663, + "frac_reward_zero_std": 0.5, + "grad_norm": 7.644053057921895, + "kl": 0.018009070307016373, + "learning_rate": 9.590060324792325e-07, + "loss": 0.1329, + "num_tokens": 7881895.0, + "reward": 0.03125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.8456369638442993, + "sampling/importance_sampling_ratio/mean": 1.0008618831634521, + "sampling/importance_sampling_ratio/min": 0.6019213795661926, + "sampling/sampling_logp_difference/max": 0.6128244400024414, + "sampling/sampling_logp_difference/mean": 0.01921398565173149, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 211.0, + "completions/max_terminated_length": 211.0, + "completions/mean_length": 42.25, + "completions/mean_terminated_length": 42.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2719489336013794, + "epoch": 0.8707964601769912, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08532220988810806, + "kl": 0.010976046323776245, + "learning_rate": 9.58699239701299e-07, + "loss": 0.0001, + "num_tokens": 7898663.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.421367883682251, + "sampling/importance_sampling_ratio/mean": 1.0002838373184204, + "sampling/importance_sampling_ratio/min": 0.6368826031684875, + "sampling/sampling_logp_difference/max": 0.4511699676513672, + "sampling/sampling_logp_difference/mean": 0.012832563370466232, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 58.21875, + "completions/mean_terminated_length": 58.21875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1278567761182785, + "epoch": 0.8725663716814159, + "frac_reward_zero_std": 0.75, + "grad_norm": 8.595419109387096, + "kl": 0.01883358508348465, + "learning_rate": 9.58391352650117e-07, + "loss": -0.5106, + "num_tokens": 7911909.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.2526459693908691, + "sampling/importance_sampling_ratio/mean": 1.0006308555603027, + "sampling/importance_sampling_ratio/min": 0.4962599277496338, + "sampling/sampling_logp_difference/max": 0.700655460357666, + "sampling/sampling_logp_difference/mean": 0.010099800303578377, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 40.125, + "completions/mean_terminated_length": 40.125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.24204905331134796, + "epoch": 0.8743362831858407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06587748198923991, + "kl": 0.009902874939143658, + "learning_rate": 9.580823720601823e-07, + "loss": 0.0001, + "num_tokens": 7928429.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.445271372795105, + "sampling/importance_sampling_ratio/mean": 1.0001397132873535, + "sampling/importance_sampling_ratio/min": 0.7533186078071594, + "sampling/sampling_logp_difference/max": 0.36829710006713867, + "sampling/sampling_logp_difference/mean": 0.014522343873977661, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 56.25, + "completions/mean_terminated_length": 56.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2897334098815918, + "epoch": 0.8761061946902655, + "frac_reward_zero_std": 0.75, + "grad_norm": 7.1646682486535305, + "kl": 0.1686052680015564, + "learning_rate": 9.57772298668599e-07, + "loss": -0.1076, + "num_tokens": 7940925.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.336471676826477, + "sampling/importance_sampling_ratio/mean": 1.0002723932266235, + "sampling/importance_sampling_ratio/min": 0.7755419015884399, + "sampling/sampling_logp_difference/max": 0.29003310203552246, + "sampling/sampling_logp_difference/mean": 0.012305348180234432, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 178.0, + "completions/max_terminated_length": 178.0, + "completions/mean_length": 71.4375, + "completions/mean_terminated_length": 71.4375, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.4769209027290344, + "epoch": 0.8778761061946903, + "frac_reward_zero_std": 0.75, + "grad_norm": 10.18835702156589, + "kl": 0.060292646288871765, + "learning_rate": 9.57461133215079e-07, + "loss": -0.1092, + "num_tokens": 7956665.0, + "reward": -0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": -0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.618024468421936, + "sampling/importance_sampling_ratio/mean": 0.99906325340271, + "sampling/importance_sampling_ratio/min": 0.6003147959709167, + "sampling/sampling_logp_difference/max": 0.5103011131286621, + "sampling/sampling_logp_difference/mean": 0.017318300902843475, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 58.671875, + "completions/mean_terminated_length": 58.671875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2595871090888977, + "epoch": 0.879646017699115, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0416886480079275, + "kl": 0.0072275553829967976, + "learning_rate": 9.57148876441938e-07, + "loss": 0.0001, + "num_tokens": 7972820.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.45339035987854, + "sampling/importance_sampling_ratio/mean": 1.000535011291504, + "sampling/importance_sampling_ratio/min": 0.6839017868041992, + "sampling/sampling_logp_difference/max": 0.3799409866333008, + "sampling/sampling_logp_difference/mean": 0.010540075600147247, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 200.0, + "completions/max_terminated_length": 200.0, + "completions/mean_length": 39.84375, + "completions/mean_terminated_length": 39.84375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2132524847984314, + "epoch": 0.8814159292035398, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05927996815786505, + "kl": 0.005652535706758499, + "learning_rate": 9.568355290940966e-07, + "loss": 0.0001, + "num_tokens": 7987946.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.584210753440857, + "sampling/importance_sampling_ratio/mean": 1.0005481243133545, + "sampling/importance_sampling_ratio/min": 0.655351996421814, + "sampling/sampling_logp_difference/max": 0.4600863456726074, + "sampling/sampling_logp_difference/mean": 0.010919488966464996, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 84.90625, + "completions/mean_terminated_length": 84.90625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.296769917011261, + "epoch": 0.8831858407079646, + "frac_reward_zero_std": 0.75, + "grad_norm": 6.735124506983188, + "kl": 0.016414642333984375, + "learning_rate": 9.565210919190763e-07, + "loss": 0.015, + "num_tokens": 8003700.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.9363287687301636, + "sampling/importance_sampling_ratio/mean": 0.9987481236457825, + "sampling/importance_sampling_ratio/min": 0.7225105166435242, + "sampling/sampling_logp_difference/max": 0.6607937812805176, + "sampling/sampling_logp_difference/mean": 0.015974685549736023, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 178.0, + "completions/max_terminated_length": 178.0, + "completions/mean_length": 38.03125, + "completions/mean_terminated_length": 38.03125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.19154030084609985, + "epoch": 0.8849557522123894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08183335721489599, + "kl": 0.007817339152097702, + "learning_rate": 9.562055656669987e-07, + "loss": 0.0001, + "num_tokens": 8020886.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.598867416381836, + "sampling/importance_sampling_ratio/mean": 1.0001122951507568, + "sampling/importance_sampling_ratio/min": 0.5727236270904541, + "sampling/sampling_logp_difference/max": 0.5573520660400391, + "sampling/sampling_logp_difference/mean": 0.013924205675721169, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 97.796875, + "completions/mean_terminated_length": 97.796875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3268697261810303, + "epoch": 0.8867256637168142, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029052867098632782, + "kl": 0.007557909470051527, + "learning_rate": 9.558889510905835e-07, + "loss": 0.0001, + "num_tokens": 8039145.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.647871494293213, + "sampling/importance_sampling_ratio/mean": 1.000300407409668, + "sampling/importance_sampling_ratio/min": 0.6561062932014465, + "sampling/sampling_logp_difference/max": 0.49948447942733765, + "sampling/sampling_logp_difference/mean": 0.013082124292850494, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 94.796875, + "completions/mean_terminated_length": 94.796875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.43946853280067444, + "epoch": 0.8884955752212389, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04044782874748557, + "kl": 0.058621086180210114, + "learning_rate": 9.555712489451464e-07, + "loss": 0.0002, + "num_tokens": 8056556.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5867271423339844, + "sampling/importance_sampling_ratio/mean": 0.9995195269584656, + "sampling/importance_sampling_ratio/min": 0.6893123984336853, + "sampling/sampling_logp_difference/max": 0.4616734981536865, + "sampling/sampling_logp_difference/mean": 0.01584906503558159, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 105.546875, + "completions/mean_terminated_length": 105.546875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.5053263306617737, + "epoch": 0.8902654867256637, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.045677652748138, + "kl": 0.2734725773334503, + "learning_rate": 9.55252459988598e-07, + "loss": 0.0963, + "num_tokens": 8073807.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.57107675075531, + "sampling/importance_sampling_ratio/mean": 0.9994295239448547, + "sampling/importance_sampling_ratio/min": 0.5382452011108398, + "sampling/sampling_logp_difference/max": 0.619441032409668, + "sampling/sampling_logp_difference/mean": 0.018887978047132492, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 61.453125, + "completions/mean_terminated_length": 61.453125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.23520615696907043, + "epoch": 0.8920353982300885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044142626143724856, + "kl": 0.009435078129172325, + "learning_rate": 9.549325849814418e-07, + "loss": 0.0001, + "num_tokens": 8089500.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4041417837142944, + "sampling/importance_sampling_ratio/mean": 0.9998427629470825, + "sampling/importance_sampling_ratio/min": 0.6066602468490601, + "sampling/sampling_logp_difference/max": 0.499786376953125, + "sampling/sampling_logp_difference/mean": 0.013650336302816868, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 895.0, + "completions/max_terminated_length": 895.0, + "completions/mean_length": 128.921875, + "completions/mean_terminated_length": 128.921875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3584873080253601, + "epoch": 0.8938053097345132, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.164030357296124, + "kl": 0.006217114627361298, + "learning_rate": 9.546116246867713e-07, + "loss": 0.3262, + "num_tokens": 8108295.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.430129885673523, + "sampling/importance_sampling_ratio/mean": 1.0008875131607056, + "sampling/importance_sampling_ratio/min": 0.6261174082756042, + "sampling/sampling_logp_difference/max": 0.4682173728942871, + "sampling/sampling_logp_difference/mean": 0.013849091716110706, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 209.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 48.671875, + "completions/mean_terminated_length": 48.671875, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.19223593175411224, + "epoch": 0.8955752212389381, + "frac_reward_zero_std": 0.75, + "grad_norm": 9.298491366271188, + "kl": 0.012916343286633492, + "learning_rate": 9.542895798702701e-07, + "loss": -0.1124, + "num_tokens": 8121762.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.254530429840088, + "sampling/importance_sampling_ratio/mean": 1.0021955966949463, + "sampling/importance_sampling_ratio/min": 0.6728146076202393, + "sampling/sampling_logp_difference/max": 0.3962855339050293, + "sampling/sampling_logp_difference/mean": 0.012941524386405945, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 109.03125, + "completions/mean_terminated_length": 109.03125, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "entropy": 0.4453309178352356, + "epoch": 0.8973451327433628, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03895957066929234, + "kl": 0.01071190182119608, + "learning_rate": 9.539664513002084e-07, + "loss": 0.0001, + "num_tokens": 8138468.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3871991634368896, + "sampling/importance_sampling_ratio/mean": 1.0000834465026855, + "sampling/importance_sampling_ratio/min": 0.5638471245765686, + "sampling/sampling_logp_difference/max": 0.5729721784591675, + "sampling/sampling_logp_difference/mean": 0.016829367727041245, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.0, + "completions/max_terminated_length": 251.0, + "completions/mean_length": 83.03125, + "completions/mean_terminated_length": 83.03125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.34089598059654236, + "epoch": 0.8991150442477877, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.044027304289342664, + "kl": 0.009923753328621387, + "learning_rate": 9.536422397474418e-07, + "loss": 0.0001, + "num_tokens": 8153046.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4149080514907837, + "sampling/importance_sampling_ratio/mean": 1.0008606910705566, + "sampling/importance_sampling_ratio/min": 0.6992542147636414, + "sampling/sampling_logp_difference/max": 0.3577408790588379, + "sampling/sampling_logp_difference/mean": 0.013866678811609745, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 96.796875, + "completions/mean_terminated_length": 96.796875, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.3181552290916443, + "epoch": 0.9008849557522124, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035081746701379146, + "kl": 0.009072013199329376, + "learning_rate": 9.533169459854098e-07, + "loss": 0.0001, + "num_tokens": 8169865.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.546746850013733, + "sampling/importance_sampling_ratio/mean": 0.9996622204780579, + "sampling/importance_sampling_ratio/min": 0.7007312774658203, + "sampling/sampling_logp_difference/max": 0.4361538887023926, + "sampling/sampling_logp_difference/mean": 0.012515220791101456, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 134.0, + "completions/max_terminated_length": 134.0, + "completions/mean_length": 33.640625, + "completions/mean_terminated_length": 33.640625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.16498476266860962, + "epoch": 0.9026548672566371, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04012191510593879, + "kl": 0.005203623324632645, + "learning_rate": 9.529905707901333e-07, + "loss": 0.0001, + "num_tokens": 8182402.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2919236421585083, + "sampling/importance_sampling_ratio/mean": 0.9999414086341858, + "sampling/importance_sampling_ratio/min": 0.5896210074424744, + "sampling/sampling_logp_difference/max": 0.5282753705978394, + "sampling/sampling_logp_difference/mean": 0.008416219614446163, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 113.21875, + "completions/mean_terminated_length": 113.21875, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.3625057339668274, + "epoch": 0.904424778761062, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.568032198967511, + "kl": 0.006654072552919388, + "learning_rate": 9.526631149402134e-07, + "loss": 0.0482, + "num_tokens": 8201504.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.3120354413986206, + "sampling/importance_sampling_ratio/mean": 1.0005406141281128, + "sampling/importance_sampling_ratio/min": 0.5685505867004395, + "sampling/sampling_logp_difference/max": 0.5646649599075317, + "sampling/sampling_logp_difference/mean": 0.013463255017995834, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 114.828125, + "completions/mean_terminated_length": 114.828125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3808782994747162, + "epoch": 0.9061946902654867, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036424015039903675, + "kl": 0.009641697630286217, + "learning_rate": 9.523345792168288e-07, + "loss": 0.0001, + "num_tokens": 8218661.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5945333242416382, + "sampling/importance_sampling_ratio/mean": 0.9990466237068176, + "sampling/importance_sampling_ratio/min": 0.498073011636734, + "sampling/sampling_logp_difference/max": 0.6970086097717285, + "sampling/sampling_logp_difference/mean": 0.015809332951903343, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 89.890625, + "completions/mean_terminated_length": 89.890625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.36214619874954224, + "epoch": 0.9079646017699115, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02469736712140168, + "kl": 0.009469065815210342, + "learning_rate": 9.520049644037347e-07, + "loss": 0.0001, + "num_tokens": 8235598.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.2883288860321045, + "sampling/importance_sampling_ratio/mean": 1.0004262924194336, + "sampling/importance_sampling_ratio/min": 0.5304144620895386, + "sampling/sampling_logp_difference/max": 0.634096622467041, + "sampling/sampling_logp_difference/mean": 0.011682311072945595, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 239.0, + "completions/max_terminated_length": 239.0, + "completions/mean_length": 88.03125, + "completions/mean_terminated_length": 88.03125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.44736549258232117, + "epoch": 0.9097345132743363, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036895631890060225, + "kl": 0.016795285046100616, + "learning_rate": 9.516742712872605e-07, + "loss": 0.0002, + "num_tokens": 8259312.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.6561634540557861, + "sampling/importance_sampling_ratio/mean": 0.9994844198226929, + "sampling/importance_sampling_ratio/min": 0.5980254411697388, + "sampling/sampling_logp_difference/max": 0.5141220092773438, + "sampling/sampling_logp_difference/mean": 0.017143195495009422, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 140.765625, + "completions/mean_terminated_length": 140.765625, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "entropy": 0.46250495314598083, + "epoch": 0.911504424778761, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.5997216064046285, + "kl": 0.008927903138101101, + "learning_rate": 9.513425006563078e-07, + "loss": -0.1194, + "num_tokens": 8278049.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.4426480531692505, + "sampling/importance_sampling_ratio/mean": 1.0001449584960938, + "sampling/importance_sampling_ratio/min": 0.6405063271522522, + "sampling/sampling_logp_difference/max": 0.4454963207244873, + "sampling/sampling_logp_difference/mean": 0.015760906040668488, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 96.828125, + "completions/mean_terminated_length": 96.828125, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "entropy": 0.5690308809280396, + "epoch": 0.9132743362831859, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0331973440363682, + "kl": 0.009910114109516144, + "learning_rate": 9.51009653302349e-07, + "loss": 0.0001, + "num_tokens": 8296374.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6122362613677979, + "sampling/importance_sampling_ratio/mean": 0.9999167323112488, + "sampling/importance_sampling_ratio/min": 0.5489570498466492, + "sampling/sampling_logp_difference/max": 0.5997350215911865, + "sampling/sampling_logp_difference/mean": 0.019404888153076172, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 154.234375, + "completions/mean_terminated_length": 154.234375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.36337944865226746, + "epoch": 0.9150442477876106, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.664883462583671, + "kl": 0.014331409707665443, + "learning_rate": 9.506757300194248e-07, + "loss": -0.0603, + "num_tokens": 8318053.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5481709241867065, + "sampling/importance_sampling_ratio/mean": 0.9998996257781982, + "sampling/importance_sampling_ratio/min": 0.6269489526748657, + "sampling/sampling_logp_difference/max": 0.4668901562690735, + "sampling/sampling_logp_difference/mean": 0.012643365189433098, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 584.0, + "completions/max_terminated_length": 584.0, + "completions/mean_length": 139.34375, + "completions/mean_terminated_length": 139.34375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.43411684036254883, + "epoch": 0.9168141592920354, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.140124683083626, + "kl": 0.018226414918899536, + "learning_rate": 9.50340731604143e-07, + "loss": 0.2147, + "num_tokens": 8336539.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.62480890750885, + "sampling/importance_sampling_ratio/mean": 0.9991416931152344, + "sampling/importance_sampling_ratio/min": 0.236353799700737, + "sampling/sampling_logp_difference/max": 1.4424254894256592, + "sampling/sampling_logp_difference/mean": 0.015893306583166122, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 95.546875, + "completions/mean_terminated_length": 95.546875, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "entropy": 0.5186383128166199, + "epoch": 0.9185840707964602, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039527739559405944, + "kl": 0.011614596471190453, + "learning_rate": 9.500046588556761e-07, + "loss": 0.0001, + "num_tokens": 8352046.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9996851682662964, + "sampling/importance_sampling_ratio/min": 0.7078847289085388, + "sampling/sampling_logp_difference/max": 0.9716105461120605, + "sampling/sampling_logp_difference/mean": 0.017761271446943283, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 107.421875, + "completions/mean_terminated_length": 107.421875, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "entropy": 0.3980678915977478, + "epoch": 0.9203539823008849, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03050835034680889, + "kl": 0.011641588993370533, + "learning_rate": 9.496675125757594e-07, + "loss": 0.0001, + "num_tokens": 8368809.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.9738656282424927, + "sampling/importance_sampling_ratio/mean": 1.0001617670059204, + "sampling/importance_sampling_ratio/min": 0.6098880767822266, + "sampling/sampling_logp_difference/max": 0.6799938678741455, + "sampling/sampling_logp_difference/mean": 0.014947582967579365, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1060.0, + "completions/max_terminated_length": 1060.0, + "completions/mean_length": 140.09375, + "completions/mean_terminated_length": 140.09375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.42774447798728943, + "epoch": 0.9221238938053097, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.4898741180099533, + "kl": 0.00820097140967846, + "learning_rate": 9.493292935686894e-07, + "loss": 0.261, + "num_tokens": 8387039.0, + "reward": 0.0, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.515367865562439, + "sampling/importance_sampling_ratio/mean": 1.0005155801773071, + "sampling/importance_sampling_ratio/min": 0.6541255116462708, + "sampling/sampling_logp_difference/max": 0.4244561195373535, + "sampling/sampling_logp_difference/mean": 0.01558619737625122, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 118.859375, + "completions/mean_terminated_length": 118.859375, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "entropy": 0.499830037355423, + "epoch": 0.9238938053097345, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0253552589765691, + "kl": 0.009436726570129395, + "learning_rate": 9.489900026413216e-07, + "loss": 0.0001, + "num_tokens": 8404118.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5663814544677734, + "sampling/importance_sampling_ratio/mean": 1.0002120733261108, + "sampling/importance_sampling_ratio/min": 0.4994365870952606, + "sampling/sampling_logp_difference/max": 0.6942746639251709, + "sampling/sampling_logp_difference/mean": 0.018683871254324913, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 91.046875, + "completions/mean_terminated_length": 91.046875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4420931935310364, + "epoch": 0.9256637168141593, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032238059689454056, + "kl": 0.009348719380795956, + "learning_rate": 9.486496406030685e-07, + "loss": 0.0001, + "num_tokens": 8421017.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4921983480453491, + "sampling/importance_sampling_ratio/mean": 0.9986906051635742, + "sampling/importance_sampling_ratio/min": 0.7028223276138306, + "sampling/sampling_logp_difference/max": 0.4002504348754883, + "sampling/sampling_logp_difference/mean": 0.01746968738734722, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 42.96875, + "completions/mean_terminated_length": 42.96875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.21450337767601013, + "epoch": 0.9274336283185841, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06758275124603391, + "kl": 0.006579137407243252, + "learning_rate": 9.483082082658982e-07, + "loss": 0.0001, + "num_tokens": 8439863.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.434173345565796, + "sampling/importance_sampling_ratio/mean": 0.9990978240966797, + "sampling/importance_sampling_ratio/min": 0.5833903551101685, + "sampling/sampling_logp_difference/max": 0.5388987064361572, + "sampling/sampling_logp_difference/mean": 0.012227091938257217, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 122.5, + "completions/mean_terminated_length": 122.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.42718803882598877, + "epoch": 0.9292035398230089, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.389959562645312, + "kl": 0.010070500895380974, + "learning_rate": 9.479657064443321e-07, + "loss": -0.1146, + "num_tokens": 8459079.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.6004486083984375, + "sampling/importance_sampling_ratio/mean": 1.0002415180206299, + "sampling/importance_sampling_ratio/min": 0.6741685271263123, + "sampling/sampling_logp_difference/max": 0.47028398513793945, + "sampling/sampling_logp_difference/mean": 0.016869788989424706, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 95.09375, + "completions/mean_terminated_length": 95.09375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.5120389461517334, + "epoch": 0.9309734513274336, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.8066848398685473, + "kl": 0.02020437829196453, + "learning_rate": 9.476221359554423e-07, + "loss": 0.0886, + "num_tokens": 8477517.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.464606523513794, + "sampling/importance_sampling_ratio/mean": 0.9992693662643433, + "sampling/importance_sampling_ratio/min": 0.7041698694229126, + "sampling/sampling_logp_difference/max": 0.38158655166625977, + "sampling/sampling_logp_difference/mean": 0.017616450786590576, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 70.546875, + "completions/mean_terminated_length": 70.546875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.260747492313385, + "epoch": 0.9327433628318584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030913574652803144, + "kl": 0.006850100588053465, + "learning_rate": 9.472774976188513e-07, + "loss": 0.0001, + "num_tokens": 8493760.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6097594499588013, + "sampling/importance_sampling_ratio/mean": 1.001157522201538, + "sampling/importance_sampling_ratio/min": 0.7061507105827332, + "sampling/sampling_logp_difference/max": 0.47608476877212524, + "sampling/sampling_logp_difference/mean": 0.009453845210373402, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 159.0, + "completions/max_terminated_length": 159.0, + "completions/mean_length": 37.4375, + "completions/mean_terminated_length": 37.4375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.18946631252765656, + "epoch": 0.9345132743362832, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06875360822757838, + "kl": 0.009271876886487007, + "learning_rate": 9.469317922567286e-07, + "loss": 0.0002, + "num_tokens": 8512924.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5250033140182495, + "sampling/importance_sampling_ratio/mean": 1.0006039142608643, + "sampling/importance_sampling_ratio/min": 0.6538918614387512, + "sampling/sampling_logp_difference/max": 0.42481327056884766, + "sampling/sampling_logp_difference/mean": 0.010327961295843124, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 70.53125, + "completions/mean_terminated_length": 70.53125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.28202685713768005, + "epoch": 0.9362831858407079, + "frac_reward_zero_std": 0.75, + "grad_norm": 11.06847389682932, + "kl": 0.01626504212617874, + "learning_rate": 9.465850206937887e-07, + "loss": 0.1768, + "num_tokens": 8530238.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9993406534194946, + "sampling/importance_sampling_ratio/min": 0.6998425126075745, + "sampling/sampling_logp_difference/max": 0.7249191999435425, + "sampling/sampling_logp_difference/mean": 0.012953529134392738, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 62.25, + "completions/mean_terminated_length": 62.25, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2565949559211731, + "epoch": 0.9380530973451328, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03577417046315069, + "kl": 0.0072803376242518425, + "learning_rate": 9.462371837572906e-07, + "loss": 0.0001, + "num_tokens": 8545950.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.40841805934906, + "sampling/importance_sampling_ratio/mean": 1.0001753568649292, + "sampling/importance_sampling_ratio/min": 0.691235363483429, + "sampling/sampling_logp_difference/max": 0.3692748546600342, + "sampling/sampling_logp_difference/mean": 0.010468710213899612, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 193.0, + "completions/max_terminated_length": 193.0, + "completions/mean_length": 60.609375, + "completions/mean_terminated_length": 60.609375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.26018762588500977, + "epoch": 0.9398230088495575, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05116817286548551, + "kl": 0.010109255090355873, + "learning_rate": 9.45888282277034e-07, + "loss": 0.0001, + "num_tokens": 8566389.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2612920999526978, + "sampling/importance_sampling_ratio/mean": 1.0020697116851807, + "sampling/importance_sampling_ratio/min": 0.5850200653076172, + "sampling/sampling_logp_difference/max": 0.5361090898513794, + "sampling/sampling_logp_difference/mean": 0.013840936124324799, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 66.421875, + "completions/mean_terminated_length": 66.421875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2860739529132843, + "epoch": 0.9415929203539823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03885608048943523, + "kl": 0.008966992609202862, + "learning_rate": 9.455383170853585e-07, + "loss": 0.0001, + "num_tokens": 8582464.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9738166332244873, + "sampling/importance_sampling_ratio/mean": 0.9996252059936523, + "sampling/importance_sampling_ratio/min": 0.5545295476913452, + "sampling/sampling_logp_difference/max": 0.679969072341919, + "sampling/sampling_logp_difference/mean": 0.01339352410286665, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 108.578125, + "completions/mean_terminated_length": 108.578125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.35880762338638306, + "epoch": 0.9433628318584071, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05997824834894199, + "kl": 0.016652042046189308, + "learning_rate": 9.451872890171419e-07, + "loss": 0.0002, + "num_tokens": 8609493.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.435707449913025, + "sampling/importance_sampling_ratio/mean": 0.999294638633728, + "sampling/importance_sampling_ratio/min": 0.47342342138290405, + "sampling/sampling_logp_difference/max": 0.747765064239502, + "sampling/sampling_logp_difference/mean": 0.013728786259889603, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 88.546875, + "completions/mean_terminated_length": 88.546875, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "entropy": 0.45613330602645874, + "epoch": 0.9451327433628318, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09313465584163444, + "kl": 0.015598511323332787, + "learning_rate": 9.448351989097962e-07, + "loss": 0.0002, + "num_tokens": 8626280.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3734320402145386, + "sampling/importance_sampling_ratio/mean": 0.9996883869171143, + "sampling/importance_sampling_ratio/min": 0.5861338376998901, + "sampling/sampling_logp_difference/max": 0.5342071056365967, + "sampling/sampling_logp_difference/mean": 0.014164279215037823, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 70.375, + "completions/mean_terminated_length": 70.375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.32646340131759644, + "epoch": 0.9469026548672567, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03490572895714704, + "kl": 0.00837424024939537, + "learning_rate": 9.444820476032685e-07, + "loss": 0.0001, + "num_tokens": 8642800.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.33780038356781, + "sampling/importance_sampling_ratio/mean": 0.9992144107818604, + "sampling/importance_sampling_ratio/min": 0.6560214757919312, + "sampling/sampling_logp_difference/max": 0.42156171798706055, + "sampling/sampling_logp_difference/mean": 0.010708148591220379, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 111.046875, + "completions/mean_terminated_length": 111.046875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.5900201201438904, + "epoch": 0.9486725663716814, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0580239595566989, + "kl": 0.018893791362643242, + "learning_rate": 9.441278359400364e-07, + "loss": 0.0002, + "num_tokens": 8659587.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4816340208053589, + "sampling/importance_sampling_ratio/mean": 1.0002975463867188, + "sampling/importance_sampling_ratio/min": 0.6631470322608948, + "sampling/sampling_logp_difference/max": 0.41075849533081055, + "sampling/sampling_logp_difference/mean": 0.021431345492601395, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 220.0, + "completions/max_terminated_length": 220.0, + "completions/mean_length": 59.03125, + "completions/mean_terminated_length": 59.03125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2766770124435425, + "epoch": 0.9504424778761061, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08836968551118032, + "kl": 0.00837495643645525, + "learning_rate": 9.437725647651078e-07, + "loss": 0.0001, + "num_tokens": 8674293.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.759122371673584, + "sampling/importance_sampling_ratio/mean": 0.9989175200462341, + "sampling/importance_sampling_ratio/min": 0.5930877923965454, + "sampling/sampling_logp_difference/max": 0.5648150444030762, + "sampling/sampling_logp_difference/mean": 0.016760773956775665, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 190.0, + "completions/max_terminated_length": 190.0, + "completions/mean_length": 56.75, + "completions/mean_terminated_length": 56.75, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3292761445045471, + "epoch": 0.952212389380531, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03191293281770827, + "kl": 0.007732439786195755, + "learning_rate": 9.434162349260178e-07, + "loss": 0.0001, + "num_tokens": 8688741.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.281243920326233, + "sampling/importance_sampling_ratio/mean": 0.9997268915176392, + "sampling/importance_sampling_ratio/min": 0.7160525321960449, + "sampling/sampling_logp_difference/max": 0.3340017795562744, + "sampling/sampling_logp_difference/mean": 0.0122067891061306, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 76.296875, + "completions/mean_terminated_length": 76.296875, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "entropy": 0.35782527923583984, + "epoch": 0.9539823008849557, + "frac_reward_zero_std": 0.75, + "grad_norm": 10.676094231660771, + "kl": 0.010862261056900024, + "learning_rate": 9.430588472728269e-07, + "loss": 0.0578, + "num_tokens": 8704760.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0010135173797607, + "sampling/importance_sampling_ratio/min": 0.6073746085166931, + "sampling/sampling_logp_difference/max": 0.8597202301025391, + "sampling/sampling_logp_difference/mean": 0.013356505893170834, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 74.0, + "completions/mean_terminated_length": 74.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3447580337524414, + "epoch": 0.9557522123893806, + "frac_reward_zero_std": 0.75, + "grad_norm": 11.105585897945515, + "kl": 0.013816621154546738, + "learning_rate": 9.427004026581196e-07, + "loss": 0.1718, + "num_tokens": 8719640.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.652379035949707, + "sampling/importance_sampling_ratio/mean": 1.001023769378662, + "sampling/importance_sampling_ratio/min": 0.7009364366531372, + "sampling/sampling_logp_difference/max": 0.502216100692749, + "sampling/sampling_logp_difference/mean": 0.013380978256464005, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 123.703125, + "completions/mean_terminated_length": 123.703125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3127603530883789, + "epoch": 0.9575221238938053, + "frac_reward_zero_std": 0.75, + "grad_norm": 5.913535613821327, + "kl": 0.015127960592508316, + "learning_rate": 9.423409019370014e-07, + "loss": -0.202, + "num_tokens": 8738533.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.567948818206787, + "sampling/importance_sampling_ratio/mean": 0.9999661445617676, + "sampling/importance_sampling_ratio/min": 0.6265570521354675, + "sampling/sampling_logp_difference/max": 0.4675154685974121, + "sampling/sampling_logp_difference/mean": 0.012319693341851234, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 87.265625, + "completions/mean_terminated_length": 87.265625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4443648159503937, + "epoch": 0.95929203539823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08539988253972454, + "kl": 0.018398810178041458, + "learning_rate": 9.419803459670979e-07, + "loss": 0.0002, + "num_tokens": 8754758.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.574440360069275, + "sampling/importance_sampling_ratio/mean": 0.9996380805969238, + "sampling/importance_sampling_ratio/min": 0.5782991051673889, + "sampling/sampling_logp_difference/max": 0.5476641654968262, + "sampling/sampling_logp_difference/mean": 0.015303626656532288, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 105.0625, + "completions/mean_terminated_length": 105.0625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.5941020250320435, + "epoch": 0.9610619469026549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04467205979015622, + "kl": 0.015284831635653973, + "learning_rate": 9.416187356085512e-07, + "loss": 0.0002, + "num_tokens": 8770970.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3732892274856567, + "sampling/importance_sampling_ratio/mean": 1.0000840425491333, + "sampling/importance_sampling_ratio/min": 0.27647536993026733, + "sampling/sampling_logp_difference/max": 1.2856335639953613, + "sampling/sampling_logp_difference/mean": 0.020597338676452637, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 135.796875, + "completions/mean_terminated_length": 135.796875, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "entropy": 0.4649226665496826, + "epoch": 0.9628318584070796, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0357517830995541, + "kl": 0.014059186913073063, + "learning_rate": 9.412560717240195e-07, + "loss": 0.0001, + "num_tokens": 8789613.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002797842025757, + "sampling/importance_sampling_ratio/min": 0.5656505823135376, + "sampling/sampling_logp_difference/max": 0.6993000507354736, + "sampling/sampling_logp_difference/mean": 0.017499998211860657, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 232.0, + "completions/max_terminated_length": 232.0, + "completions/mean_length": 33.875, + "completions/mean_terminated_length": 33.875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1782006323337555, + "epoch": 0.9646017699115044, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09206582105872552, + "kl": 0.01672673039138317, + "learning_rate": 9.408923551786742e-07, + "loss": 0.0002, + "num_tokens": 8804885.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4249764680862427, + "sampling/importance_sampling_ratio/mean": 0.9994561672210693, + "sampling/importance_sampling_ratio/min": 0.7034332752227783, + "sampling/sampling_logp_difference/max": 0.3541553020477295, + "sampling/sampling_logp_difference/mean": 0.011499391868710518, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 41.890625, + "completions/mean_terminated_length": 41.890625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.1523999273777008, + "epoch": 0.9663716814159292, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037107556403945, + "kl": 0.0039587770588696, + "learning_rate": 9.405275868401974e-07, + "loss": 0.0001, + "num_tokens": 8821678.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4257996082305908, + "sampling/importance_sampling_ratio/mean": 0.999606728553772, + "sampling/importance_sampling_ratio/min": 0.725745677947998, + "sampling/sampling_logp_difference/max": 0.3547327518463135, + "sampling/sampling_logp_difference/mean": 0.005943705327808857, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 116.28125, + "completions/mean_terminated_length": 116.28125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4968239665031433, + "epoch": 0.968141592920354, + "frac_reward_zero_std": 0.75, + "grad_norm": 6.457847220461041, + "kl": 0.021092496812343597, + "learning_rate": 9.40161767578781e-07, + "loss": 0.1605, + "num_tokens": 8838672.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.600094199180603, + "sampling/importance_sampling_ratio/mean": 1.0008270740509033, + "sampling/importance_sampling_ratio/min": 0.26678600907325745, + "sampling/sampling_logp_difference/max": 1.3213083744049072, + "sampling/sampling_logp_difference/mean": 0.018169770017266273, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 54.5, + "completions/mean_terminated_length": 54.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2445429116487503, + "epoch": 0.9699115044247788, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07124112019982765, + "kl": 0.010958978906273842, + "learning_rate": 9.397948982671236e-07, + "loss": 0.0001, + "num_tokens": 8852992.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6126388311386108, + "sampling/importance_sampling_ratio/mean": 1.0011223554611206, + "sampling/importance_sampling_ratio/min": 0.7174001336097717, + "sampling/sampling_logp_difference/max": 0.4778718948364258, + "sampling/sampling_logp_difference/mean": 0.010054649785161018, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 82.65625, + "completions/mean_terminated_length": 82.65625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4400416612625122, + "epoch": 0.9716814159292035, + "frac_reward_zero_std": 0.75, + "grad_norm": 6.758155387149215, + "kl": 0.01893545500934124, + "learning_rate": 9.394269797804288e-07, + "loss": 0.1139, + "num_tokens": 8870970.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.4193644523620605, + "sampling/importance_sampling_ratio/mean": 0.999426007270813, + "sampling/importance_sampling_ratio/min": 0.4615151286125183, + "sampling/sampling_logp_difference/max": 0.7732404470443726, + "sampling/sampling_logp_difference/mean": 0.015387221239507198, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 92.0625, + "completions/mean_terminated_length": 92.0625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.4796140491962433, + "epoch": 0.9734513274336283, + "frac_reward_zero_std": 0.5, + "grad_norm": 9.630481719584537, + "kl": 0.016388213261961937, + "learning_rate": 9.390580129964035e-07, + "loss": 0.2398, + "num_tokens": 8886062.0, + "reward": 0.875, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.8056321144104004, + "sampling/importance_sampling_ratio/mean": 0.9994021654129028, + "sampling/importance_sampling_ratio/min": 0.6967775821685791, + "sampling/sampling_logp_difference/max": 0.5909106731414795, + "sampling/sampling_logp_difference/mean": 0.02025291696190834, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 135.0, + "completions/max_terminated_length": 135.0, + "completions/mean_length": 49.71875, + "completions/mean_terminated_length": 49.71875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.21508081257343292, + "epoch": 0.9752212389380531, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.066991866362209, + "kl": 0.013176451437175274, + "learning_rate": 9.386879987952549e-07, + "loss": 0.0001, + "num_tokens": 8899756.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4295161962509155, + "sampling/importance_sampling_ratio/mean": 1.0003056526184082, + "sampling/importance_sampling_ratio/min": 0.7088531851768494, + "sampling/sampling_logp_difference/max": 0.35733604431152344, + "sampling/sampling_logp_difference/mean": 0.009776454418897629, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 61.96875, + "completions/mean_terminated_length": 61.96875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.325038880109787, + "epoch": 0.9769911504424779, + "frac_reward_zero_std": 0.75, + "grad_norm": 13.860276512203823, + "kl": 0.04639158397912979, + "learning_rate": 9.383169380596892e-07, + "loss": 0.1206, + "num_tokens": 8916202.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.8170350790023804, + "sampling/importance_sampling_ratio/mean": 0.9982390403747559, + "sampling/importance_sampling_ratio/min": 0.6422437429428101, + "sampling/sampling_logp_difference/max": 0.5972061157226562, + "sampling/sampling_logp_difference/mean": 0.014898296445608139, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 76.328125, + "completions/mean_terminated_length": 76.328125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2448478639125824, + "epoch": 0.9787610619469026, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.10604575756165804, + "kl": 0.0240221805870533, + "learning_rate": 9.37944831674909e-07, + "loss": 0.0002, + "num_tokens": 8930895.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4379768371582031, + "sampling/importance_sampling_ratio/mean": 0.9995517134666443, + "sampling/importance_sampling_ratio/min": 0.6523293852806091, + "sampling/sampling_logp_difference/max": 0.4272056221961975, + "sampling/sampling_logp_difference/mean": 0.010927535593509674, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 175.0, + "completions/max_terminated_length": 175.0, + "completions/mean_length": 66.859375, + "completions/mean_terminated_length": 66.859375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.26345235109329224, + "epoch": 0.9805309734513274, + "frac_reward_zero_std": 0.75, + "grad_norm": 9.897098778736009, + "kl": 0.02797047048807144, + "learning_rate": 9.37571680528612e-07, + "loss": -0.0969, + "num_tokens": 8945126.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0015347003936768, + "sampling/importance_sampling_ratio/min": 0.5551202893257141, + "sampling/sampling_logp_difference/max": 0.7081301212310791, + "sampling/sampling_logp_difference/mean": 0.010650092735886574, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 65.53125, + "completions/mean_terminated_length": 65.53125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.2612670361995697, + "epoch": 0.9823008849557522, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09061518316154932, + "kl": 0.015878070145845413, + "learning_rate": 9.371974855109874e-07, + "loss": 0.0002, + "num_tokens": 8962232.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5268092155456543, + "sampling/importance_sampling_ratio/mean": 1.0007729530334473, + "sampling/importance_sampling_ratio/min": 0.6973515152931213, + "sampling/sampling_logp_difference/max": 0.42318010330200195, + "sampling/sampling_logp_difference/mean": 0.011717341840267181, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 17.5, + "completions/mean_terminated_length": 17.5, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.10745927691459656, + "epoch": 0.984070796460177, + "frac_reward_zero_std": 0.75, + "grad_norm": 23.680701551151667, + "kl": 0.20216985046863556, + "learning_rate": 9.368222475147153e-07, + "loss": -0.623, + "num_tokens": 8973160.0, + "reward": -0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.8384283781051636, + "sampling/importance_sampling_ratio/mean": 1.0007233619689941, + "sampling/importance_sampling_ratio/min": 0.7110517024993896, + "sampling/sampling_logp_difference/max": 0.6089110374450684, + "sampling/sampling_logp_difference/mean": 0.01067114993929863, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 17.015625, + "completions/mean_terminated_length": 17.015625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.06491994112730026, + "epoch": 0.9858407079646018, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2432899230676877, + "kl": 0.1308480203151703, + "learning_rate": 9.36445967434964e-07, + "loss": 0.0012, + "num_tokens": 8989801.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.1004294157028198, + "sampling/importance_sampling_ratio/mean": 1.0001790523529053, + "sampling/importance_sampling_ratio/min": 0.9071494936943054, + "sampling/sampling_logp_difference/max": 0.09744799137115479, + "sampling/sampling_logp_difference/mean": 0.0038687689229846, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 91.8125, + "completions/mean_terminated_length": 91.8125, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "entropy": 0.34381309151649475, + "epoch": 0.9876106194690265, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.125285207917033, + "kl": 0.02384905517101288, + "learning_rate": 9.360686461693872e-07, + "loss": 0.1224, + "num_tokens": 9005949.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4320323467254639, + "sampling/importance_sampling_ratio/mean": 1.0000149011611938, + "sampling/importance_sampling_ratio/min": 0.7160810232162476, + "sampling/sampling_logp_difference/max": 0.35909461975097656, + "sampling/sampling_logp_difference/mean": 0.01379399560391903, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 111.0, + "completions/max_terminated_length": 111.0, + "completions/mean_length": 23.640625, + "completions/mean_terminated_length": 23.640625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.25869446992874146, + "epoch": 0.9893805309734514, + "frac_reward_zero_std": 0.5, + "grad_norm": 14.470531190670112, + "kl": 0.21281038224697113, + "learning_rate": 9.356902846181228e-07, + "loss": 0.1498, + "num_tokens": 9019494.0, + "reward": 0.375, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.56514573097229, + "sampling/importance_sampling_ratio/mean": 1.0025410652160645, + "sampling/importance_sampling_ratio/min": 0.7757983803749084, + "sampling/sampling_logp_difference/max": 0.4479789733886719, + "sampling/sampling_logp_difference/mean": 0.012579815462231636, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 61.671875, + "completions/mean_terminated_length": 61.671875, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "entropy": 0.32114139199256897, + "epoch": 0.9911504424778761, + "frac_reward_zero_std": 0.75, + "grad_norm": 5.609035510367293, + "kl": 0.018139678984880447, + "learning_rate": 9.353108836837905e-07, + "loss": 0.093, + "num_tokens": 9036113.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6116080284118652, + "sampling/importance_sampling_ratio/mean": 1.0010817050933838, + "sampling/importance_sampling_ratio/min": 0.6868107914924622, + "sampling/sampling_logp_difference/max": 0.4772324562072754, + "sampling/sampling_logp_difference/mean": 0.012453297153115273, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 14.59375, + "completions/mean_terminated_length": 14.59375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.07570166885852814, + "epoch": 0.9929203539823008, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20701388280592103, + "kl": 0.19794254004955292, + "learning_rate": 9.349304442714895e-07, + "loss": 0.0019, + "num_tokens": 9050007.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2850689888000488, + "sampling/importance_sampling_ratio/mean": 0.99912029504776, + "sampling/importance_sampling_ratio/min": 0.8150334358215332, + "sampling/sampling_logp_difference/max": 0.2508124113082886, + "sampling/sampling_logp_difference/mean": 0.011712642386555672, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 52.640625, + "completions/mean_terminated_length": 52.640625, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.32283660769462585, + "epoch": 0.9946902654867257, + "frac_reward_zero_std": 0.75, + "grad_norm": 8.951665755188321, + "kl": 0.12831877171993256, + "learning_rate": 9.345489672887962e-07, + "loss": -0.1108, + "num_tokens": 9063488.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4442304372787476, + "sampling/importance_sampling_ratio/mean": 0.999579906463623, + "sampling/importance_sampling_ratio/min": 0.6357463002204895, + "sampling/sampling_logp_difference/max": 0.4529557228088379, + "sampling/sampling_logp_difference/mean": 0.013744027353823185, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 39.609375, + "completions/mean_terminated_length": 39.609375, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.23001480102539062, + "epoch": 0.9964601769911504, + "frac_reward_zero_std": 0.75, + "grad_norm": 21.980215852115183, + "kl": 0.34417223930358887, + "learning_rate": 9.341664536457625e-07, + "loss": -0.1346, + "num_tokens": 9075783.0, + "reward": -0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9994174838066101, + "sampling/importance_sampling_ratio/min": 0.685285210609436, + "sampling/sampling_logp_difference/max": 1.5750985145568848, + "sampling/sampling_logp_difference/mean": 0.015202976763248444, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 20.0, + "completions/max_terminated_length": 20.0, + "completions/mean_length": 15.828125, + "completions/mean_terminated_length": 15.828125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.08043765276670456, + "epoch": 0.9982300884955753, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11158896786330862, + "kl": 0.08205146342515945, + "learning_rate": 9.337829042549133e-07, + "loss": 0.0008, + "num_tokens": 9087580.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4533406496047974, + "sampling/importance_sampling_ratio/mean": 1.0018446445465088, + "sampling/importance_sampling_ratio/min": 0.6155814528465271, + "sampling/sampling_logp_difference/max": 0.48518800735473633, + "sampling/sampling_logp_difference/mean": 0.01030774973332882, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 219.0, + "completions/max_terminated_length": 219.0, + "completions/mean_length": 53.546875, + "completions/mean_terminated_length": 53.546875, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "entropy": 0.3179180324077606, + "epoch": 1.0, + "frac_reward_zero_std": 0.75, + "grad_norm": 9.110451794643717, + "kl": 0.07360747456550598, + "learning_rate": 9.33398320031244e-07, + "loss": -0.2468, + "num_tokens": 9100799.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.5650666952133179, + "sampling/importance_sampling_ratio/mean": 0.9979598522186279, + "sampling/importance_sampling_ratio/min": 0.6005945205688477, + "sampling/sampling_logp_difference/max": 0.5098352432250977, + "sampling/sampling_logp_difference/mean": 0.017205916345119476, + "step": 565 + } + ], + "logging_steps": 1, + "max_steps": 2260, + "num_input_tokens_seen": 9100799, + "num_train_epochs": 4, + "save_steps": 565, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}