{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.24, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1484375, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 500.625, "completions/mean_terminated_length": 409.39447021484375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.6560961306095123, "epoch": 0.0008, "frac_reward_zero_std": 0.0, "grad_norm": 0.36120137572288513, "learning_rate": 1e-05, "loss": 0.2767, "num_tokens": 82504.0, "reward": 2.5569920539855957, "reward_std": 0.6717760562896729, "rewards/evaluation_direction_reward/mean": 0.57421875, "rewards/evaluation_direction_reward/std": 0.2829807996749878, "rewards/format_reward/mean": 0.8953125476837158, "rewards/format_reward/std": 0.25343602895736694, "rewards/move_legality_reward/mean": 0.034882768988609314, "rewards/move_legality_reward/std": 0.1746065467596054, "rewards/pv_length_reward/mean": 0.06718750298023224, "rewards/pv_length_reward/std": 0.18697687983512878, "rewards/pv_quality_reward/mean": 0.00390625, "rewards/pv_quality_reward/std": 0.04419417306780815, "rewards/verbosity_reward/mean": 0.9814844131469727, "rewards/verbosity_reward/std": 0.09694620221853256, "sampling/importance_sampling_ratio/max": 2.860222339630127, "sampling/importance_sampling_ratio/mean": 0.7686270475387573, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0085698366165161, "sampling/sampling_logp_difference/mean": 0.018200814723968506, "step": 1, "step_time": 71.96860210597515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 548.453125, "completions/mean_terminated_length": 433.0291442871094, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.626050628721714, "epoch": 0.0016, "frac_reward_zero_std": 0.0, "grad_norm": 0.2216157466173172, "learning_rate": 9.999998245403766e-06, "loss": 0.0771, "num_tokens": 171434.0, "reward": 2.4845311641693115, "reward_std": 0.794001042842865, "rewards/evaluation_direction_reward/mean": 0.591796875, "rewards/evaluation_direction_reward/std": 0.3150913715362549, "rewards/format_reward/mean": 0.8539062738418579, "rewards/format_reward/std": 0.302457720041275, "rewards/move_legality_reward/mean": 0.03405492752790451, "rewards/move_legality_reward/std": 0.1744968593120575, "rewards/pv_length_reward/mean": 0.04828869178891182, "rewards/pv_length_reward/std": 0.1359073668718338, "rewards/pv_quality_reward/mean": 0.0, "rewards/pv_quality_reward/std": 0.0, "rewards/verbosity_reward/mean": 0.9564843773841858, "rewards/verbosity_reward/std": 0.1955271065235138, "sampling/importance_sampling_ratio/max": 2.891538619995117, "sampling/importance_sampling_ratio/mean": 0.7769277095794678, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6102156639099121, "sampling/sampling_logp_difference/mean": 0.017398906871676445, "step": 2, "step_time": 72.6464070379734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 548.640625, "completions/mean_terminated_length": 438.94232177734375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.6589304786175489, "epoch": 0.0024, "frac_reward_zero_std": 0.0, "grad_norm": 0.32068803906440735, "learning_rate": 9.999992981616292e-06, "loss": 0.2949, "num_tokens": 260300.0, "reward": 2.5128250122070312, "reward_std": 0.6395830512046814, "rewards/evaluation_direction_reward/mean": 0.623046875, "rewards/evaluation_direction_reward/std": 0.31684356927871704, "rewards/format_reward/mean": 0.8695312738418579, "rewards/format_reward/std": 0.276239812374115, "rewards/move_legality_reward/mean": 0.0020958324894309044, "rewards/move_legality_reward/std": 0.00782585795968771, "rewards/pv_length_reward/mean": 0.03619791567325592, "rewards/pv_length_reward/std": 0.1094028428196907, "rewards/pv_quality_reward/mean": 0.0, "rewards/pv_quality_reward/std": 0.0, "rewards/verbosity_reward/mean": 0.9819531440734863, "rewards/verbosity_reward/std": 0.12498855590820312, "sampling/importance_sampling_ratio/max": 2.7939300537109375, "sampling/importance_sampling_ratio/mean": 0.711390495300293, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9767757654190063, "sampling/sampling_logp_difference/mean": 0.01793469861149788, "step": 3, "step_time": 72.20076160877943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 499.03125, "completions/mean_terminated_length": 418.6306457519531, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.6709191389381886, "epoch": 0.0032, "frac_reward_zero_std": 0.0, "grad_norm": 0.2714180052280426, "learning_rate": 9.999984208641271e-06, "loss": 0.1519, "num_tokens": 342936.0, "reward": 2.656071186065674, "reward_std": 0.7505378127098083, "rewards/evaluation_direction_reward/mean": 0.61328125, "rewards/evaluation_direction_reward/std": 0.30394288897514343, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2422824501991272, "rewards/move_legality_reward/mean": 0.04185234755277634, "rewards/move_legality_reward/std": 0.19410046935081482, "rewards/pv_length_reward/mean": 0.08203125, "rewards/pv_length_reward/std": 0.1656743288040161, "rewards/pv_quality_reward/mean": 0.03125, "rewards/pv_quality_reward/std": 0.1599704623222351, "rewards/verbosity_reward/mean": 0.9814062118530273, "rewards/verbosity_reward/std": 0.12665119767189026, "sampling/importance_sampling_ratio/max": 2.796851396560669, "sampling/importance_sampling_ratio/mean": 0.7669062614440918, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7271425127983093, "sampling/sampling_logp_difference/mean": 0.01818077825009823, "step": 4, "step_time": 72.02034368366003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 475.234375, "completions/mean_terminated_length": 407.84210205078125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.6294571608304977, "epoch": 0.004, "frac_reward_zero_std": 0.0, "grad_norm": 0.3440217077732086, "learning_rate": 9.999971926484865e-06, "loss": 0.3077, "num_tokens": 422366.0, "reward": 2.632345676422119, "reward_std": 0.566673994064331, "rewards/evaluation_direction_reward/mean": 0.62890625, "rewards/evaluation_direction_reward/std": 0.2976025640964508, "rewards/format_reward/mean": 0.9234374761581421, "rewards/format_reward/std": 0.21933484077453613, "rewards/move_legality_reward/mean": 0.017215546220541, "rewards/move_legality_reward/std": 0.12452225387096405, "rewards/pv_length_reward/mean": 0.07122395932674408, "rewards/pv_length_reward/std": 0.14717692136764526, "rewards/pv_quality_reward/mean": 0.00390625, "rewards/pv_quality_reward/std": 0.04419417306780815, "rewards/verbosity_reward/mean": 0.9876562356948853, "rewards/verbosity_reward/std": 0.0921953096985817, "sampling/importance_sampling_ratio/max": 2.95393443107605, "sampling/importance_sampling_ratio/mean": 0.8594086170196533, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8800551891326904, "sampling/sampling_logp_difference/mean": 0.017784297466278076, "step": 5, "step_time": 71.80874399095774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 422.265625, "completions/mean_terminated_length": 371.27117919921875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.655173234641552, "epoch": 0.0048, "frac_reward_zero_std": 0.0, "grad_norm": 0.3512960374355316, "learning_rate": 9.999956135155688e-06, "loss": 0.1663, "num_tokens": 495064.0, "reward": 2.724811553955078, "reward_std": 0.5912332534790039, "rewards/evaluation_direction_reward/mean": 0.650390625, "rewards/evaluation_direction_reward/std": 0.2776799499988556, "rewards/format_reward/mean": 0.9437500238418579, "rewards/format_reward/std": 0.19592301547527313, "rewards/move_legality_reward/mean": 0.008959498256444931, "rewards/move_legality_reward/std": 0.08848195523023605, "rewards/pv_length_reward/mean": 0.1220238208770752, "rewards/pv_length_reward/std": 0.23900912702083588, "rewards/pv_quality_reward/mean": 0.0078125, "rewards/pv_quality_reward/std": 0.0883883461356163, "rewards/verbosity_reward/mean": 0.9918749928474426, "rewards/verbosity_reward/std": 0.0884312093257904, "sampling/importance_sampling_ratio/max": 2.908254384994507, "sampling/importance_sampling_ratio/mean": 0.7682571411132812, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9736442565917969, "sampling/sampling_logp_difference/mean": 0.018716951832175255, "step": 6, "step_time": 70.96145160496235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 416.859375, "completions/mean_terminated_length": 359.7778015136719, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.6495263390243053, "epoch": 0.0056, "frac_reward_zero_std": 0.0, "grad_norm": 0.2966015338897705, "learning_rate": 9.99993683466483e-06, "loss": 0.2005, "num_tokens": 566574.0, "reward": 2.769590377807617, "reward_std": 0.5881563425064087, "rewards/evaluation_direction_reward/mean": 0.689453125, "rewards/evaluation_direction_reward/std": 0.25822320580482483, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.20620301365852356, "rewards/move_legality_reward/mean": 0.018198952078819275, "rewards/move_legality_reward/std": 0.12456312775611877, "rewards/pv_length_reward/mean": 0.14170387387275696, "rewards/pv_length_reward/std": 0.26125428080558777, "rewards/pv_quality_reward/mean": 0.0, "rewards/pv_quality_reward/std": 0.0, "rewards/verbosity_reward/mean": 0.9827343821525574, "rewards/verbosity_reward/std": 0.12502418458461761, "sampling/importance_sampling_ratio/max": 2.947230339050293, "sampling/importance_sampling_ratio/mean": 0.76502525806427, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.556466102600098, "sampling/sampling_logp_difference/mean": 0.01834770478308201, "step": 7, "step_time": 70.88223604857922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 1024.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 411.5078125, "completions/mean_terminated_length": 365.1849060058594, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.5972983613610268, "epoch": 0.0064, "frac_reward_zero_std": 0.0, "grad_norm": 0.3770979046821594, "learning_rate": 9.999914025025831e-06, "loss": 0.3349, "num_tokens": 637471.0, "reward": 2.806765079498291, "reward_std": 0.6247501373291016, "rewards/evaluation_direction_reward/mean": 0.65625, "rewards/evaluation_direction_reward/std": 0.26987895369529724, "rewards/format_reward/mean": 0.953906238079071, "rewards/format_reward/std": 0.17156177759170532, "rewards/move_legality_reward/mean": 0.07140910625457764, "rewards/move_legality_reward/std": 0.256425678730011, "rewards/pv_length_reward/mean": 0.10277777910232544, "rewards/pv_length_reward/std": 0.13777947425842285, "rewards/pv_quality_reward/mean": 0.0234375, "rewards/pv_quality_reward/std": 0.1383163034915924, "rewards/verbosity_reward/mean": 0.9989843964576721, "rewards/verbosity_reward/std": 0.009207995608448982, "sampling/importance_sampling_ratio/max": 2.9623913764953613, "sampling/importance_sampling_ratio/mean": 0.8583903908729553, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2047157287597656, "sampling/sampling_logp_difference/mean": 0.017655594274401665, "step": 8, "step_time": 70.72673750668764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 362.4453125, "completions/mean_terminated_length": 329.9098205566406, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.6519573740661144, "epoch": 0.0072, "frac_reward_zero_std": 0.0, "grad_norm": 0.31688192486763, "learning_rate": 9.999887706254703e-06, "loss": 0.3002, "num_tokens": 702816.0, "reward": 2.8312597274780273, "reward_std": 0.5179862380027771, "rewards/evaluation_direction_reward/mean": 0.68359375, "rewards/evaluation_direction_reward/std": 0.2294386625289917, "rewards/format_reward/mean": 0.9671875238418579, "rewards/format_reward/std": 0.1485411375761032, "rewards/move_legality_reward/mean": 0.06625983119010925, "rewards/move_legality_reward/std": 0.24261799454689026, "rewards/pv_length_reward/mean": 0.11250001192092896, "rewards/pv_length_reward/std": 0.17775455117225647, "rewards/pv_quality_reward/mean": 0.00390625, "rewards/pv_quality_reward/std": 0.04419417306780815, "rewards/verbosity_reward/mean": 0.9978125095367432, "rewards/verbosity_reward/std": 0.023874258622527122, "sampling/importance_sampling_ratio/max": 2.823880195617676, "sampling/importance_sampling_ratio/mean": 0.8166234493255615, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7246406078338623, "sampling/sampling_logp_difference/mean": 0.018925875425338745, "step": 9, "step_time": 70.14423652738333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 347.203125, "completions/mean_terminated_length": 302.0833435058594, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.6403288021683693, "epoch": 0.008, "frac_reward_zero_std": 0.0, "grad_norm": 0.40145885944366455, "learning_rate": 9.999857878369917e-06, "loss": 0.2671, "num_tokens": 765570.0, "reward": 2.9616429805755615, "reward_std": 0.7066532969474792, "rewards/evaluation_direction_reward/mean": 0.689453125, "rewards/evaluation_direction_reward/std": 0.21678142249584198, "rewards/format_reward/mean": 0.956250011920929, "rewards/format_reward/std": 0.17010879516601562, "rewards/move_legality_reward/mean": 0.09740559756755829, "rewards/move_legality_reward/std": 0.29207131266593933, "rewards/pv_length_reward/mean": 0.18751861155033112, "rewards/pv_length_reward/std": 0.29332292079925537, "rewards/pv_quality_reward/mean": 0.03125, "rewards/pv_quality_reward/std": 0.13675081729888916, "rewards/verbosity_reward/mean": 0.9997656345367432, "rewards/verbosity_reward/std": 0.002651647897437215, "sampling/importance_sampling_ratio/max": 2.9424304962158203, "sampling/importance_sampling_ratio/mean": 0.8155466914176941, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7091341018676758, "sampling/sampling_logp_difference/mean": 0.018316859379410744, "step": 10, "step_time": 70.13506919145584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 250.171875, "completions/mean_terminated_length": 237.88890075683594, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.7209676317870617, "epoch": 0.0088, "frac_reward_zero_std": 0.0, "grad_norm": 0.5357649326324463, "learning_rate": 9.999824541392404e-06, "loss": 0.1862, "num_tokens": 816136.0, "reward": 3.081167221069336, "reward_std": 0.5162502527236938, "rewards/evaluation_direction_reward/mean": 0.73828125, "rewards/evaluation_direction_reward/std": 0.12093538790941238, "rewards/format_reward/mean": 0.989062488079071, "rewards/format_reward/std": 0.0871548280119896, "rewards/move_legality_reward/mean": 0.10314657539129257, "rewards/move_legality_reward/std": 0.3027922213077545, "rewards/pv_length_reward/mean": 0.24700520932674408, "rewards/pv_length_reward/std": 0.2786076068878174, "rewards/pv_quality_reward/mean": 0.00390625, "rewards/pv_quality_reward/std": 0.04419417306780815, "rewards/verbosity_reward/mean": 0.9997656345367432, "rewards/verbosity_reward/std": 0.002651647897437215, "sampling/importance_sampling_ratio/max": 2.701174259185791, "sampling/importance_sampling_ratio/mean": 0.9075109958648682, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.193087100982666, "sampling/sampling_logp_difference/mean": 0.02100287936627865, "step": 11, "step_time": 68.84980444610119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 243.890625, "completions/mean_terminated_length": 237.74803161621094, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 0.715137455612421, "epoch": 0.0096, "frac_reward_zero_std": 0.0, "grad_norm": 0.47040796279907227, "learning_rate": 9.999787695345565e-06, "loss": 0.0998, "num_tokens": 866218.0, "reward": 2.9702610969543457, "reward_std": 0.45935243368148804, "rewards/evaluation_direction_reward/mean": 0.7109375, "rewards/evaluation_direction_reward/std": 0.17590458691120148, "rewards/format_reward/mean": 0.9945312738418579, "rewards/format_reward/std": 0.06187184154987335, "rewards/move_legality_reward/mean": 0.06508989632129669, "rewards/move_legality_reward/std": 0.24289610981941223, "rewards/pv_length_reward/mean": 0.18407738208770752, "rewards/pv_length_reward/std": 0.1760839819908142, "rewards/pv_quality_reward/mean": 0.0234375, "rewards/pv_quality_reward/std": 0.11500385403633118, "rewards/verbosity_reward/mean": 0.9921875, "rewards/verbosity_reward/std": 0.0883883461356163, "sampling/importance_sampling_ratio/max": 2.97737193107605, "sampling/importance_sampling_ratio/mean": 0.9195723533630371, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6940200924873352, "sampling/sampling_logp_difference/mean": 0.0214652419090271, "step": 12, "step_time": 69.29206018894911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 262.8359375, "completions/mean_terminated_length": 250.7539825439453, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 0.6825901791453362, "epoch": 0.0104, "frac_reward_zero_std": 0.0, "grad_norm": 0.5149370431900024, "learning_rate": 9.99974734025526e-06, "loss": 0.2592, "num_tokens": 918213.0, "reward": 3.1207804679870605, "reward_std": 0.6580043435096741, "rewards/evaluation_direction_reward/mean": 0.736328125, "rewards/evaluation_direction_reward/std": 0.1413816511631012, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.10039293020963669, "rewards/move_legality_reward/mean": 0.1352337896823883, "rewards/move_legality_reward/std": 0.3343343436717987, "rewards/pv_length_reward/mean": 0.255859375, "rewards/pv_length_reward/std": 0.29896724224090576, "rewards/pv_quality_reward/mean": 0.021484375, "rewards/pv_quality_reward/std": 0.12559834122657776, "rewards/verbosity_reward/mean": 0.984375, "rewards/verbosity_reward/std": 0.12450689822435379, "sampling/importance_sampling_ratio/max": 2.848503828048706, "sampling/importance_sampling_ratio/mean": 0.9846678376197815, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5420827865600586, "sampling/sampling_logp_difference/mean": 0.020886365324258804, "step": 13, "step_time": 68.67882964760065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 238.359375, "completions/mean_terminated_length": 219.50401306152344, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.6671992726624012, "epoch": 0.0112, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5719023942947388, "learning_rate": 9.999703476149808e-06, "loss": 0.1684, "num_tokens": 967019.0, "reward": 3.0764482021331787, "reward_std": 0.6577057242393494, "rewards/evaluation_direction_reward/mean": 0.6796875, "rewards/evaluation_direction_reward/std": 0.21721550822257996, "rewards/format_reward/mean": 0.9820312261581421, "rewards/format_reward/std": 0.11734168231487274, "rewards/move_legality_reward/mean": 0.16689953207969666, "rewards/move_legality_reward/std": 0.37084129452705383, "rewards/pv_length_reward/mean": 0.21111111342906952, "rewards/pv_length_reward/std": 0.15340706706047058, "rewards/pv_quality_reward/mean": 0.044921875, "rewards/pv_quality_reward/std": 0.18409597873687744, "rewards/verbosity_reward/mean": 0.9917968511581421, "rewards/verbosity_reward/std": 0.08846399933099747, "sampling/importance_sampling_ratio/max": 2.252124786376953, "sampling/importance_sampling_ratio/mean": 0.8157683610916138, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8735430240631104, "sampling/sampling_logp_difference/mean": 0.02020445093512535, "step": 14, "step_time": 68.57560022175312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 211.15625, "completions/mean_terminated_length": 204.7559051513672, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.7108959034085274, "epoch": 0.012, "frac_reward_zero_std": 0.125, "grad_norm": 0.6072821021080017, "learning_rate": 9.999656103060001e-06, "loss": 0.2392, "num_tokens": 1012623.0, "reward": 3.034933090209961, "reward_std": 0.46723416447639465, "rewards/evaluation_direction_reward/mean": 0.671875, "rewards/evaluation_direction_reward/std": 0.17953899502754211, "rewards/format_reward/mean": 0.9945312738418579, "rewards/format_reward/std": 0.06187184154987335, "rewards/move_legality_reward/mean": 0.11037605255842209, "rewards/move_legality_reward/std": 0.31305187940597534, "rewards/pv_length_reward/mean": 0.2369791716337204, "rewards/pv_length_reward/std": 0.14312684535980225, "rewards/pv_quality_reward/mean": 0.021484375, "rewards/pv_quality_reward/std": 0.1403992623090744, "rewards/verbosity_reward/mean": 0.9996874928474426, "rewards/verbosity_reward/std": 0.0035355358850210905, "sampling/importance_sampling_ratio/max": 2.778346538543701, "sampling/importance_sampling_ratio/mean": 0.9076911211013794, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6860556602478027, "sampling/sampling_logp_difference/mean": 0.021008187904953957, "step": 15, "step_time": 68.54023861885071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 174.3046875, "completions/mean_terminated_length": 174.3046875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.7646763287484646, "epoch": 0.0128, "frac_reward_zero_std": 0.0, "grad_norm": 0.7574782371520996, "learning_rate": 9.999605221019082e-06, "loss": 0.1293, "num_tokens": 1053430.0, "reward": 3.367555856704712, "reward_std": 0.5319560766220093, "rewards/evaluation_direction_reward/mean": 0.767578125, "rewards/evaluation_direction_reward/std": 0.10950674116611481, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.21138392388820648, "rewards/move_legality_reward/std": 0.40937650203704834, "rewards/pv_length_reward/mean": 0.3578125238418579, "rewards/pv_length_reward/std": 0.25879067182540894, "rewards/pv_quality_reward/mean": 0.03125, "rewards/pv_quality_reward/std": 0.1254911571741104, "rewards/verbosity_reward/mean": 0.9995312690734863, "rewards/verbosity_reward/std": 0.005303301382809877, "sampling/importance_sampling_ratio/max": 2.616349220275879, "sampling/importance_sampling_ratio/mean": 0.8918777704238892, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6763777732849121, "sampling/sampling_logp_difference/mean": 0.023647772148251534, "step": 16, "step_time": 42.42176040261984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 172.34375, "completions/mean_terminated_length": 172.34375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.7491214200854301, "epoch": 0.0136, "frac_reward_zero_std": 0.0, "grad_norm": 0.5977244973182678, "learning_rate": 9.999550830062762e-06, "loss": 0.0851, "num_tokens": 1093954.0, "reward": 3.2321813106536865, "reward_std": 0.5472072958946228, "rewards/evaluation_direction_reward/mean": 0.712890625, "rewards/evaluation_direction_reward/std": 0.2035362720489502, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1704365313053131, "rewards/move_legality_reward/std": 0.3717705011367798, "rewards/pv_length_reward/mean": 0.32565104961395264, "rewards/pv_length_reward/std": 0.25718235969543457, "rewards/pv_quality_reward/mean": 0.0234375, "rewards/pv_quality_reward/std": 0.14182965457439423, "rewards/verbosity_reward/mean": 0.9997656345367432, "rewards/verbosity_reward/std": 0.0019701868295669556, "sampling/importance_sampling_ratio/max": 2.9697015285491943, "sampling/importance_sampling_ratio/mean": 0.9158414602279663, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8807966709136963, "sampling/sampling_logp_difference/mean": 0.023736771196126938, "step": 17, "step_time": 39.00079958885908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 168.0390625, "completions/mean_terminated_length": 161.29920959472656, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.7403665073215961, "epoch": 0.0144, "frac_reward_zero_std": 0.0625, "grad_norm": 0.747062623500824, "learning_rate": 9.999492930229217e-06, "loss": -0.0471, "num_tokens": 1133927.0, "reward": 3.2617359161376953, "reward_std": 0.5797854065895081, "rewards/evaluation_direction_reward/mean": 0.767578125, "rewards/evaluation_direction_reward/std": 0.1477641463279724, "rewards/format_reward/mean": 0.9945312738418579, "rewards/format_reward/std": 0.06187184154987335, "rewards/move_legality_reward/mean": 0.1640625, "rewards/move_legality_reward/std": 0.371787428855896, "rewards/pv_length_reward/mean": 0.29236114025115967, "rewards/pv_length_reward/std": 0.13911879062652588, "rewards/pv_quality_reward/mean": 0.046875, "rewards/pv_quality_reward/std": 0.17953899502754211, "rewards/verbosity_reward/mean": 0.9963281154632568, "rewards/verbosity_reward/std": 0.0296135526150465, "sampling/importance_sampling_ratio/max": 2.8364381790161133, "sampling/importance_sampling_ratio/mean": 0.8816821575164795, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.813590407371521, "sampling/sampling_logp_difference/mean": 0.023670485243201256, "step": 18, "step_time": 68.03891561180353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 180.7890625, "completions/mean_terminated_length": 167.40476989746094, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.7401547506451607, "epoch": 0.0152, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6116119027137756, "learning_rate": 9.999431521559081e-06, "loss": 0.215, "num_tokens": 1175300.0, "reward": 3.07588529586792, "reward_std": 0.5464367270469666, "rewards/evaluation_direction_reward/mean": 0.71875, "rewards/evaluation_direction_reward/std": 0.13675081729888916, "rewards/format_reward/mean": 0.985156238079071, "rewards/format_reward/std": 0.11844786256551743, "rewards/move_legality_reward/mean": 0.078125, "rewards/move_legality_reward/std": 0.2694226801395416, "rewards/pv_length_reward/mean": 0.2979166805744171, "rewards/pv_length_reward/std": 0.19933666288852692, "rewards/pv_quality_reward/mean": 0.01171875, "rewards/pv_quality_reward/std": 0.07594143599271774, "rewards/verbosity_reward/mean": 0.9842187166213989, "rewards/verbosity_reward/std": 0.12449968606233597, "sampling/importance_sampling_ratio/max": 2.9920105934143066, "sampling/importance_sampling_ratio/mean": 0.9740588665008545, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7493896484375, "sampling/sampling_logp_difference/mean": 0.022488679736852646, "step": 19, "step_time": 67.9201202467084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 171.3515625, "completions/mean_terminated_length": 171.3515625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.760954886674881, "epoch": 0.016, "frac_reward_zero_std": 0.1875, "grad_norm": 0.5682782530784607, "learning_rate": 9.999366604095458e-06, "loss": 0.0634, "num_tokens": 1215777.0, "reward": 3.325747013092041, "reward_std": 0.6463515758514404, "rewards/evaluation_direction_reward/mean": 0.69921875, "rewards/evaluation_direction_reward/std": 0.19682364165782928, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1602260023355484, "rewards/move_legality_reward/std": 0.365479052066803, "rewards/pv_length_reward/mean": 0.3962239623069763, "rewards/pv_length_reward/std": 0.27009132504463196, "rewards/pv_quality_reward/mean": 0.078125, "rewards/pv_quality_reward/std": 0.2694226801395416, "rewards/verbosity_reward/mean": 0.9919531345367432, "rewards/verbosity_reward/std": 0.0884072408080101, "sampling/importance_sampling_ratio/max": 2.9550673961639404, "sampling/importance_sampling_ratio/mean": 0.8960939645767212, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5155284404754639, "sampling/sampling_logp_difference/mean": 0.02351573295891285, "step": 20, "step_time": 49.991986371576786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 158.46875, "completions/mean_terminated_length": 158.46875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.7378099896013737, "epoch": 0.0168, "frac_reward_zero_std": 0.125, "grad_norm": 0.799536406993866, "learning_rate": 9.999298177883902e-06, "loss": -0.0148, "num_tokens": 1254573.0, "reward": 3.2996840476989746, "reward_std": 0.6274939179420471, "rewards/evaluation_direction_reward/mean": 0.716796875, "rewards/evaluation_direction_reward/std": 0.1516675502061844, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.234375, "rewards/move_legality_reward/std": 0.42527204751968384, "rewards/pv_length_reward/mean": 0.3016369342803955, "rewards/pv_length_reward/std": 0.20972603559494019, "rewards/pv_quality_reward/mean": 0.046875, "rewards/pv_quality_reward/std": 0.16226133704185486, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.6936237812042236, "sampling/importance_sampling_ratio/mean": 1.0434277057647705, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1253104209899902, "sampling/sampling_logp_difference/mean": 0.02410244755446911, "step": 21, "step_time": 31.85833729058504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 159.359375, "completions/mean_terminated_length": 159.359375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.7587304636836052, "epoch": 0.0176, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7192447185516357, "learning_rate": 9.999226242972445e-06, "loss": -0.1296, "num_tokens": 1293651.0, "reward": 3.3765625953674316, "reward_std": 0.6936578750610352, "rewards/evaluation_direction_reward/mean": 0.736328125, "rewards/evaluation_direction_reward/std": 0.13423959910869598, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1979166567325592, "rewards/move_legality_reward/std": 0.3978012502193451, "rewards/pv_length_reward/mean": 0.3505208492279053, "rewards/pv_length_reward/std": 0.18585050106048584, "rewards/pv_quality_reward/mean": 0.091796875, "rewards/pv_quality_reward/std": 0.2526845932006836, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.416477680206299, "sampling/importance_sampling_ratio/mean": 0.8812757730484009, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2333898544311523, "sampling/sampling_logp_difference/mean": 0.024150649085640907, "step": 22, "step_time": 30.106890238821507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 161.984375, "completions/mean_terminated_length": 155.1968536376953, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.7647850066423416, "epoch": 0.0184, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7128013968467712, "learning_rate": 9.999150799411565e-06, "loss": -0.0035, "num_tokens": 1332809.0, "reward": 3.224271059036255, "reward_std": 0.6508077383041382, "rewards/evaluation_direction_reward/mean": 0.728515625, "rewards/evaluation_direction_reward/std": 0.14724284410476685, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.0883883461356163, "rewards/move_legality_reward/mean": 0.16445311903953552, "rewards/move_legality_reward/std": 0.3716399073600769, "rewards/pv_length_reward/mean": 0.2942708432674408, "rewards/pv_length_reward/std": 0.08553361892700195, "rewards/pv_quality_reward/mean": 0.0546875, "rewards/pv_quality_reward/std": 0.20073069632053375, "rewards/verbosity_reward/mean": 0.9901562333106995, "rewards/verbosity_reward/std": 0.089688740670681, "sampling/importance_sampling_ratio/max": 2.9595608711242676, "sampling/importance_sampling_ratio/mean": 0.9407302141189575, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3744014501571655, "sampling/sampling_logp_difference/mean": 0.024176521226763725, "step": 23, "step_time": 67.73014653474092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 161.2734375, "completions/mean_terminated_length": 161.2734375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.779983751475811, "epoch": 0.0192, "frac_reward_zero_std": 0.0625, "grad_norm": 0.742508590221405, "learning_rate": 9.999071847254219e-06, "loss": 0.0974, "num_tokens": 1371892.0, "reward": 3.348794460296631, "reward_std": 0.6403000354766846, "rewards/evaluation_direction_reward/mean": 0.71484375, "rewards/evaluation_direction_reward/std": 0.1560283899307251, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.234375, "rewards/move_legality_reward/std": 0.42527204751968384, "rewards/pv_length_reward/mean": 0.33504465222358704, "rewards/pv_length_reward/std": 0.18823933601379395, "rewards/pv_quality_reward/mean": 0.06640625, "rewards/pv_quality_reward/std": 0.20924586057662964, "rewards/verbosity_reward/mean": 0.9981250166893005, "rewards/verbosity_reward/std": 0.02121320366859436, "sampling/importance_sampling_ratio/max": 2.8890206813812256, "sampling/importance_sampling_ratio/mean": 0.9595061540603638, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5529100894927979, "sampling/sampling_logp_difference/mean": 0.024538787081837654, "step": 24, "step_time": 23.303539499640465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 162.3125, "completions/mean_terminated_length": 162.3125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.7820797227323055, "epoch": 0.02, "frac_reward_zero_std": 0.1875, "grad_norm": 0.5962402820587158, "learning_rate": 9.998989386555815e-06, "loss": 0.0577, "num_tokens": 1410860.0, "reward": 3.153437376022339, "reward_std": 0.5093904137611389, "rewards/evaluation_direction_reward/mean": 0.666015625, "rewards/evaluation_direction_reward/std": 0.20156216621398926, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1015625, "rewards/move_legality_reward/std": 0.3032590448856354, "rewards/pv_length_reward/mean": 0.3578125238418579, "rewards/pv_length_reward/std": 0.22262588143348694, "rewards/pv_quality_reward/mean": 0.03515625, "rewards/pv_quality_reward/std": 0.179496169090271, "rewards/verbosity_reward/mean": 0.9928905963897705, "rewards/verbosity_reward/std": 0.054659321904182434, "sampling/importance_sampling_ratio/max": 2.920964479446411, "sampling/importance_sampling_ratio/mean": 1.0120117664337158, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48549771308898926, "sampling/sampling_logp_difference/mean": 0.023384016007184982, "step": 25, "step_time": 24.59798062592745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 158.7109375, "completions/mean_terminated_length": 158.7109375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.775823600590229, "epoch": 0.0208, "frac_reward_zero_std": 0.0, "grad_norm": 0.7893826365470886, "learning_rate": 9.998903417374228e-06, "loss": 0.0092, "num_tokens": 1449679.0, "reward": 3.5349130630493164, "reward_std": 0.7725033760070801, "rewards/evaluation_direction_reward/mean": 0.775390625, "rewards/evaluation_direction_reward/std": 0.15947701036930084, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.203125, "rewards/move_legality_reward/std": 0.40390563011169434, "rewards/pv_length_reward/mean": 0.44131943583488464, "rewards/pv_length_reward/std": 0.2835564613342285, "rewards/pv_quality_reward/mean": 0.115234375, "rewards/pv_quality_reward/std": 0.292461633682251, "rewards/verbosity_reward/mean": 0.9998437166213989, "rewards/verbosity_reward/std": 0.0017677652649581432, "sampling/importance_sampling_ratio/max": 2.805802583694458, "sampling/importance_sampling_ratio/mean": 0.9449859857559204, "sampling/importance_sampling_ratio/min": 0.09257541596889496, "sampling/sampling_logp_difference/max": 0.48613524436950684, "sampling/sampling_logp_difference/mean": 0.023909203708171844, "step": 26, "step_time": 21.919015668332577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 158.0, "completions/mean_terminated_length": 158.0, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.758297573775053, "epoch": 0.0216, "frac_reward_zero_std": 0.0, "grad_norm": 0.8011346459388733, "learning_rate": 9.998813939769794e-06, "loss": -0.0012, "num_tokens": 1488359.0, "reward": 3.2530953884124756, "reward_std": 0.4846065640449524, "rewards/evaluation_direction_reward/mean": 0.73828125, "rewards/evaluation_direction_reward/std": 0.13258251547813416, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1901041716337204, "rewards/move_legality_reward/std": 0.3916977345943451, "rewards/pv_length_reward/mean": 0.2995535731315613, "rewards/pv_length_reward/std": 0.09656266123056412, "rewards/pv_quality_reward/mean": 0.025390625, "rewards/pv_quality_reward/std": 0.12874257564544678, "rewards/verbosity_reward/mean": 0.9997656345367432, "rewards/verbosity_reward/std": 0.002651647897437215, "sampling/importance_sampling_ratio/max": 2.8302247524261475, "sampling/importance_sampling_ratio/mean": 0.9071841239929199, "sampling/importance_sampling_ratio/min": 0.1371804177761078, "sampling/sampling_logp_difference/max": 0.7437748908996582, "sampling/sampling_logp_difference/mean": 0.024097571149468422, "step": 27, "step_time": 21.854372024536133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 158.90625, "completions/mean_terminated_length": 158.90625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.753263421356678, "epoch": 0.0224, "frac_reward_zero_std": 0.125, "grad_norm": 0.7303109765052795, "learning_rate": 9.998720953805312e-06, "loss": 0.0486, "num_tokens": 1527115.0, "reward": 3.2772879600524902, "reward_std": 0.687494695186615, "rewards/evaluation_direction_reward/mean": 0.703125, "rewards/evaluation_direction_reward/std": 0.16226133704185486, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1640625, "rewards/move_legality_reward/std": 0.371787428855896, "rewards/pv_length_reward/mean": 0.32220983505249023, "rewards/pv_length_reward/std": 0.19419552385807037, "rewards/pv_quality_reward/mean": 0.087890625, "rewards/pv_quality_reward/std": 0.27456092834472656, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7865078449249268, "sampling/importance_sampling_ratio/mean": 0.9858951568603516, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7982273101806641, "sampling/sampling_logp_difference/mean": 0.024174511432647705, "step": 28, "step_time": 22.00207509100437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 155.984375, "completions/mean_terminated_length": 155.984375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.7570559531450272, "epoch": 0.0232, "frac_reward_zero_std": 0.0, "grad_norm": 0.6816264390945435, "learning_rate": 9.998624459546043e-06, "loss": 0.0195, "num_tokens": 1565553.0, "reward": 3.433255195617676, "reward_std": 0.6007082462310791, "rewards/evaluation_direction_reward/mean": 0.78125, "rewards/evaluation_direction_reward/std": 0.11311620473861694, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.30260416865348816, "rewards/move_legality_reward/std": 0.45722755789756775, "rewards/pv_length_reward/mean": 0.2838541865348816, "rewards/pv_length_reward/std": 0.07735618203878403, "rewards/pv_quality_reward/mean": 0.06640625, "rewards/pv_quality_reward/std": 0.18949885666370392, "rewards/verbosity_reward/mean": 0.9991406202316284, "rewards/verbosity_reward/std": 0.00972271990031004, "sampling/importance_sampling_ratio/max": 2.352013111114502, "sampling/importance_sampling_ratio/mean": 0.8883757591247559, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.560823917388916, "sampling/sampling_logp_difference/mean": 0.0242600254714489, "step": 29, "step_time": 22.39192306995392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 154.953125, "completions/mean_terminated_length": 154.953125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.7525178268551826, "epoch": 0.024, "frac_reward_zero_std": 0.125, "grad_norm": 0.7103913426399231, "learning_rate": 9.99852445705971e-06, "loss": -0.0335, "num_tokens": 1603507.0, "reward": 3.3338279724121094, "reward_std": 0.5911248326301575, "rewards/evaluation_direction_reward/mean": 0.75390625, "rewards/evaluation_direction_reward/std": 0.14024856686592102, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1171875, "rewards/move_legality_reward/std": 0.322907418012619, "rewards/pv_length_reward/mean": 0.3875000476837158, "rewards/pv_length_reward/std": 0.24687017500400543, "rewards/pv_quality_reward/mean": 0.076171875, "rewards/pv_quality_reward/std": 0.24414117634296417, "rewards/verbosity_reward/mean": 0.9990625381469727, "rewards/verbosity_reward/std": 0.010606602765619755, "sampling/importance_sampling_ratio/max": 2.961373805999756, "sampling/importance_sampling_ratio/mean": 1.023087501525879, "sampling/importance_sampling_ratio/min": 0.21081623435020447, "sampling/sampling_logp_difference/max": 0.5827305316925049, "sampling/sampling_logp_difference/mean": 0.023817777633666992, "step": 30, "step_time": 22.034147918224335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 155.0625, "completions/mean_terminated_length": 155.0625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.7611386701464653, "epoch": 0.0248, "frac_reward_zero_std": 0.0625, "grad_norm": 0.774687647819519, "learning_rate": 9.9984209464165e-06, "loss": 0.0617, "num_tokens": 1641763.0, "reward": 3.2529168128967285, "reward_std": 0.5455808639526367, "rewards/evaluation_direction_reward/mean": 0.654296875, "rewards/evaluation_direction_reward/std": 0.19622693955898285, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.171875, "rewards/move_legality_reward/std": 0.3787541687488556, "rewards/pv_length_reward/mean": 0.3838541805744171, "rewards/pv_length_reward/std": 0.2327132672071457, "rewards/pv_quality_reward/mean": 0.04296875, "rewards/pv_quality_reward/std": 0.18852247297763824, "rewards/verbosity_reward/mean": 0.9999218583106995, "rewards/verbosity_reward/std": 0.0008838826324790716, "sampling/importance_sampling_ratio/max": 2.9469711780548096, "sampling/importance_sampling_ratio/mean": 1.0659351348876953, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8244247436523438, "sampling/sampling_logp_difference/mean": 0.024036189541220665, "step": 31, "step_time": 21.547868631780148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 156.5390625, "completions/mean_terminated_length": 156.5390625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.7413457073271275, "epoch": 0.0256, "frac_reward_zero_std": 0.125, "grad_norm": 0.7056702375411987, "learning_rate": 9.99831392768906e-06, "loss": -0.0449, "num_tokens": 1680456.0, "reward": 3.411198139190674, "reward_std": 0.6288177371025085, "rewards/evaluation_direction_reward/mean": 0.765625, "rewards/evaluation_direction_reward/std": 0.12839867174625397, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1875, "rewards/move_legality_reward/std": 0.39184603095054626, "rewards/pv_length_reward/mean": 0.3916666805744171, "rewards/pv_length_reward/std": 0.24015161395072937, "rewards/pv_quality_reward/mean": 0.06640625, "rewards/pv_quality_reward/std": 0.2294386625289917, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9629368782043457, "sampling/importance_sampling_ratio/mean": 0.9628728628158569, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.60223388671875, "sampling/sampling_logp_difference/mean": 0.023110516369342804, "step": 32, "step_time": 21.74431799352169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 155.5859375, "completions/mean_terminated_length": 155.5859375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.7453483939170837, "epoch": 0.0264, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8574742078781128, "learning_rate": 9.9982034009525e-06, "loss": -0.0633, "num_tokens": 1718819.0, "reward": 3.3247270584106445, "reward_std": 0.6053956747055054, "rewards/evaluation_direction_reward/mean": 0.73828125, "rewards/evaluation_direction_reward/std": 0.1655917912721634, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1796875, "rewards/move_legality_reward/std": 0.3854354918003082, "rewards/pv_length_reward/mean": 0.3442584276199341, "rewards/pv_length_reward/std": 0.24046160280704498, "rewards/pv_quality_reward/mean": 0.0625, "rewards/pv_quality_reward/std": 0.19841894507408142, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7742345333099365, "sampling/importance_sampling_ratio/mean": 0.9104142189025879, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9265029430389404, "sampling/sampling_logp_difference/mean": 0.0241338312625885, "step": 33, "step_time": 21.42495448887348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 153.703125, "completions/mean_terminated_length": 153.703125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.7334603518247604, "epoch": 0.0272, "frac_reward_zero_std": 0.125, "grad_norm": 0.7588344812393188, "learning_rate": 9.998089366284392e-06, "loss": -0.0683, "num_tokens": 1756877.0, "reward": 3.2668588161468506, "reward_std": 0.5356349945068359, "rewards/evaluation_direction_reward/mean": 0.716796875, "rewards/evaluation_direction_reward/std": 0.14503289759159088, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.2109375, "rewards/move_legality_reward/std": 0.4095771610736847, "rewards/pv_length_reward/mean": 0.30396825075149536, "rewards/pv_length_reward/std": 0.20702512562274933, "rewards/pv_quality_reward/mean": 0.03515625, "rewards/pv_quality_reward/std": 0.15915121138095856, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.740201711654663, "sampling/importance_sampling_ratio/mean": 0.9714398980140686, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5911002159118652, "sampling/sampling_logp_difference/mean": 0.023954275995492935, "step": 34, "step_time": 20.828951351344585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 154.3984375, "completions/mean_terminated_length": 154.3984375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.7280027978122234, "epoch": 0.028, "frac_reward_zero_std": 0.125, "grad_norm": 0.7012949585914612, "learning_rate": 9.997971823764766e-06, "loss": -0.0259, "num_tokens": 1794984.0, "reward": 3.318091630935669, "reward_std": 0.6837766170501709, "rewards/evaluation_direction_reward/mean": 0.693359375, "rewards/evaluation_direction_reward/std": 0.13752180337905884, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.171875, "rewards/move_legality_reward/std": 0.3787541687488556, "rewards/pv_length_reward/mean": 0.3850446343421936, "rewards/pv_length_reward/std": 0.29806017875671387, "rewards/pv_quality_reward/mean": 0.068359375, "rewards/pv_quality_reward/std": 0.2320629060268402, "rewards/verbosity_reward/mean": 0.9994531273841858, "rewards/verbosity_reward/std": 0.005369584076106548, "sampling/importance_sampling_ratio/max": 2.4428398609161377, "sampling/importance_sampling_ratio/mean": 0.9288046360015869, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6620540618896484, "sampling/sampling_logp_difference/mean": 0.02342369593679905, "step": 35, "step_time": 22.12903292477131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 159.15625, "completions/mean_terminated_length": 152.34645080566406, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.732108898460865, "epoch": 0.0288, "frac_reward_zero_std": 0.125, "grad_norm": 0.5977607369422913, "learning_rate": 9.997850773476126e-06, "loss": -0.0119, "num_tokens": 1833636.0, "reward": 3.2461421489715576, "reward_std": 0.574303925037384, "rewards/evaluation_direction_reward/mean": 0.728515625, "rewards/evaluation_direction_reward/std": 0.12559834122657776, "rewards/format_reward/mean": 0.992968738079071, "rewards/format_reward/std": 0.07954951375722885, "rewards/move_legality_reward/mean": 0.140625, "rewards/move_legality_reward/std": 0.3490002751350403, "rewards/pv_length_reward/mean": 0.3630952537059784, "rewards/pv_length_reward/std": 0.2574384808540344, "rewards/pv_quality_reward/mean": 0.029296875, "rewards/pv_quality_reward/std": 0.1424652636051178, "rewards/verbosity_reward/mean": 0.9916406273841858, "rewards/verbosity_reward/std": 0.08855602145195007, "sampling/importance_sampling_ratio/max": 2.814176082611084, "sampling/importance_sampling_ratio/mean": 0.85819411277771, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8072912693023682, "sampling/sampling_logp_difference/mean": 0.024186497554183006, "step": 36, "step_time": 67.64696326106787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 162.375, "completions/mean_terminated_length": 155.59054565429688, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.7298510260879993, "epoch": 0.0296, "frac_reward_zero_std": 0.375, "grad_norm": 0.5471920967102051, "learning_rate": 9.997726215503422e-06, "loss": 0.0424, "num_tokens": 1872996.0, "reward": 3.198690414428711, "reward_std": 0.5704857110977173, "rewards/evaluation_direction_reward/mean": 0.7421875, "rewards/evaluation_direction_reward/std": 0.06971186399459839, "rewards/format_reward/mean": 0.992968738079071, "rewards/format_reward/std": 0.07954951375722885, "rewards/move_legality_reward/mean": 0.125, "rewards/move_legality_reward/std": 0.3320184051990509, "rewards/pv_length_reward/mean": 0.32306551933288574, "rewards/pv_length_reward/std": 0.24453435838222504, "rewards/pv_quality_reward/mean": 0.0234375, "rewards/pv_quality_reward/std": 0.1383163034915924, "rewards/verbosity_reward/mean": 0.9920312166213989, "rewards/verbosity_reward/std": 0.08839210122823715, "sampling/importance_sampling_ratio/max": 2.5905189514160156, "sampling/importance_sampling_ratio/mean": 0.922892689704895, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4323863983154297, "sampling/sampling_logp_difference/mean": 0.023304445669054985, "step": 37, "step_time": 67.96052779257298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 160.015625, "completions/mean_terminated_length": 153.2126007080078, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.7370035611093044, "epoch": 0.0304, "frac_reward_zero_std": 0.1875, "grad_norm": 0.5714994668960571, "learning_rate": 9.99759814993408e-06, "loss": -0.0238, "num_tokens": 1911990.0, "reward": 3.310751438140869, "reward_std": 0.7223958969116211, "rewards/evaluation_direction_reward/mean": 0.69921875, "rewards/evaluation_direction_reward/std": 0.19682364165782928, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.0883883461356163, "rewards/move_legality_reward/mean": 0.2421875, "rewards/move_legality_reward/std": 0.4300905168056488, "rewards/pv_length_reward/mean": 0.31153273582458496, "rewards/pv_length_reward/std": 0.19324718415737152, "rewards/pv_quality_reward/mean": 0.07421875, "rewards/pv_quality_reward/std": 0.22041338682174683, "rewards/verbosity_reward/mean": 0.991406261920929, "rewards/verbosity_reward/std": 0.08875991404056549, "sampling/importance_sampling_ratio/max": 2.866506814956665, "sampling/importance_sampling_ratio/mean": 0.9299622774124146, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1058385372161865, "sampling/sampling_logp_difference/mean": 0.02403780072927475, "step": 38, "step_time": 67.90115378797054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 156.8828125, "completions/mean_terminated_length": 156.8828125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.7407548204064369, "epoch": 0.0312, "frac_reward_zero_std": 0.375, "grad_norm": 0.5819073915481567, "learning_rate": 9.997466576857974e-06, "loss": 0.0766, "num_tokens": 1950767.0, "reward": 3.249721050262451, "reward_std": 0.5214716792106628, "rewards/evaluation_direction_reward/mean": 0.71875, "rewards/evaluation_direction_reward/std": 0.15686394274234772, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1640625, "rewards/move_legality_reward/std": 0.371787428855896, "rewards/pv_length_reward/mean": 0.3415178656578064, "rewards/pv_length_reward/std": 0.26047155261039734, "rewards/pv_quality_reward/mean": 0.025390625, "rewards/pv_quality_reward/std": 0.13617324829101562, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9269704818725586, "sampling/importance_sampling_ratio/mean": 0.9434336423873901, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.718372106552124, "sampling/sampling_logp_difference/mean": 0.023840289562940598, "step": 39, "step_time": 21.354627683758736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 161.609375, "completions/mean_terminated_length": 154.8188934326172, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.7472634017467499, "epoch": 0.032, "frac_reward_zero_std": 0.25, "grad_norm": 0.6999654769897461, "learning_rate": 9.997331496367455e-06, "loss": 0.0926, "num_tokens": 1990229.0, "reward": 3.2220983505249023, "reward_std": 0.49406856298446655, "rewards/evaluation_direction_reward/mean": 0.7578125, "rewards/evaluation_direction_reward/std": 0.14694225788116455, "rewards/format_reward/mean": 0.992968738079071, "rewards/format_reward/std": 0.07954951375722885, "rewards/move_legality_reward/mean": 0.125, "rewards/move_legality_reward/std": 0.3320184051990509, "rewards/pv_length_reward/mean": 0.34241074323654175, "rewards/pv_length_reward/std": 0.26350125670433044, "rewards/pv_quality_reward/mean": 0.01171875, "rewards/pv_quality_reward/std": 0.08216661959886551, "rewards/verbosity_reward/mean": 0.9921875, "rewards/verbosity_reward/std": 0.0883883461356163, "sampling/importance_sampling_ratio/max": 2.9493746757507324, "sampling/importance_sampling_ratio/mean": 0.8947200179100037, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.738062858581543, "sampling/sampling_logp_difference/mean": 0.024203067645430565, "step": 40, "step_time": 67.82810782641172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 158.046875, "completions/mean_terminated_length": 158.046875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.7450300939381123, "epoch": 0.0328, "frac_reward_zero_std": 0.3125, "grad_norm": 0.6867831945419312, "learning_rate": 9.997192908557322e-06, "loss": 0.0042, "num_tokens": 2029115.0, "reward": 3.1441333293914795, "reward_std": 0.5245504379272461, "rewards/evaluation_direction_reward/mean": 0.6875, "rewards/evaluation_direction_reward/std": 0.19841894507408142, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1484375, "rewards/move_legality_reward/std": 0.356930136680603, "rewards/pv_length_reward/mean": 0.27358630299568176, "rewards/pv_length_reward/std": 0.08224852383136749, "rewards/pv_quality_reward/mean": 0.037109375, "rewards/pv_quality_reward/std": 0.16024662554264069, "rewards/verbosity_reward/mean": 0.9975000023841858, "rewards/verbosity_reward/std": 0.027407683432102203, "sampling/importance_sampling_ratio/max": 2.7430760860443115, "sampling/importance_sampling_ratio/mean": 0.9648184776306152, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7469444274902344, "sampling/sampling_logp_difference/mean": 0.023738941177725792, "step": 41, "step_time": 23.766801618039608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 155.8671875, "completions/mean_terminated_length": 155.8671875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.7497773058712482, "epoch": 0.0336, "frac_reward_zero_std": 0.1875, "grad_norm": 0.6463050246238708, "learning_rate": 9.997050813524843e-06, "loss": 0.0427, "num_tokens": 2067330.0, "reward": 3.270904064178467, "reward_std": 0.4789833724498749, "rewards/evaluation_direction_reward/mean": 0.767578125, "rewards/evaluation_direction_reward/std": 0.0897480845451355, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.125, "rewards/move_legality_reward/std": 0.3320184051990509, "rewards/pv_length_reward/mean": 0.3459821343421936, "rewards/pv_length_reward/std": 0.25775831937789917, "rewards/pv_quality_reward/mean": 0.033203125, "rewards/pv_quality_reward/std": 0.16413424909114838, "rewards/verbosity_reward/mean": 0.9991406202316284, "rewards/verbosity_reward/std": 0.00972271990031004, "sampling/importance_sampling_ratio/max": 2.8234026432037354, "sampling/importance_sampling_ratio/mean": 0.9769226312637329, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7688560485839844, "sampling/sampling_logp_difference/mean": 0.02452355995774269, "step": 42, "step_time": 22.09769831597805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 162.1484375, "completions/mean_terminated_length": 162.1484375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.7687288299202919, "epoch": 0.0344, "frac_reward_zero_std": 0.1875, "grad_norm": 0.8095977902412415, "learning_rate": 9.996905211369748e-06, "loss": -0.0731, "num_tokens": 2106533.0, "reward": 3.14204740524292, "reward_std": 0.42346882820129395, "rewards/evaluation_direction_reward/mean": 0.75, "rewards/evaluation_direction_reward/std": 0.18293322622776031, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1171875, "rewards/move_legality_reward/std": 0.322907418012619, "rewards/pv_length_reward/mean": 0.2479066699743271, "rewards/pv_length_reward/std": 0.08237244933843613, "rewards/pv_quality_reward/mean": 0.02734375, "rewards/pv_quality_reward/std": 0.15444329380989075, "rewards/verbosity_reward/mean": 0.9996093511581421, "rewards/verbosity_reward/std": 0.004419418517500162, "sampling/importance_sampling_ratio/max": 2.870375633239746, "sampling/importance_sampling_ratio/mean": 0.8994949460029602, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9347965717315674, "sampling/sampling_logp_difference/mean": 0.024843700230121613, "step": 43, "step_time": 21.885631695389748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 158.453125, "completions/mean_terminated_length": 158.453125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.7553810700774193, "epoch": 0.0352, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6152256727218628, "learning_rate": 9.996756102194222e-06, "loss": -0.0102, "num_tokens": 2145159.0, "reward": 3.391502857208252, "reward_std": 0.7206579446792603, "rewards/evaluation_direction_reward/mean": 0.712890625, "rewards/evaluation_direction_reward/std": 0.18317477405071259, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.234375, "rewards/move_legality_reward/std": 0.42527204751968384, "rewards/pv_length_reward/mean": 0.3761904835700989, "rewards/pv_length_reward/std": 0.25473394989967346, "rewards/pv_quality_reward/mean": 0.0703125, "rewards/pv_quality_reward/std": 0.2346414178609848, "rewards/verbosity_reward/mean": 0.9977343678474426, "rewards/verbosity_reward/std": 0.025632621720433235, "sampling/importance_sampling_ratio/max": 2.742367744445801, "sampling/importance_sampling_ratio/mean": 0.8713757991790771, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7345540523529053, "sampling/sampling_logp_difference/mean": 0.025292692705988884, "step": 44, "step_time": 21.76990383863449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 158.84375, "completions/mean_terminated_length": 158.84375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.7437651976943016, "epoch": 0.036, "frac_reward_zero_std": 0.0, "grad_norm": 0.6621956825256348, "learning_rate": 9.996603486102918e-06, "loss": 0.0913, "num_tokens": 2184027.0, "reward": 3.4177565574645996, "reward_std": 0.5786046385765076, "rewards/evaluation_direction_reward/mean": 0.767578125, "rewards/evaluation_direction_reward/std": 0.10950674116611481, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.3464285731315613, "rewards/pv_length_reward/std": 0.1798577606678009, "rewards/pv_quality_reward/mean": 0.0546875, "rewards/pv_quality_reward/std": 0.19067199528217316, "rewards/verbosity_reward/mean": 0.9990625381469727, "rewards/verbosity_reward/std": 0.010606602765619755, "sampling/importance_sampling_ratio/max": 2.6467809677124023, "sampling/importance_sampling_ratio/mean": 0.8925071954727173, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.46550917625427246, "sampling/sampling_logp_difference/mean": 0.024237770587205887, "step": 45, "step_time": 22.50900561362505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 161.78125, "completions/mean_terminated_length": 161.78125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.7383739352226257, "epoch": 0.0368, "frac_reward_zero_std": 0.125, "grad_norm": 0.7217046022415161, "learning_rate": 9.996447363202947e-06, "loss": -0.0317, "num_tokens": 2223479.0, "reward": 3.415651321411133, "reward_std": 0.6209614872932434, "rewards/evaluation_direction_reward/mean": 0.78515625, "rewards/evaluation_direction_reward/std": 0.10278778523206711, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.2109375, "rewards/move_legality_reward/std": 0.4095771610736847, "rewards/pv_length_reward/mean": 0.3447916805744171, "rewards/pv_length_reward/std": 0.17793594300746918, "rewards/pv_quality_reward/mean": 0.078125, "rewards/pv_quality_reward/std": 0.2524486482143402, "rewards/verbosity_reward/mean": 0.9966406226158142, "rewards/verbosity_reward/std": 0.038006991147994995, "sampling/importance_sampling_ratio/max": 2.9473042488098145, "sampling/importance_sampling_ratio/mean": 0.9418916702270508, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.3083322048187256, "sampling/sampling_logp_difference/mean": 0.024169940501451492, "step": 46, "step_time": 25.64507805556059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 160.5703125, "completions/mean_terminated_length": 160.5703125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.7689120136201382, "epoch": 0.0376, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6961278319358826, "learning_rate": 9.996287733603883e-06, "loss": -0.0032, "num_tokens": 2262880.0, "reward": 3.2961721420288086, "reward_std": 0.5324179530143738, "rewards/evaluation_direction_reward/mean": 0.765625, "rewards/evaluation_direction_reward/std": 0.09280196577310562, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.21875, "rewards/move_legality_reward/std": 0.41502299904823303, "rewards/pv_length_reward/mean": 0.2718750238418579, "rewards/pv_length_reward/std": 0.08307704329490662, "rewards/pv_quality_reward/mean": 0.041015625, "rewards/pv_quality_reward/std": 0.16234424710273743, "rewards/verbosity_reward/mean": 0.9989062547683716, "rewards/verbosity_reward/std": 0.007759614381939173, "sampling/importance_sampling_ratio/max": 2.9637997150421143, "sampling/importance_sampling_ratio/mean": 0.9281865358352661, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6116471290588379, "sampling/sampling_logp_difference/mean": 0.024828940629959106, "step": 47, "step_time": 23.25685614347458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 164.4765625, "completions/mean_terminated_length": 157.7086639404297, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.7504761926829815, "epoch": 0.0384, "frac_reward_zero_std": 0.1875, "grad_norm": 0.5777831673622131, "learning_rate": 9.99612459741776e-06, "loss": -0.021, "num_tokens": 2302453.0, "reward": 3.3500967025756836, "reward_std": 0.5761553049087524, "rewards/evaluation_direction_reward/mean": 0.7578125, "rewards/evaluation_direction_reward/std": 0.10839514434337616, "rewards/format_reward/mean": 0.992968738079071, "rewards/format_reward/std": 0.07954951375722885, "rewards/move_legality_reward/mean": 0.1875, "rewards/move_legality_reward/std": 0.39184603095054626, "rewards/pv_length_reward/mean": 0.3965030014514923, "rewards/pv_length_reward/std": 0.2920161783695221, "rewards/pv_quality_reward/mean": 0.025390625, "rewards/pv_quality_reward/std": 0.1397404968738556, "rewards/verbosity_reward/mean": 0.9899218678474426, "rewards/verbosity_reward/std": 0.09183600544929504, "sampling/importance_sampling_ratio/max": 2.6730222702026367, "sampling/importance_sampling_ratio/mean": 0.9618862867355347, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9987711906433105, "sampling/sampling_logp_difference/mean": 0.02404855564236641, "step": 48, "step_time": 67.96128482371569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 164.8515625, "completions/mean_terminated_length": 164.8515625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.7571643069386482, "epoch": 0.0392, "frac_reward_zero_std": 0.25, "grad_norm": 0.6725802421569824, "learning_rate": 9.995957954759073e-06, "loss": 0.012, "num_tokens": 2342570.0, "reward": 3.2595274448394775, "reward_std": 0.5475298762321472, "rewards/evaluation_direction_reward/mean": 0.759765625, "rewards/evaluation_direction_reward/std": 0.10118560492992401, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.15625, "rewards/move_legality_reward/std": 0.3645188808441162, "rewards/pv_length_reward/mean": 0.2821056544780731, "rewards/pv_length_reward/std": 0.07048811763525009, "rewards/pv_quality_reward/mean": 0.0625, "rewards/pv_quality_reward/std": 0.20810355246067047, "rewards/verbosity_reward/mean": 0.9989062547683716, "rewards/verbosity_reward/std": 0.012374366633594036, "sampling/importance_sampling_ratio/max": 2.839958429336548, "sampling/importance_sampling_ratio/mean": 0.9254074692726135, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8445730209350586, "sampling/sampling_logp_difference/mean": 0.024585092440247536, "step": 49, "step_time": 23.269100673496723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 164.5625, "completions/mean_terminated_length": 164.5625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 0.765543095767498, "epoch": 0.04, "frac_reward_zero_std": 0.1875, "grad_norm": 0.6083560585975647, "learning_rate": 9.995787805744778e-06, "loss": 0.0092, "num_tokens": 2382026.0, "reward": 3.171086311340332, "reward_std": 0.4755476713180542, "rewards/evaluation_direction_reward/mean": 0.712890625, "rewards/evaluation_direction_reward/std": 0.15714555978775024, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.125, "rewards/move_legality_reward/std": 0.3320184051990509, "rewards/pv_length_reward/mean": 0.31264883279800415, "rewards/pv_length_reward/std": 0.19698427617549896, "rewards/pv_quality_reward/mean": 0.029296875, "rewards/pv_quality_reward/std": 0.15880054235458374, "rewards/verbosity_reward/mean": 0.9912499785423279, "rewards/verbosity_reward/std": 0.08884207159280777, "sampling/importance_sampling_ratio/max": 2.894732713699341, "sampling/importance_sampling_ratio/mean": 0.9009460210800171, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7852058410644531, "sampling/sampling_logp_difference/mean": 0.02552664466202259, "step": 50, "step_time": 24.36190015822649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 163.484375, "completions/mean_terminated_length": 163.484375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.7710943594574928, "epoch": 0.0408, "frac_reward_zero_std": 0.1875, "grad_norm": 0.8463158011436462, "learning_rate": 9.995614150494293e-06, "loss": -0.1366, "num_tokens": 2421272.0, "reward": 3.334244966506958, "reward_std": 0.5453184843063354, "rewards/evaluation_direction_reward/mean": 0.771484375, "rewards/evaluation_direction_reward/std": 0.0703432559967041, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1640625, "rewards/move_legality_reward/std": 0.371787428855896, "rewards/pv_length_reward/mean": 0.3635416626930237, "rewards/pv_length_reward/std": 0.25428736209869385, "rewards/pv_quality_reward/mean": 0.03515625, "rewards/pv_quality_reward/std": 0.16521988809108734, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.942884683609009, "sampling/importance_sampling_ratio/mean": 0.8680646419525146, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.664923906326294, "sampling/sampling_logp_difference/mean": 0.025256389752030373, "step": 51, "step_time": 21.985373340547085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 165.4453125, "completions/mean_terminated_length": 165.4453125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.7798475921154022, "epoch": 0.0416, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6969653367996216, "learning_rate": 9.995436989129495e-06, "loss": -0.0451, "num_tokens": 2460833.0, "reward": 3.220736503601074, "reward_std": 0.41793355345726013, "rewards/evaluation_direction_reward/mean": 0.78125, "rewards/evaluation_direction_reward/std": 0.13310347497463226, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.109375, "rewards/move_legality_reward/std": 0.31333550810813904, "rewards/pv_length_reward/mean": 0.3118303418159485, "rewards/pv_length_reward/std": 0.1051488146185875, "rewards/pv_quality_reward/mean": 0.021484375, "rewards/pv_quality_reward/std": 0.09934502094984055, "rewards/verbosity_reward/mean": 0.9967969059944153, "rewards/verbosity_reward/std": 0.022516261786222458, "sampling/importance_sampling_ratio/max": 2.895958423614502, "sampling/importance_sampling_ratio/mean": 0.877860426902771, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4594693183898926, "sampling/sampling_logp_difference/mean": 0.02466382086277008, "step": 52, "step_time": 22.988937743008137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 166.390625, "completions/mean_terminated_length": 166.390625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.7903901487588882, "epoch": 0.0424, "frac_reward_zero_std": 0.125, "grad_norm": 0.5783517360687256, "learning_rate": 9.995256321774722e-06, "loss": 0.0658, "num_tokens": 2500523.0, "reward": 3.364866256713867, "reward_std": 0.560910165309906, "rewards/evaluation_direction_reward/mean": 0.787109375, "rewards/evaluation_direction_reward/std": 0.08923253417015076, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.2109375, "rewards/move_legality_reward/std": 0.4095771610736847, "rewards/pv_length_reward/mean": 0.33236610889434814, "rewards/pv_length_reward/std": 0.18877319991588593, "rewards/pv_quality_reward/mean": 0.03515625, "rewards/pv_quality_reward/std": 0.16521988809108734, "rewards/verbosity_reward/mean": 0.9992969036102295, "rewards/verbosity_reward/std": 0.007954949513077736, "sampling/importance_sampling_ratio/max": 2.9885942935943604, "sampling/importance_sampling_ratio/mean": 0.8512861132621765, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5357000827789307, "sampling/sampling_logp_difference/mean": 0.025290025398135185, "step": 53, "step_time": 22.327414087951183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 166.875, "completions/mean_terminated_length": 166.875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.7771810740232468, "epoch": 0.0432, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8558650612831116, "learning_rate": 9.995072148556776e-06, "loss": 0.0659, "num_tokens": 2540595.0, "reward": 3.426406145095825, "reward_std": 0.6111400723457336, "rewards/evaluation_direction_reward/mean": 0.81640625, "rewards/evaluation_direction_reward/std": 0.13488247990608215, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1953125, "rewards/move_legality_reward/std": 0.3979988098144531, "rewards/pv_length_reward/mean": 0.36093753576278687, "rewards/pv_length_reward/std": 0.19882020354270935, "rewards/pv_quality_reward/mean": 0.05859375, "rewards/pv_quality_reward/std": 0.20206701755523682, "rewards/verbosity_reward/mean": 0.9951562881469727, "rewards/verbosity_reward/std": 0.03595676273107529, "sampling/importance_sampling_ratio/max": 2.711960792541504, "sampling/importance_sampling_ratio/mean": 0.9555923938751221, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8241837024688721, "sampling/sampling_logp_difference/mean": 0.025408463552594185, "step": 54, "step_time": 24.212016209959984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 169.9765625, "completions/mean_terminated_length": 169.9765625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.7853886261582375, "epoch": 0.044, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6893293261528015, "learning_rate": 9.994884469604913e-06, "loss": -0.0009, "num_tokens": 2580864.0, "reward": 3.263411521911621, "reward_std": 0.5531147718429565, "rewards/evaluation_direction_reward/mean": 0.72265625, "rewards/evaluation_direction_reward/std": 0.2062850445508957, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.171875, "rewards/move_legality_reward/std": 0.3787541687488556, "rewards/pv_length_reward/mean": 0.32278648018836975, "rewards/pv_length_reward/std": 0.11712267994880676, "rewards/pv_quality_reward/mean": 0.048828125, "rewards/pv_quality_reward/std": 0.20102736353874207, "rewards/verbosity_reward/mean": 0.997265636920929, "rewards/verbosity_reward/std": 0.019671106711030006, "sampling/importance_sampling_ratio/max": 2.9478282928466797, "sampling/importance_sampling_ratio/mean": 0.8704152703285217, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0941632986068726, "sampling/sampling_logp_difference/mean": 0.025664357468485832, "step": 55, "step_time": 23.290547765791416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 168.5546875, "completions/mean_terminated_length": 168.5546875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.7870371825993061, "epoch": 0.0448, "frac_reward_zero_std": 0.0625, "grad_norm": 0.543262779712677, "learning_rate": 9.994693285050858e-06, "loss": 0.0953, "num_tokens": 2620943.0, "reward": 3.2977123260498047, "reward_std": 0.5711995363235474, "rewards/evaluation_direction_reward/mean": 0.759765625, "rewards/evaluation_direction_reward/std": 0.16719767451286316, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.21875, "rewards/move_legality_reward/std": 0.41502299904823303, "rewards/pv_length_reward/mean": 0.25357145071029663, "rewards/pv_length_reward/std": 0.19306915998458862, "rewards/pv_quality_reward/mean": 0.0703125, "rewards/pv_quality_reward/std": 0.23253460228443146, "rewards/verbosity_reward/mean": 0.9953124523162842, "rewards/verbosity_reward/std": 0.026677941903471947, "sampling/importance_sampling_ratio/max": 2.987025499343872, "sampling/importance_sampling_ratio/mean": 0.9366671442985535, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.811215877532959, "sampling/sampling_logp_difference/mean": 0.025069553405046463, "step": 56, "step_time": 23.448591202497482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 170.4609375, "completions/mean_terminated_length": 170.4609375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.7965232729911804, "epoch": 0.0456, "frac_reward_zero_std": 0.0, "grad_norm": 0.6594361662864685, "learning_rate": 9.994498595028787e-06, "loss": -0.0041, "num_tokens": 2661202.0, "reward": 3.310486316680908, "reward_std": 0.5327402353286743, "rewards/evaluation_direction_reward/mean": 0.732421875, "rewards/evaluation_direction_reward/std": 0.1723608374595642, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.234375, "rewards/move_legality_reward/std": 0.42527204751968384, "rewards/pv_length_reward/mean": 0.31892359256744385, "rewards/pv_length_reward/std": 0.17824937403202057, "rewards/pv_quality_reward/mean": 0.025390625, "rewards/pv_quality_reward/std": 0.10329151898622513, "rewards/verbosity_reward/mean": 0.9993749856948853, "rewards/verbosity_reward/std": 0.006243105977773666, "sampling/importance_sampling_ratio/max": 2.7127599716186523, "sampling/importance_sampling_ratio/mean": 0.9076737761497498, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49561595916748047, "sampling/sampling_logp_difference/mean": 0.025449281558394432, "step": 57, "step_time": 22.201383143663406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 174.4453125, "completions/mean_terminated_length": 174.4453125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.7849788703024387, "epoch": 0.0464, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8016583919525146, "learning_rate": 9.994300399675342e-06, "loss": -0.0423, "num_tokens": 2702107.0, "reward": 3.371328115463257, "reward_std": 0.5806735157966614, "rewards/evaluation_direction_reward/mean": 0.763671875, "rewards/evaluation_direction_reward/std": 0.11445090174674988, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1796875, "rewards/move_legality_reward/std": 0.3854354918003082, "rewards/pv_length_reward/mean": 0.3812500238418579, "rewards/pv_length_reward/std": 0.2515307366847992, "rewards/pv_quality_reward/mean": 0.052734375, "rewards/pv_quality_reward/std": 0.17369407415390015, "rewards/verbosity_reward/mean": 0.9939843416213989, "rewards/verbosity_reward/std": 0.03207617998123169, "sampling/importance_sampling_ratio/max": 2.7131943702697754, "sampling/importance_sampling_ratio/mean": 0.91136234998703, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4942188262939453, "sampling/sampling_logp_difference/mean": 0.02505636401474476, "step": 58, "step_time": 27.912130549550056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 170.078125, "completions/mean_terminated_length": 170.078125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.7643914930522442, "epoch": 0.0472, "frac_reward_zero_std": 0.1875, "grad_norm": 0.5354869365692139, "learning_rate": 9.994098699129628e-06, "loss": -0.0317, "num_tokens": 2742269.0, "reward": 3.373735189437866, "reward_std": 0.6396881341934204, "rewards/evaluation_direction_reward/mean": 0.734375, "rewards/evaluation_direction_reward/std": 0.14631295204162598, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.21875, "rewards/move_legality_reward/std": 0.41502299904823303, "rewards/pv_length_reward/mean": 0.34248512983322144, "rewards/pv_length_reward/std": 0.19778768718242645, "rewards/pv_quality_reward/mean": 0.080078125, "rewards/pv_quality_reward/std": 0.25280627608299255, "rewards/verbosity_reward/mean": 0.998046875, "rewards/verbosity_reward/std": 0.015975305810570717, "sampling/importance_sampling_ratio/max": 2.1538703441619873, "sampling/importance_sampling_ratio/mean": 0.8538177609443665, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5620508193969727, "sampling/sampling_logp_difference/mean": 0.024901166558265686, "step": 59, "step_time": 23.187653608620167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 172.484375, "completions/mean_terminated_length": 172.484375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.7864728234708309, "epoch": 0.048, "frac_reward_zero_std": 0.125, "grad_norm": 0.6802086234092712, "learning_rate": 9.993893493533203e-06, "loss": -0.0787, "num_tokens": 2782875.0, "reward": 3.298412799835205, "reward_std": 0.47108182311058044, "rewards/evaluation_direction_reward/mean": 0.775390625, "rewards/evaluation_direction_reward/std": 0.10795086622238159, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.21875, "rewards/move_legality_reward/std": 0.41502299904823303, "rewards/pv_length_reward/mean": 0.2906002104282379, "rewards/pv_length_reward/std": 0.12417778372764587, "rewards/pv_quality_reward/mean": 0.015625, "rewards/pv_quality_reward/std": 0.10286256670951843, "rewards/verbosity_reward/mean": 0.998046875, "rewards/verbosity_reward/std": 0.010726271197199821, "sampling/importance_sampling_ratio/max": 2.5518360137939453, "sampling/importance_sampling_ratio/mean": 0.9310914278030396, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.586662769317627, "sampling/sampling_logp_difference/mean": 0.025207214057445526, "step": 60, "step_time": 23.467936851084232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 175.390625, "completions/mean_terminated_length": 175.390625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.7962206527590752, "epoch": 0.0488, "frac_reward_zero_std": 0.0, "grad_norm": 0.701721727848053, "learning_rate": 9.99368478303009e-06, "loss": -0.0467, "num_tokens": 2824045.0, "reward": 3.2590365409851074, "reward_std": 0.5364654064178467, "rewards/evaluation_direction_reward/mean": 0.71875, "rewards/evaluation_direction_reward/std": 0.20331890881061554, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.19687500596046448, "rewards/move_legality_reward/std": 0.3976184129714966, "rewards/pv_length_reward/mean": 0.3083333373069763, "rewards/pv_length_reward/std": 0.11425133794546127, "rewards/pv_quality_reward/mean": 0.037109375, "rewards/pv_quality_reward/std": 0.16627532243728638, "rewards/verbosity_reward/mean": 0.9979687333106995, "rewards/verbosity_reward/std": 0.011928882449865341, "sampling/importance_sampling_ratio/max": 2.8572804927825928, "sampling/importance_sampling_ratio/mean": 0.8598295450210571, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9053612947463989, "sampling/sampling_logp_difference/mean": 0.025146789848804474, "step": 61, "step_time": 23.403426982462406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 174.59375, "completions/mean_terminated_length": 174.59375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.7908320091664791, "epoch": 0.0496, "frac_reward_zero_std": 0.1875, "grad_norm": 0.6148458123207092, "learning_rate": 9.993472567766764e-06, "loss": 0.0218, "num_tokens": 2864889.0, "reward": 3.382760524749756, "reward_std": 0.5364227890968323, "rewards/evaluation_direction_reward/mean": 0.806640625, "rewards/evaluation_direction_reward/std": 0.10506299138069153, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1328125, "rewards/move_legality_reward/std": 0.3407054841518402, "rewards/pv_length_reward/mean": 0.41197916865348816, "rewards/pv_length_reward/std": 0.2449314445257187, "rewards/pv_quality_reward/mean": 0.03515625, "rewards/pv_quality_reward/std": 0.16817213594913483, "rewards/verbosity_reward/mean": 0.9961718320846558, "rewards/verbosity_reward/std": 0.02085273154079914, "sampling/importance_sampling_ratio/max": 2.4945590496063232, "sampling/importance_sampling_ratio/mean": 0.9244694709777832, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5645421743392944, "sampling/sampling_logp_difference/mean": 0.024926748126745224, "step": 62, "step_time": 23.1907971277833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 182.09375, "completions/mean_terminated_length": 182.09375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.8044224791228771, "epoch": 0.0504, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6193534731864929, "learning_rate": 9.993256847892175e-06, "loss": -0.072, "num_tokens": 2906861.0, "reward": 3.405681610107422, "reward_std": 0.5709084272384644, "rewards/evaluation_direction_reward/mean": 0.81640625, "rewards/evaluation_direction_reward/std": 0.11520426720380783, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.2506009638309479, "rewards/move_legality_reward/std": 0.4344188868999481, "rewards/pv_length_reward/mean": 0.30359625816345215, "rewards/pv_length_reward/std": 0.15513227880001068, "rewards/pv_quality_reward/mean": 0.048828125, "rewards/pv_quality_reward/std": 0.1935439109802246, "rewards/verbosity_reward/mean": 0.9862500429153442, "rewards/verbosity_reward/std": 0.05702603608369827, "sampling/importance_sampling_ratio/max": 2.592485189437866, "sampling/importance_sampling_ratio/mean": 0.8203837275505066, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5609418153762817, "sampling/sampling_logp_difference/mean": 0.025368066504597664, "step": 63, "step_time": 25.03784531354904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 193.8125, "completions/mean_terminated_length": 180.6349334716797, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.7895400822162628, "epoch": 0.0512, "frac_reward_zero_std": 0.0, "grad_norm": 0.6988568902015686, "learning_rate": 9.993037623557716e-06, "loss": 0.0821, "num_tokens": 2949765.0, "reward": 3.1963541507720947, "reward_std": 0.606959879398346, "rewards/evaluation_direction_reward/mean": 0.705078125, "rewards/evaluation_direction_reward/std": 0.20912639796733856, "rewards/format_reward/mean": 0.9859374761581421, "rewards/format_reward/std": 0.11205621063709259, "rewards/move_legality_reward/mean": 0.125, "rewards/move_legality_reward/std": 0.3320184051990509, "rewards/pv_length_reward/mean": 0.3760416805744171, "rewards/pv_length_reward/std": 0.16345663368701935, "rewards/pv_quality_reward/mean": 0.02734375, "rewards/pv_quality_reward/std": 0.10515443980693817, "rewards/verbosity_reward/mean": 0.9769531488418579, "rewards/verbosity_reward/std": 0.1294054090976715, "sampling/importance_sampling_ratio/max": 2.7167160511016846, "sampling/importance_sampling_ratio/mean": 0.8723496198654175, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49741077423095703, "sampling/sampling_logp_difference/mean": 0.02460077591240406, "step": 64, "step_time": 68.30778413265944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 184.1953125, "completions/mean_terminated_length": 184.1953125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.8275677375495434, "epoch": 0.052, "frac_reward_zero_std": 0.0, "grad_norm": 0.6256161332130432, "learning_rate": 9.992814894917251e-06, "loss": -0.0457, "num_tokens": 2991902.0, "reward": 3.522669792175293, "reward_std": 0.6729851961135864, "rewards/evaluation_direction_reward/mean": 0.826171875, "rewards/evaluation_direction_reward/std": 0.14566773176193237, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.2109375, "rewards/move_legality_reward/std": 0.4095771610736847, "rewards/pv_length_reward/mean": 0.4084511399269104, "rewards/pv_length_reward/std": 0.22093993425369263, "rewards/pv_quality_reward/mean": 0.0859375, "rewards/pv_quality_reward/std": 0.23776663839817047, "rewards/verbosity_reward/mean": 0.9911718368530273, "rewards/verbosity_reward/std": 0.03517383337020874, "sampling/importance_sampling_ratio/max": 2.5357069969177246, "sampling/importance_sampling_ratio/mean": 0.8204823732376099, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4760289192199707, "sampling/sampling_logp_difference/mean": 0.02570314146578312, "step": 65, "step_time": 23.97499793767929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 183.1640625, "completions/mean_terminated_length": 183.1640625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.8003145940601826, "epoch": 0.0528, "frac_reward_zero_std": 0.0, "grad_norm": 0.7777498960494995, "learning_rate": 9.9925886621271e-06, "loss": 0.0423, "num_tokens": 3033779.0, "reward": 3.349093198776245, "reward_std": 0.41966474056243896, "rewards/evaluation_direction_reward/mean": 0.814453125, "rewards/evaluation_direction_reward/std": 0.12249896675348282, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1875, "rewards/move_legality_reward/std": 0.39184603095054626, "rewards/pv_length_reward/mean": 0.3530776798725128, "rewards/pv_length_reward/std": 0.21247678995132446, "rewards/pv_quality_reward/mean": 0.00390625, "rewards/pv_quality_reward/std": 0.04419417306780815, "rewards/verbosity_reward/mean": 0.9901562333106995, "rewards/verbosity_reward/std": 0.03930467367172241, "sampling/importance_sampling_ratio/max": 2.9903295040130615, "sampling/importance_sampling_ratio/mean": 0.9527254104614258, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4645366668701172, "sampling/sampling_logp_difference/mean": 0.025757255032658577, "step": 66, "step_time": 25.84467777609825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 186.953125, "completions/mean_terminated_length": 186.953125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.8163335882127285, "epoch": 0.0536, "frac_reward_zero_std": 0.0, "grad_norm": 0.564437985420227, "learning_rate": 9.99235892534604e-06, "loss": 0.0829, "num_tokens": 3075917.0, "reward": 3.268645763397217, "reward_std": 0.4906236529350281, "rewards/evaluation_direction_reward/mean": 0.7109375, "rewards/evaluation_direction_reward/std": 0.22718212008476257, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.11328125, "rewards/move_legality_reward/std": 0.31507307291030884, "rewards/pv_length_reward/mean": 0.39895832538604736, "rewards/pv_length_reward/std": 0.15471236407756805, "rewards/pv_quality_reward/mean": 0.05859375, "rewards/pv_quality_reward/std": 0.222911074757576, "rewards/verbosity_reward/mean": 0.9868749976158142, "rewards/verbosity_reward/std": 0.054998211562633514, "sampling/importance_sampling_ratio/max": 2.977252960205078, "sampling/importance_sampling_ratio/mean": 0.8806940913200378, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1535568237304688, "sampling/sampling_logp_difference/mean": 0.02551642619073391, "step": 67, "step_time": 24.96667180210352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 192.609375, "completions/mean_terminated_length": 192.609375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.8299113884568214, "epoch": 0.0544, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5829207301139832, "learning_rate": 9.99212568473531e-06, "loss": 0.0209, "num_tokens": 3119067.0, "reward": 3.259587287902832, "reward_std": 0.4889030158519745, "rewards/evaluation_direction_reward/mean": 0.794921875, "rewards/evaluation_direction_reward/std": 0.1702059656381607, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.171875, "rewards/move_legality_reward/std": 0.3787541687488556, "rewards/pv_length_reward/mean": 0.2973214387893677, "rewards/pv_length_reward/std": 0.12955951690673828, "rewards/pv_quality_reward/mean": 0.021484375, "rewards/pv_quality_reward/std": 0.12161698937416077, "rewards/verbosity_reward/mean": 0.9739843606948853, "rewards/verbosity_reward/std": 0.07568417489528656, "sampling/importance_sampling_ratio/max": 2.7081961631774902, "sampling/importance_sampling_ratio/mean": 0.8975620269775391, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5678520202636719, "sampling/sampling_logp_difference/mean": 0.026339726522564888, "step": 68, "step_time": 25.178516030311584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 192.5078125, "completions/mean_terminated_length": 192.5078125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.8321091830730438, "epoch": 0.0552, "frac_reward_zero_std": 0.0625, "grad_norm": 0.648246169090271, "learning_rate": 9.991888940458605e-06, "loss": -0.0123, "num_tokens": 3162116.0, "reward": 3.2435901165008545, "reward_std": 0.4025772213935852, "rewards/evaluation_direction_reward/mean": 0.810546875, "rewards/evaluation_direction_reward/std": 0.13940994441509247, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1015625, "rewards/move_legality_reward/std": 0.3032590448856354, "rewards/pv_length_reward/mean": 0.33757442235946655, "rewards/pv_length_reward/std": 0.19636130332946777, "rewards/pv_quality_reward/mean": 0.009765625, "rewards/pv_quality_reward/std": 0.05786850303411484, "rewards/verbosity_reward/mean": 0.9841406345367432, "rewards/verbosity_reward/std": 0.04293425381183624, "sampling/importance_sampling_ratio/max": 2.6919782161712646, "sampling/importance_sampling_ratio/mean": 0.8792309165000916, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.556157112121582, "sampling/sampling_logp_difference/mean": 0.02574452944099903, "step": 69, "step_time": 24.05536651611328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1024.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 209.0625, "completions/mean_terminated_length": 196.1269989013672, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.8182870224118233, "epoch": 0.056, "frac_reward_zero_std": 0.0, "grad_norm": 0.6632302403450012, "learning_rate": 9.991648692682083e-06, "loss": 0.094, "num_tokens": 3207396.0, "reward": 3.21010422706604, "reward_std": 0.6435966491699219, "rewards/evaluation_direction_reward/mean": 0.7421875, "rewards/evaluation_direction_reward/std": 0.2528139054775238, "rewards/format_reward/mean": 0.9859374761581421, "rewards/format_reward/std": 0.11205621063709259, "rewards/move_legality_reward/mean": 0.15625, "rewards/move_legality_reward/std": 0.3645188808441162, "rewards/pv_length_reward/mean": 0.31822916865348816, "rewards/pv_length_reward/std": 0.11042425781488419, "rewards/pv_quality_reward/mean": 0.048828125, "rewards/pv_quality_reward/std": 0.21291609108448029, "rewards/verbosity_reward/mean": 0.9586718678474426, "rewards/verbosity_reward/std": 0.13549774885177612, "sampling/importance_sampling_ratio/max": 2.6625349521636963, "sampling/importance_sampling_ratio/mean": 0.8818848133087158, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.648794412612915, "sampling/sampling_logp_difference/mean": 0.02595561370253563, "step": 70, "step_time": 68.33396926522255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 193.4453125, "completions/mean_terminated_length": 193.4453125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.8175845853984356, "epoch": 0.0568, "frac_reward_zero_std": 0.0, "grad_norm": 0.5589584708213806, "learning_rate": 9.99140494157436e-06, "loss": 0.0272, "num_tokens": 3250877.0, "reward": 3.2982850074768066, "reward_std": 0.4585789740085602, "rewards/evaluation_direction_reward/mean": 0.78515625, "rewards/evaluation_direction_reward/std": 0.16817213594913483, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.15625, "rewards/move_legality_reward/std": 0.3645188808441162, "rewards/pv_length_reward/mean": 0.3452380895614624, "rewards/pv_length_reward/std": 0.14393047988414764, "rewards/pv_quality_reward/mean": 0.025390625, "rewards/pv_quality_reward/std": 0.13617324829101562, "rewards/verbosity_reward/mean": 0.9862500429153442, "rewards/verbosity_reward/std": 0.04052801430225372, "sampling/importance_sampling_ratio/max": 2.741119861602783, "sampling/importance_sampling_ratio/mean": 0.8338184356689453, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0769603252410889, "sampling/sampling_logp_difference/mean": 0.02685299888253212, "step": 71, "step_time": 24.37147957086563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 197.59375, "completions/mean_terminated_length": 197.59375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.8362932093441486, "epoch": 0.0576, "frac_reward_zero_std": 0.0, "grad_norm": 0.6466670036315918, "learning_rate": 9.99115768730651e-06, "loss": -0.0021, "num_tokens": 3294697.0, "reward": 3.3107552528381348, "reward_std": 0.449493408203125, "rewards/evaluation_direction_reward/mean": 0.849609375, "rewards/evaluation_direction_reward/std": 0.16404053568840027, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1171875, "rewards/move_legality_reward/std": 0.322907418012619, "rewards/pv_length_reward/mean": 0.3333333730697632, "rewards/pv_length_reward/std": 0.176680788397789, "rewards/pv_quality_reward/mean": 0.03125, "rewards/pv_quality_reward/std": 0.1471514254808426, "rewards/verbosity_reward/mean": 0.9793750047683716, "rewards/verbosity_reward/std": 0.05182876065373421, "sampling/importance_sampling_ratio/max": 2.7015717029571533, "sampling/importance_sampling_ratio/mean": 0.7875320911407471, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7117123603820801, "sampling/sampling_logp_difference/mean": 0.025689849629998207, "step": 72, "step_time": 24.14013072103262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 203.8828125, "completions/mean_terminated_length": 197.42520141601562, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.8309086076915264, "epoch": 0.0584, "frac_reward_zero_std": 0.0, "grad_norm": 0.5317049026489258, "learning_rate": 9.990906930052065e-06, "loss": 0.1404, "num_tokens": 3339474.0, "reward": 3.321093797683716, "reward_std": 0.5457583665847778, "rewards/evaluation_direction_reward/mean": 0.806640625, "rewards/evaluation_direction_reward/std": 0.20140951871871948, "rewards/format_reward/mean": 0.992968738079071, "rewards/format_reward/std": 0.07954951375722885, "rewards/move_legality_reward/mean": 0.1484375, "rewards/move_legality_reward/std": 0.356930136680603, "rewards/pv_length_reward/mean": 0.39375001192092896, "rewards/pv_length_reward/std": 0.15971048176288605, "rewards/pv_quality_reward/mean": 0.013671875, "rewards/pv_quality_reward/std": 0.09571834653615952, "rewards/verbosity_reward/mean": 0.965624988079071, "rewards/verbosity_reward/std": 0.11051597446203232, "sampling/importance_sampling_ratio/max": 2.9661707878112793, "sampling/importance_sampling_ratio/mean": 0.763164758682251, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6238114833831787, "sampling/sampling_logp_difference/mean": 0.026166090741753578, "step": 73, "step_time": 68.01223184913397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 201.5078125, "completions/mean_terminated_length": 195.031494140625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.8293045982718468, "epoch": 0.0592, "frac_reward_zero_std": 0.125, "grad_norm": 0.6283258199691772, "learning_rate": 9.990652669987016e-06, "loss": -0.082, "num_tokens": 3384219.0, "reward": 3.0658743381500244, "reward_std": 0.5185508728027344, "rewards/evaluation_direction_reward/mean": 0.71875, "rewards/evaluation_direction_reward/std": 0.23892804980278015, "rewards/format_reward/mean": 0.992968738079071, "rewards/format_reward/std": 0.07954951375722885, "rewards/move_legality_reward/mean": 0.109375, "rewards/move_legality_reward/std": 0.31333550810813904, "rewards/pv_length_reward/mean": 0.24501490592956543, "rewards/pv_length_reward/std": 0.1246902123093605, "rewards/pv_quality_reward/mean": 0.025390625, "rewards/pv_quality_reward/std": 0.14661483466625214, "rewards/verbosity_reward/mean": 0.9743750095367432, "rewards/verbosity_reward/std": 0.1046309545636177, "sampling/importance_sampling_ratio/max": 2.496654748916626, "sampling/importance_sampling_ratio/mean": 0.7628964185714722, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4971919059753418, "sampling/sampling_logp_difference/mean": 0.026151282712817192, "step": 74, "step_time": 68.18953393399715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 205.015625, "completions/mean_terminated_length": 198.56692504882812, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.837326031178236, "epoch": 0.06, "frac_reward_zero_std": 0.0, "grad_norm": 0.7175588011741638, "learning_rate": 9.990394907289811e-06, "loss": 0.1643, "num_tokens": 3428701.0, "reward": 3.3068602085113525, "reward_std": 0.6110777258872986, "rewards/evaluation_direction_reward/mean": 0.818359375, "rewards/evaluation_direction_reward/std": 0.20504187047481537, "rewards/format_reward/mean": 0.992968738079071, "rewards/format_reward/std": 0.07954951375722885, "rewards/move_legality_reward/mean": 0.1484375, "rewards/move_legality_reward/std": 0.356930136680603, "rewards/pv_length_reward/mean": 0.3534226417541504, "rewards/pv_length_reward/std": 0.22674433887004852, "rewards/pv_quality_reward/mean": 0.025390625, "rewards/pv_quality_reward/std": 0.12486151605844498, "rewards/verbosity_reward/mean": 0.9682812690734863, "rewards/verbosity_reward/std": 0.10217408090829849, "sampling/importance_sampling_ratio/max": 2.5660698413848877, "sampling/importance_sampling_ratio/mean": 0.8441048860549927, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9361478090286255, "sampling/sampling_logp_difference/mean": 0.02669171243906021, "step": 75, "step_time": 67.98095198720694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 198.8984375, "completions/mean_terminated_length": 192.40158081054688, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.8157648481428623, "epoch": 0.0608, "frac_reward_zero_std": 0.0, "grad_norm": 0.781823456287384, "learning_rate": 9.990133642141359e-06, "loss": -0.0823, "num_tokens": 3472480.0, "reward": 3.216151475906372, "reward_std": 0.5876901745796204, "rewards/evaluation_direction_reward/mean": 0.7578125, "rewards/evaluation_direction_reward/std": 0.2566775679588318, "rewards/format_reward/mean": 0.992968738079071, "rewards/format_reward/std": 0.07954951375722885, "rewards/move_legality_reward/mean": 0.1171875, "rewards/move_legality_reward/std": 0.322907418012619, "rewards/pv_length_reward/mean": 0.34138599038124084, "rewards/pv_length_reward/std": 0.23778969049453735, "rewards/pv_quality_reward/mean": 0.033203125, "rewards/pv_quality_reward/std": 0.1516675502061844, "rewards/verbosity_reward/mean": 0.9735937714576721, "rewards/verbosity_reward/std": 0.10836461186408997, "sampling/importance_sampling_ratio/max": 2.9362480640411377, "sampling/importance_sampling_ratio/mean": 0.9162384271621704, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.735630989074707, "sampling/sampling_logp_difference/mean": 0.025474566966295242, "step": 76, "step_time": 68.0809210985899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 191.09375, "completions/mean_terminated_length": 191.09375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.8165283687412739, "epoch": 0.0616, "frac_reward_zero_std": 0.0, "grad_norm": 0.854888379573822, "learning_rate": 9.989868874725026e-06, "loss": 0.0614, "num_tokens": 3515404.0, "reward": 3.307072162628174, "reward_std": 0.5380861163139343, "rewards/evaluation_direction_reward/mean": 0.779296875, "rewards/evaluation_direction_reward/std": 0.23397685587406158, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.140625, "rewards/move_legality_reward/std": 0.3490002751350403, "rewards/pv_length_reward/mean": 0.3351190686225891, "rewards/pv_length_reward/std": 0.141143336892128, "rewards/pv_quality_reward/mean": 0.060546875, "rewards/pv_quality_reward/std": 0.212192565202713, "rewards/verbosity_reward/mean": 0.9914844036102295, "rewards/verbosity_reward/std": 0.031823575496673584, "sampling/importance_sampling_ratio/max": 2.772390127182007, "sampling/importance_sampling_ratio/mean": 0.9425092339515686, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7733529806137085, "sampling/sampling_logp_difference/mean": 0.026152830570936203, "step": 77, "step_time": 23.28309803456068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 1024.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 213.28125, "completions/mean_terminated_length": 193.82400512695312, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.7979506775736809, "epoch": 0.0624, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6489567160606384, "learning_rate": 9.989600605226637e-06, "loss": 0.0693, "num_tokens": 3561264.0, "reward": 3.2568228244781494, "reward_std": 0.6942301392555237, "rewards/evaluation_direction_reward/mean": 0.76171875, "rewards/evaluation_direction_reward/std": 0.26779091358184814, "rewards/format_reward/mean": 0.979687511920929, "rewards/format_reward/std": 0.1318310648202896, "rewards/move_legality_reward/mean": 0.1875, "rewards/move_legality_reward/std": 0.39184603095054626, "rewards/pv_length_reward/mean": 0.3213541805744171, "rewards/pv_length_reward/std": 0.13752955198287964, "rewards/pv_quality_reward/mean": 0.046875, "rewards/pv_quality_reward/std": 0.18758200109004974, "rewards/verbosity_reward/mean": 0.9596874713897705, "rewards/verbosity_reward/std": 0.13558818399906158, "sampling/importance_sampling_ratio/max": 2.9937121868133545, "sampling/importance_sampling_ratio/mean": 0.9434062838554382, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6270439624786377, "sampling/sampling_logp_difference/mean": 0.02404472418129444, "step": 78, "step_time": 68.42001333087683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 194.6640625, "completions/mean_terminated_length": 194.6640625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.8085077852010727, "epoch": 0.0632, "frac_reward_zero_std": 0.0, "grad_norm": 0.6440566182136536, "learning_rate": 9.989328833834472e-06, "loss": 0.033, "num_tokens": 3604853.0, "reward": 3.510925054550171, "reward_std": 0.6392576098442078, "rewards/evaluation_direction_reward/mean": 0.8046875, "rewards/evaluation_direction_reward/std": 0.16876548528671265, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.3882688581943512, "rewards/pv_length_reward/std": 0.2383068948984146, "rewards/pv_quality_reward/mean": 0.083984375, "rewards/pv_quality_reward/std": 0.2611251771450043, "rewards/verbosity_reward/mean": 0.9839843511581421, "rewards/verbosity_reward/std": 0.09163724631071091, "sampling/importance_sampling_ratio/max": 2.829496145248413, "sampling/importance_sampling_ratio/mean": 0.8221385478973389, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49064111709594727, "sampling/sampling_logp_difference/mean": 0.026148468255996704, "step": 79, "step_time": 42.33941093087196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 192.109375, "completions/mean_terminated_length": 192.109375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.8259737119078636, "epoch": 0.064, "frac_reward_zero_std": 0.0, "grad_norm": 0.8017685413360596, "learning_rate": 9.989053560739272e-06, "loss": 0.0951, "num_tokens": 3647891.0, "reward": 3.4814844131469727, "reward_std": 0.6999573707580566, "rewards/evaluation_direction_reward/mean": 0.861328125, "rewards/evaluation_direction_reward/std": 0.21464261412620544, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1796875, "rewards/move_legality_reward/std": 0.3854354918003082, "rewards/pv_length_reward/mean": 0.41484376788139343, "rewards/pv_length_reward/std": 0.22228467464447021, "rewards/pv_quality_reward/mean": 0.044921875, "rewards/pv_quality_reward/std": 0.20436571538448334, "rewards/verbosity_reward/mean": 0.9807031154632568, "rewards/verbosity_reward/std": 0.06371916830539703, "sampling/importance_sampling_ratio/max": 2.76517653465271, "sampling/importance_sampling_ratio/mean": 0.8836460113525391, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5489654541015625, "sampling/sampling_logp_difference/mean": 0.02566969022154808, "step": 80, "step_time": 25.885647669434547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 190.2578125, "completions/mean_terminated_length": 190.2578125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.8139186576008797, "epoch": 0.0648, "frac_reward_zero_std": 0.0, "grad_norm": 0.601330578327179, "learning_rate": 9.988774786134235e-06, "loss": -0.0268, "num_tokens": 3690556.0, "reward": 3.3305344581604004, "reward_std": 0.608963668346405, "rewards/evaluation_direction_reward/mean": 0.796875, "rewards/evaluation_direction_reward/std": 0.2278580516576767, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.125, "rewards/move_legality_reward/std": 0.3320184051990509, "rewards/pv_length_reward/mean": 0.3632688522338867, "rewards/pv_length_reward/std": 0.21617741882801056, "rewards/pv_quality_reward/mean": 0.0546875, "rewards/pv_quality_reward/std": 0.21030887961387634, "rewards/verbosity_reward/mean": 0.9907031059265137, "rewards/verbosity_reward/std": 0.03207235038280487, "sampling/importance_sampling_ratio/max": 2.6052262783050537, "sampling/importance_sampling_ratio/mean": 0.906460165977478, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6201156377792358, "sampling/sampling_logp_difference/mean": 0.02610577829182148, "step": 81, "step_time": 23.500829339027405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 189.203125, "completions/mean_terminated_length": 189.203125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.8099965378642082, "epoch": 0.0656, "frac_reward_zero_std": 0.0, "grad_norm": 0.8673341870307922, "learning_rate": 9.988492510215011e-06, "loss": 0.1362, "num_tokens": 3733326.0, "reward": 3.3693227767944336, "reward_std": 0.48802265524864197, "rewards/evaluation_direction_reward/mean": 0.802734375, "rewards/evaluation_direction_reward/std": 0.1873256266117096, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.140625, "rewards/move_legality_reward/std": 0.3490002751350403, "rewards/pv_length_reward/mean": 0.39479169249534607, "rewards/pv_length_reward/std": 0.1800754815340042, "rewards/pv_quality_reward/mean": 0.046875, "rewards/pv_quality_reward/std": 0.1767766922712326, "rewards/verbosity_reward/mean": 0.9842968583106995, "rewards/verbosity_reward/std": 0.04341130331158638, "sampling/importance_sampling_ratio/max": 2.8309550285339355, "sampling/importance_sampling_ratio/mean": 0.9380918741226196, "sampling/importance_sampling_ratio/min": 0.07980255782604218, "sampling/sampling_logp_difference/max": 0.49987274408340454, "sampling/sampling_logp_difference/mean": 0.025935955345630646, "step": 82, "step_time": 24.10129176825285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 196.046875, "completions/mean_terminated_length": 196.046875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.8029222451150417, "epoch": 0.0664, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5957927703857422, "learning_rate": 9.988206733179718e-06, "loss": 0.148, "num_tokens": 3777060.0, "reward": 3.4808292388916016, "reward_std": 0.5601513385772705, "rewards/evaluation_direction_reward/mean": 0.8671875, "rewards/evaluation_direction_reward/std": 0.1854381412267685, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.28135016560554504, "rewards/move_legality_reward/std": 0.45131435990333557, "rewards/pv_length_reward/mean": 0.30885419249534607, "rewards/pv_length_reward/std": 0.13356177508831024, "rewards/pv_quality_reward/mean": 0.029296875, "rewards/pv_quality_reward/std": 0.15880054235458374, "rewards/verbosity_reward/mean": 0.994140625, "rewards/verbosity_reward/std": 0.02865838259458542, "sampling/importance_sampling_ratio/max": 2.8127832412719727, "sampling/importance_sampling_ratio/mean": 0.901707649230957, "sampling/importance_sampling_ratio/min": 0.09717939794063568, "sampling/sampling_logp_difference/max": 0.49086999893188477, "sampling/sampling_logp_difference/mean": 0.025764398276805878, "step": 83, "step_time": 53.24397201091051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 189.7734375, "completions/mean_terminated_length": 189.7734375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.8207036554813385, "epoch": 0.0672, "frac_reward_zero_std": 0.0, "grad_norm": 0.6683021783828735, "learning_rate": 9.987917455228924e-06, "loss": 0.0091, "num_tokens": 3819775.0, "reward": 3.499966621398926, "reward_std": 0.6441566944122314, "rewards/evaluation_direction_reward/mean": 0.86328125, "rewards/evaluation_direction_reward/std": 0.13258251547813416, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.15625, "rewards/move_legality_reward/std": 0.3645188808441162, "rewards/pv_length_reward/mean": 0.43504464626312256, "rewards/pv_length_reward/std": 0.20723064243793488, "rewards/pv_quality_reward/mean": 0.060546875, "rewards/pv_quality_reward/std": 0.21903981268405914, "rewards/verbosity_reward/mean": 0.9848437309265137, "rewards/verbosity_reward/std": 0.05307823047041893, "sampling/importance_sampling_ratio/max": 2.85528302192688, "sampling/importance_sampling_ratio/mean": 0.7857295870780945, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9508609771728516, "sampling/sampling_logp_difference/mean": 0.025852257385849953, "step": 84, "step_time": 24.333909079432487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 185.7890625, "completions/mean_terminated_length": 185.7890625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.8046436980366707, "epoch": 0.068, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7313711047172546, "learning_rate": 9.987624676565652e-06, "loss": 0.0228, "num_tokens": 3862068.0, "reward": 3.3090932369232178, "reward_std": 0.5023424625396729, "rewards/evaluation_direction_reward/mean": 0.8125, "rewards/evaluation_direction_reward/std": 0.18022295832633972, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.09375, "rewards/move_legality_reward/std": 0.29262590408325195, "rewards/pv_length_reward/mean": 0.38042140007019043, "rewards/pv_length_reward/std": 0.258477121591568, "rewards/pv_quality_reward/mean": 0.029296875, "rewards/pv_quality_reward/std": 0.11996182799339294, "rewards/verbosity_reward/mean": 0.9931249618530273, "rewards/verbosity_reward/std": 0.03610461577773094, "sampling/importance_sampling_ratio/max": 2.9123475551605225, "sampling/importance_sampling_ratio/mean": 0.896617591381073, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6486577987670898, "sampling/sampling_logp_difference/mean": 0.02593923546373844, "step": 85, "step_time": 24.745100028812885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 195.6875, "completions/mean_terminated_length": 189.1653594970703, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.8042715638875961, "epoch": 0.0688, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7673628330230713, "learning_rate": 9.987328397395389e-06, "loss": 0.131, "num_tokens": 3905868.0, "reward": 3.41934871673584, "reward_std": 0.541120171546936, "rewards/evaluation_direction_reward/mean": 0.912109375, "rewards/evaluation_direction_reward/std": 0.1457732617855072, "rewards/format_reward/mean": 0.9906250238418579, "rewards/format_reward/std": 0.08365423232316971, "rewards/move_legality_reward/mean": 0.140625, "rewards/move_legality_reward/std": 0.3490002751350403, "rewards/pv_length_reward/mean": 0.3708333373069763, "rewards/pv_length_reward/std": 0.18170958757400513, "rewards/pv_quality_reward/mean": 0.021484375, "rewards/pv_quality_reward/std": 0.11750081181526184, "rewards/verbosity_reward/mean": 0.9836719036102295, "rewards/verbosity_reward/std": 0.0932600349187851, "sampling/importance_sampling_ratio/max": 2.751631021499634, "sampling/importance_sampling_ratio/mean": 0.9254558682441711, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8656858205795288, "sampling/sampling_logp_difference/mean": 0.025532620027661324, "step": 86, "step_time": 68.05757850408554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 187.0078125, "completions/mean_terminated_length": 187.0078125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.8124808073043823, "epoch": 0.0696, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6367681622505188, "learning_rate": 9.987028617926074e-06, "loss": -0.11, "num_tokens": 3948309.0, "reward": 3.3322060108184814, "reward_std": 0.5850538015365601, "rewards/evaluation_direction_reward/mean": 0.806640625, "rewards/evaluation_direction_reward/std": 0.2756789028644562, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.21875, "rewards/move_legality_reward/std": 0.41502299904823303, "rewards/pv_length_reward/mean": 0.2808779776096344, "rewards/pv_length_reward/std": 0.0777883306145668, "rewards/pv_quality_reward/mean": 0.033203125, "rewards/pv_quality_reward/std": 0.16110803186893463, "rewards/verbosity_reward/mean": 0.9927343726158142, "rewards/verbosity_reward/std": 0.029449405148625374, "sampling/importance_sampling_ratio/max": 2.945777654647827, "sampling/importance_sampling_ratio/mean": 0.8711270093917847, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5967534780502319, "sampling/sampling_logp_difference/mean": 0.0257500559091568, "step": 87, "step_time": 23.54186137765646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 184.546875, "completions/mean_terminated_length": 184.546875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.8047505728900433, "epoch": 0.0704, "frac_reward_zero_std": 0.0, "grad_norm": 0.7355842590332031, "learning_rate": 9.986725338368103e-06, "loss": 0.0408, "num_tokens": 3990555.0, "reward": 3.191265106201172, "reward_std": 0.6215701699256897, "rewards/evaluation_direction_reward/mean": 0.701171875, "rewards/evaluation_direction_reward/std": 0.39691054821014404, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.140625, "rewards/move_legality_reward/std": 0.3490002751350403, "rewards/pv_length_reward/mean": 0.2813429832458496, "rewards/pv_length_reward/std": 0.11949289590120316, "rewards/pv_quality_reward/mean": 0.07421875, "rewards/pv_quality_reward/std": 0.2516860067844391, "rewards/verbosity_reward/mean": 0.9939062595367432, "rewards/verbosity_reward/std": 0.022280758246779442, "sampling/importance_sampling_ratio/max": 2.962191581726074, "sampling/importance_sampling_ratio/mean": 0.8543453812599182, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49292564392089844, "sampling/sampling_logp_difference/mean": 0.02513302117586136, "step": 88, "step_time": 23.116744741797447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 192.90625, "completions/mean_terminated_length": 186.36219787597656, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.7929396964609623, "epoch": 0.0712, "frac_reward_zero_std": 0.0, "grad_norm": 0.7569488883018494, "learning_rate": 9.986418558934329e-06, "loss": 0.0945, "num_tokens": 4034007.0, "reward": 3.3697879314422607, "reward_std": 0.5503093600273132, "rewards/evaluation_direction_reward/mean": 0.875, "rewards/evaluation_direction_reward/std": 0.18560393154621124, "rewards/format_reward/mean": 0.992968738079071, "rewards/format_reward/std": 0.07954951375722885, "rewards/move_legality_reward/mean": 0.15625, "rewards/move_legality_reward/std": 0.3645188808441162, "rewards/pv_length_reward/mean": 0.3569754660129547, "rewards/pv_length_reward/std": 0.21277205646038055, "rewards/pv_quality_reward/mean": 0.00390625, "rewards/pv_quality_reward/std": 0.031126724556088448, "rewards/verbosity_reward/mean": 0.9846875071525574, "rewards/verbosity_reward/std": 0.09331124275922775, "sampling/importance_sampling_ratio/max": 2.7220616340637207, "sampling/importance_sampling_ratio/mean": 0.8840839266777039, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.675990641117096, "sampling/sampling_logp_difference/mean": 0.025487907230854034, "step": 89, "step_time": 68.58280087262392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 189.1953125, "completions/mean_terminated_length": 189.1953125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.8091855458915234, "epoch": 0.072, "frac_reward_zero_std": 0.0, "grad_norm": 0.6298447251319885, "learning_rate": 9.986108279840063e-06, "loss": 0.0006, "num_tokens": 4076696.0, "reward": 3.294869899749756, "reward_std": 0.45263051986694336, "rewards/evaluation_direction_reward/mean": 0.826171875, "rewards/evaluation_direction_reward/std": 0.2745048999786377, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1328125, "rewards/move_legality_reward/std": 0.3407054841518402, "rewards/pv_length_reward/mean": 0.3229166865348816, "rewards/pv_length_reward/std": 0.13133342564105988, "rewards/pv_quality_reward/mean": 0.0234375, "rewards/pv_quality_reward/std": 0.11064191907644272, "rewards/verbosity_reward/mean": 0.9895312786102295, "rewards/verbosity_reward/std": 0.04452477768063545, "sampling/importance_sampling_ratio/max": 2.927372455596924, "sampling/importance_sampling_ratio/mean": 0.8742789030075073, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7687332630157471, "sampling/sampling_logp_difference/mean": 0.025695964694023132, "step": 90, "step_time": 28.9696638956666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 185.3984375, "completions/mean_terminated_length": 185.3984375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.7909048646688461, "epoch": 0.0728, "frac_reward_zero_std": 0.125, "grad_norm": 0.6416728496551514, "learning_rate": 9.98579450130307e-06, "loss": 0.0457, "num_tokens": 4118659.0, "reward": 3.3788022994995117, "reward_std": 0.4750932455062866, "rewards/evaluation_direction_reward/mean": 0.890625, "rewards/evaluation_direction_reward/std": 0.23213745653629303, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1640625, "rewards/move_legality_reward/std": 0.371787428855896, "rewards/pv_length_reward/mean": 0.3192708492279053, "rewards/pv_length_reward/std": 0.18793947994709015, "rewards/pv_quality_reward/mean": 0.009765625, "rewards/pv_quality_reward/std": 0.07291959971189499, "rewards/verbosity_reward/mean": 0.9950781464576721, "rewards/verbosity_reward/std": 0.022307656705379486, "sampling/importance_sampling_ratio/max": 2.9482879638671875, "sampling/importance_sampling_ratio/mean": 0.9930097460746765, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4908132553100586, "sampling/sampling_logp_difference/mean": 0.02523653395473957, "step": 91, "step_time": 32.12451392412186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 192.6875, "completions/mean_terminated_length": 186.14173889160156, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.7937204949557781, "epoch": 0.0736, "frac_reward_zero_std": 0.0, "grad_norm": 0.86881422996521, "learning_rate": 9.985477223543574e-06, "loss": 0.1224, "num_tokens": 4161907.0, "reward": 3.283255100250244, "reward_std": 0.7368278503417969, "rewards/evaluation_direction_reward/mean": 0.787109375, "rewards/evaluation_direction_reward/std": 0.3107171058654785, "rewards/format_reward/mean": 0.992968738079071, "rewards/format_reward/std": 0.07954951375722885, "rewards/move_legality_reward/mean": 0.1328125, "rewards/move_legality_reward/std": 0.3407054841518402, "rewards/pv_length_reward/mean": 0.33645835518836975, "rewards/pv_length_reward/std": 0.21739479899406433, "rewards/pv_quality_reward/mean": 0.052734375, "rewards/pv_quality_reward/std": 0.20964054763317108, "rewards/verbosity_reward/mean": 0.9811718463897705, "rewards/verbosity_reward/std": 0.12491018325090408, "sampling/importance_sampling_ratio/max": 2.8254830837249756, "sampling/importance_sampling_ratio/mean": 0.8803978562355042, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6489336490631104, "sampling/sampling_logp_difference/mean": 0.025418762117624283, "step": 92, "step_time": 68.00599967688322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 184.59375, "completions/mean_terminated_length": 184.59375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.8094186708331108, "epoch": 0.0744, "frac_reward_zero_std": 0.125, "grad_norm": 0.6303867697715759, "learning_rate": 9.985156446784249e-06, "loss": -0.024, "num_tokens": 4204215.0, "reward": 3.5703086853027344, "reward_std": 0.7116910219192505, "rewards/evaluation_direction_reward/mean": 0.859375, "rewards/evaluation_direction_reward/std": 0.2712431252002716, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3046875, "rewards/move_legality_reward/std": 0.46208351850509644, "rewards/pv_length_reward/mean": 0.3516369163990021, "rewards/pv_length_reward/std": 0.19572487473487854, "rewards/pv_quality_reward/mean": 0.064453125, "rewards/pv_quality_reward/std": 0.2110297530889511, "rewards/verbosity_reward/mean": 0.9901562929153442, "rewards/verbosity_reward/std": 0.03591293841600418, "sampling/importance_sampling_ratio/max": 2.817345380783081, "sampling/importance_sampling_ratio/mean": 0.9091726541519165, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49173593521118164, "sampling/sampling_logp_difference/mean": 0.02583843655884266, "step": 93, "step_time": 23.749055325984955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 181.75, "completions/mean_terminated_length": 181.75, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.7899369448423386, "epoch": 0.0752, "frac_reward_zero_std": 0.25, "grad_norm": 0.5104176998138428, "learning_rate": 9.98483217125023e-06, "loss": 0.0194, "num_tokens": 4245815.0, "reward": 3.27030873298645, "reward_std": 0.5879793763160706, "rewards/evaluation_direction_reward/mean": 0.787109375, "rewards/evaluation_direction_reward/std": 0.3632880449295044, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.09375, "rewards/move_legality_reward/std": 0.29262590408325195, "rewards/pv_length_reward/mean": 0.3594494163990021, "rewards/pv_length_reward/std": 0.2067253589630127, "rewards/pv_quality_reward/mean": 0.03515625, "rewards/pv_quality_reward/std": 0.14958727359771729, "rewards/verbosity_reward/mean": 0.9948437809944153, "rewards/verbosity_reward/std": 0.024300750344991684, "sampling/importance_sampling_ratio/max": 2.4527666568756104, "sampling/importance_sampling_ratio/mean": 0.908170223236084, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5278538465499878, "sampling/sampling_logp_difference/mean": 0.025615770369768143, "step": 94, "step_time": 23.755928374826908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 176.9609375, "completions/mean_terminated_length": 176.9609375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.7873766347765923, "epoch": 0.076, "frac_reward_zero_std": 0.0625, "grad_norm": 0.8342330455780029, "learning_rate": 9.984504397169107e-06, "loss": -0.0043, "num_tokens": 4287138.0, "reward": 3.382693290710449, "reward_std": 0.6427215337753296, "rewards/evaluation_direction_reward/mean": 0.83203125, "rewards/evaluation_direction_reward/std": 0.31074804067611694, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1979166716337204, "rewards/move_legality_reward/std": 0.3978012800216675, "rewards/pv_length_reward/mean": 0.3154017925262451, "rewards/pv_length_reward/std": 0.20171873271465302, "rewards/pv_quality_reward/mean": 0.0390625, "rewards/pv_quality_reward/std": 0.15823055803775787, "rewards/verbosity_reward/mean": 0.9982812404632568, "rewards/verbosity_reward/std": 0.01403647381812334, "sampling/importance_sampling_ratio/max": 2.8856194019317627, "sampling/importance_sampling_ratio/mean": 0.9257174730300903, "sampling/importance_sampling_ratio/min": 0.14721083641052246, "sampling/sampling_logp_difference/max": 0.7835595607757568, "sampling/sampling_logp_difference/mean": 0.025181645527482033, "step": 95, "step_time": 22.824011020362377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 175.125, "completions/mean_terminated_length": 175.125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.7852149046957493, "epoch": 0.0768, "frac_reward_zero_std": 0.125, "grad_norm": 0.6303020119667053, "learning_rate": 9.984173124770924e-06, "loss": 0.0447, "num_tokens": 4328234.0, "reward": 3.4308853149414062, "reward_std": 0.5845861434936523, "rewards/evaluation_direction_reward/mean": 0.794921875, "rewards/evaluation_direction_reward/std": 0.2688441276550293, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1640625, "rewards/move_legality_reward/std": 0.371787428855896, "rewards/pv_length_reward/mean": 0.4151041805744171, "rewards/pv_length_reward/std": 0.18622085452079773, "rewards/pv_quality_reward/mean": 0.05859375, "rewards/pv_quality_reward/std": 0.22069230675697327, "rewards/verbosity_reward/mean": 0.9982031583786011, "rewards/verbosity_reward/std": 0.014657966792583466, "sampling/importance_sampling_ratio/max": 2.9925780296325684, "sampling/importance_sampling_ratio/mean": 0.9467948079109192, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.40826916694641113, "sampling/sampling_logp_difference/mean": 0.024567995220422745, "step": 96, "step_time": 23.37603861093521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 174.234375, "completions/mean_terminated_length": 174.234375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.7742373384535313, "epoch": 0.0776, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6849718689918518, "learning_rate": 9.983838354288181e-06, "loss": -0.0305, "num_tokens": 4368952.0, "reward": 3.547226905822754, "reward_std": 0.6830624341964722, "rewards/evaluation_direction_reward/mean": 0.869140625, "rewards/evaluation_direction_reward/std": 0.2716326415538788, "rewards/format_reward/mean": 0.9984375238418579, "rewards/format_reward/std": 0.0176776684820652, "rewards/move_legality_reward/mean": 0.2109375, "rewards/move_legality_reward/std": 0.4095771610736847, "rewards/pv_length_reward/mean": 0.393477201461792, "rewards/pv_length_reward/std": 0.22237227857112885, "rewards/pv_quality_reward/mean": 0.076171875, "rewards/pv_quality_reward/std": 0.2421170473098755, "rewards/verbosity_reward/mean": 0.9990624785423279, "rewards/verbosity_reward/std": 0.009755879640579224, "sampling/importance_sampling_ratio/max": 2.797245740890503, "sampling/importance_sampling_ratio/mean": 0.796650767326355, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9900552034378052, "sampling/sampling_logp_difference/mean": 0.025120094418525696, "step": 97, "step_time": 23.058091208338737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 174.5625, "completions/mean_terminated_length": 174.5625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.7713751383125782, "epoch": 0.0784, "frac_reward_zero_std": 0.125, "grad_norm": 0.7532607316970825, "learning_rate": 9.983500085955833e-06, "loss": 0.0151, "num_tokens": 4409688.0, "reward": 3.5038280487060547, "reward_std": 0.5623112320899963, "rewards/evaluation_direction_reward/mean": 0.890625, "rewards/evaluation_direction_reward/std": 0.2748478353023529, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.234375, "rewards/move_legality_reward/std": 0.42527204751968384, "rewards/pv_length_reward/mean": 0.35468751192092896, "rewards/pv_length_reward/std": 0.17427437007427216, "rewards/pv_quality_reward/mean": 0.025390625, "rewards/pv_quality_reward/std": 0.12486151605844498, "rewards/verbosity_reward/mean": 0.9987499713897705, "rewards/verbosity_reward/std": 0.009221677668392658, "sampling/importance_sampling_ratio/max": 2.7506794929504395, "sampling/importance_sampling_ratio/mean": 0.932557225227356, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4696793556213379, "sampling/sampling_logp_difference/mean": 0.02478131651878357, "step": 98, "step_time": 22.483422070741653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 169.84375, "completions/mean_terminated_length": 169.84375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.767357986420393, "epoch": 0.0792, "frac_reward_zero_std": 0.0625, "grad_norm": 0.783644437789917, "learning_rate": 9.983158320011288e-06, "loss": 0.0408, "num_tokens": 4449844.0, "reward": 3.495814800262451, "reward_std": 0.6118723750114441, "rewards/evaluation_direction_reward/mean": 0.796875, "rewards/evaluation_direction_reward/std": 0.28365930914878845, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1953125, "rewards/move_legality_reward/std": 0.3979988098144531, "rewards/pv_length_reward/mean": 0.4348772466182709, "rewards/pv_length_reward/std": 0.22402074933052063, "rewards/pv_quality_reward/mean": 0.0703125, "rewards/pv_quality_reward/std": 0.21030887961387634, "rewards/verbosity_reward/mean": 0.9984375238418579, "rewards/verbosity_reward/std": 0.01681007817387581, "sampling/importance_sampling_ratio/max": 2.7503607273101807, "sampling/importance_sampling_ratio/mean": 0.9163169264793396, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.553917407989502, "sampling/sampling_logp_difference/mean": 0.024287747219204903, "step": 99, "step_time": 23.205340817570686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 174.1484375, "completions/mean_terminated_length": 167.45669555664062, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.7511449195444584, "epoch": 0.08, "frac_reward_zero_std": 0.1875, "grad_norm": 0.6945288181304932, "learning_rate": 9.982813056694411e-06, "loss": 0.0222, "num_tokens": 4490487.0, "reward": 3.4462947845458984, "reward_std": 0.6082883477210999, "rewards/evaluation_direction_reward/mean": 0.828125, "rewards/evaluation_direction_reward/std": 0.31333550810813904, "rewards/format_reward/mean": 0.992968738079071, "rewards/format_reward/std": 0.07954951375722885, "rewards/move_legality_reward/mean": 0.0859375, "rewards/move_legality_reward/std": 0.2813730239868164, "rewards/pv_length_reward/mean": 0.5159040093421936, "rewards/pv_length_reward/std": 0.2759513258934021, "rewards/pv_quality_reward/mean": 0.03125, "rewards/pv_quality_reward/std": 0.1746762990951538, "rewards/verbosity_reward/mean": 0.9921093583106995, "rewards/verbosity_reward/std": 0.08838580548763275, "sampling/importance_sampling_ratio/max": 2.839848518371582, "sampling/importance_sampling_ratio/mean": 0.912001371383667, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4098827838897705, "sampling/sampling_logp_difference/mean": 0.024177270010113716, "step": 100, "step_time": 67.62004510313272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 170.359375, "completions/mean_terminated_length": 170.359375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.7654573917388916, "epoch": 0.0808, "frac_reward_zero_std": 0.0, "grad_norm": 0.7497929334640503, "learning_rate": 9.982464296247523e-06, "loss": -0.0423, "num_tokens": 4531053.0, "reward": 3.385937452316284, "reward_std": 0.49599742889404297, "rewards/evaluation_direction_reward/mean": 0.80078125, "rewards/evaluation_direction_reward/std": 0.23760487139225006, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1796875, "rewards/move_legality_reward/std": 0.3854354918003082, "rewards/pv_length_reward/mean": 0.3812500238418579, "rewards/pv_length_reward/std": 0.19354183971881866, "rewards/pv_quality_reward/mean": 0.025390625, "rewards/pv_quality_reward/std": 0.10795086622238159, "rewards/verbosity_reward/mean": 0.9988281726837158, "rewards/verbosity_reward/std": 0.00847609806805849, "sampling/importance_sampling_ratio/max": 2.813934087753296, "sampling/importance_sampling_ratio/mean": 0.8971352577209473, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49392032623291016, "sampling/sampling_logp_difference/mean": 0.02469717152416706, "step": 101, "step_time": 23.44372722506523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 174.5625, "completions/mean_terminated_length": 167.87400817871094, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.7568599432706833, "epoch": 0.0816, "frac_reward_zero_std": 0.125, "grad_norm": 0.7251200079917908, "learning_rate": 9.982112038915394e-06, "loss": 0.0809, "num_tokens": 4571957.0, "reward": 3.4870352745056152, "reward_std": 0.6065162420272827, "rewards/evaluation_direction_reward/mean": 0.869140625, "rewards/evaluation_direction_reward/std": 0.23046691715717316, "rewards/format_reward/mean": 0.992968738079071, "rewards/format_reward/std": 0.07954951375722885, "rewards/move_legality_reward/mean": 0.1484375, "rewards/move_legality_reward/std": 0.356930136680603, "rewards/pv_length_reward/mean": 0.4374256134033203, "rewards/pv_length_reward/std": 0.21500876545906067, "rewards/pv_quality_reward/mean": 0.046875, "rewards/pv_quality_reward/std": 0.19529405236244202, "rewards/verbosity_reward/mean": 0.9921875, "rewards/verbosity_reward/std": 0.0883883461356163, "sampling/importance_sampling_ratio/max": 2.9442226886749268, "sampling/importance_sampling_ratio/mean": 0.958601713180542, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7715849876403809, "sampling/sampling_logp_difference/mean": 0.024575866758823395, "step": 102, "step_time": 68.00128821283579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 172.1796875, "completions/mean_terminated_length": 165.47244262695312, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.7518652826547623, "epoch": 0.0824, "frac_reward_zero_std": 0.125, "grad_norm": 0.7410684823989868, "learning_rate": 9.981756284945256e-06, "loss": 0.179, "num_tokens": 4612364.0, "reward": 3.4244022369384766, "reward_std": 0.7083966732025146, "rewards/evaluation_direction_reward/mean": 0.81640625, "rewards/evaluation_direction_reward/std": 0.29006579518318176, "rewards/format_reward/mean": 0.992968738079071, "rewards/format_reward/std": 0.07954951375722885, "rewards/move_legality_reward/mean": 0.1953125, "rewards/move_legality_reward/std": 0.3979988098144531, "rewards/pv_length_reward/mean": 0.35783958435058594, "rewards/pv_length_reward/std": 0.2507587969303131, "rewards/pv_quality_reward/mean": 0.0703125, "rewards/pv_quality_reward/std": 0.21030887961387634, "rewards/verbosity_reward/mean": 0.9915624856948853, "rewards/verbosity_reward/std": 0.08861521631479263, "sampling/importance_sampling_ratio/max": 2.2220473289489746, "sampling/importance_sampling_ratio/mean": 0.7984413504600525, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7465285062789917, "sampling/sampling_logp_difference/mean": 0.024048957973718643, "step": 103, "step_time": 68.01156590878963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 166.1796875, "completions/mean_terminated_length": 166.1796875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.757517147809267, "epoch": 0.0832, "frac_reward_zero_std": 0.1875, "grad_norm": 0.6559773683547974, "learning_rate": 9.981397034586789e-06, "loss": 0.0429, "num_tokens": 4652211.0, "reward": 3.368898630142212, "reward_std": 0.6968657374382019, "rewards/evaluation_direction_reward/mean": 0.78515625, "rewards/evaluation_direction_reward/std": 0.30211585760116577, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1171875, "rewards/move_legality_reward/std": 0.322907418012619, "rewards/pv_length_reward/mean": 0.40014880895614624, "rewards/pv_length_reward/std": 0.2078811526298523, "rewards/pv_quality_reward/mean": 0.06640625, "rewards/pv_quality_reward/std": 0.22728362679481506, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.982017755508423, "sampling/importance_sampling_ratio/mean": 0.8482334613800049, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.31003737449646, "sampling/sampling_logp_difference/mean": 0.02541634626686573, "step": 104, "step_time": 21.55210966616869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 168.6484375, "completions/mean_terminated_length": 168.6484375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.7604557909071445, "epoch": 0.084, "frac_reward_zero_std": 0.25, "grad_norm": 0.5550643801689148, "learning_rate": 9.981034288092129e-06, "loss": 0.0323, "num_tokens": 4692382.0, "reward": 3.4403645992279053, "reward_std": 0.4983551800251007, "rewards/evaluation_direction_reward/mean": 0.869140625, "rewards/evaluation_direction_reward/std": 0.23469875752925873, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1484375, "rewards/move_legality_reward/std": 0.356930136680603, "rewards/pv_length_reward/mean": 0.3895833492279053, "rewards/pv_length_reward/std": 0.17604829370975494, "rewards/pv_quality_reward/mean": 0.033203125, "rewards/pv_quality_reward/std": 0.14503289759159088, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.930873155593872, "sampling/importance_sampling_ratio/mean": 0.8647660613059998, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5190858840942383, "sampling/sampling_logp_difference/mean": 0.0246525127440691, "step": 105, "step_time": 21.918499134480953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 170.234375, "completions/mean_terminated_length": 163.51181030273438, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.7456449344754219, "epoch": 0.0848, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7924783825874329, "learning_rate": 9.980668045715864e-06, "loss": 0.112, "num_tokens": 4732804.0, "reward": 3.2939369678497314, "reward_std": 0.7381122708320618, "rewards/evaluation_direction_reward/mean": 0.720703125, "rewards/evaluation_direction_reward/std": 0.3544168472290039, "rewards/format_reward/mean": 0.992968738079071, "rewards/format_reward/std": 0.07954951375722885, "rewards/move_legality_reward/mean": 0.2109375, "rewards/move_legality_reward/std": 0.4095771610736847, "rewards/pv_length_reward/mean": 0.3233901560306549, "rewards/pv_length_reward/std": 0.1718040555715561, "rewards/pv_quality_reward/mean": 0.0546875, "rewards/pv_quality_reward/std": 0.20073069632053375, "rewards/verbosity_reward/mean": 0.9912500381469727, "rewards/verbosity_reward/std": 0.0886557474732399, "sampling/importance_sampling_ratio/max": 2.6845951080322266, "sampling/importance_sampling_ratio/mean": 0.9455704689025879, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7733529806137085, "sampling/sampling_logp_difference/mean": 0.024603066965937614, "step": 106, "step_time": 68.18884674459696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 162.90625, "completions/mean_terminated_length": 162.90625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.7524286583065987, "epoch": 0.0856, "frac_reward_zero_std": 0.25, "grad_norm": 0.6723228096961975, "learning_rate": 9.980298307715038e-06, "loss": 0.0005, "num_tokens": 4771816.0, "reward": 3.587230920791626, "reward_std": 0.5889813899993896, "rewards/evaluation_direction_reward/mean": 0.90234375, "rewards/evaluation_direction_reward/std": 0.27212053537368774, "rewards/format_reward/mean": 0.9984375238418579, "rewards/format_reward/std": 0.0176776684820652, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.3496527671813965, "rewards/pv_length_reward/std": 0.16687323153018951, "rewards/pv_quality_reward/mean": 0.087890625, "rewards/pv_quality_reward/std": 0.25792524218559265, "rewards/verbosity_reward/mean": 0.9989062547683716, "rewards/verbosity_reward/std": 0.00880536437034607, "sampling/importance_sampling_ratio/max": 2.6945579051971436, "sampling/importance_sampling_ratio/mean": 0.9357390403747559, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.46250104904174805, "sampling/sampling_logp_difference/mean": 0.02477310784161091, "step": 107, "step_time": 22.210132598876953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 161.2421875, "completions/mean_terminated_length": 161.2421875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.7451414205133915, "epoch": 0.0864, "frac_reward_zero_std": 0.0625, "grad_norm": 0.7391572594642639, "learning_rate": 9.979925074349146e-06, "loss": -0.0735, "num_tokens": 4811143.0, "reward": 3.6571059226989746, "reward_std": 0.8498750925064087, "rewards/evaluation_direction_reward/mean": 0.822265625, "rewards/evaluation_direction_reward/std": 0.23717568814754486, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.2578125, "rewards/move_legality_reward/std": 0.43914902210235596, "rewards/pv_length_reward/mean": 0.4344494342803955, "rewards/pv_length_reward/std": 0.2568972706794739, "rewards/pv_quality_reward/mean": 0.142578125, "rewards/pv_quality_reward/std": 0.32404249906539917, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5713236331939697, "sampling/importance_sampling_ratio/mean": 0.8937722444534302, "sampling/importance_sampling_ratio/min": 0.198529452085495, "sampling/sampling_logp_difference/max": 0.49315547943115234, "sampling/sampling_logp_difference/mean": 0.024235907942056656, "step": 108, "step_time": 22.25088132917881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 155.7421875, "completions/mean_terminated_length": 155.7421875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.7347769439220428, "epoch": 0.0872, "frac_reward_zero_std": 0.25, "grad_norm": 0.626630961894989, "learning_rate": 9.979548345880142e-06, "loss": -0.0285, "num_tokens": 4849358.0, "reward": 3.3417534828186035, "reward_std": 0.6518213152885437, "rewards/evaluation_direction_reward/mean": 0.810546875, "rewards/evaluation_direction_reward/std": 0.35872983932495117, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1484375, "rewards/move_legality_reward/std": 0.356930136680603, "rewards/pv_length_reward/mean": 0.3378472328186035, "rewards/pv_length_reward/std": 0.21689678728580475, "rewards/pv_quality_reward/mean": 0.044921875, "rewards/pv_quality_reward/std": 0.18409597873687744, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7887022495269775, "sampling/importance_sampling_ratio/mean": 0.90716952085495, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4861030578613281, "sampling/sampling_logp_difference/mean": 0.024134814739227295, "step": 109, "step_time": 21.43716014176607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 155.4296875, "completions/mean_terminated_length": 155.4296875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.736087866127491, "epoch": 0.088, "frac_reward_zero_std": 0.25, "grad_norm": 0.7270245552062988, "learning_rate": 9.979168122572422e-06, "loss": 0.0015, "num_tokens": 4887917.0, "reward": 3.4808592796325684, "reward_std": 0.5996977686882019, "rewards/evaluation_direction_reward/mean": 0.81640625, "rewards/evaluation_direction_reward/std": 0.33568060398101807, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.15625, "rewards/move_legality_reward/std": 0.3645188808441162, "rewards/pv_length_reward/mean": 0.4437500238418579, "rewards/pv_length_reward/std": 0.20926693081855774, "rewards/pv_quality_reward/mean": 0.064453125, "rewards/pv_quality_reward/std": 0.22892700135707855, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9229021072387695, "sampling/importance_sampling_ratio/mean": 0.9915099143981934, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6404882669448853, "sampling/sampling_logp_difference/mean": 0.02449335902929306, "step": 110, "step_time": 21.614892333745956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 162.7578125, "completions/mean_terminated_length": 155.97637939453125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.7312752865254879, "epoch": 0.0888, "frac_reward_zero_std": 0.125, "grad_norm": 0.772115170955658, "learning_rate": 9.978784404692847e-06, "loss": 0.0308, "num_tokens": 4927286.0, "reward": 3.375520706176758, "reward_std": 0.6032273769378662, "rewards/evaluation_direction_reward/mean": 0.86328125, "rewards/evaluation_direction_reward/std": 0.2124551236629486, "rewards/format_reward/mean": 0.9906250238418579, "rewards/format_reward/std": 0.08365423232316971, "rewards/move_legality_reward/mean": 0.1171875, "rewards/move_legality_reward/std": 0.322907418012619, "rewards/pv_length_reward/mean": 0.3739583194255829, "rewards/pv_length_reward/std": 0.1958867311477661, "rewards/pv_quality_reward/mean": 0.0390625, "rewards/pv_quality_reward/std": 0.18410642445087433, "rewards/verbosity_reward/mean": 0.9914062023162842, "rewards/verbosity_reward/std": 0.08868003636598587, "sampling/importance_sampling_ratio/max": 2.8444128036499023, "sampling/importance_sampling_ratio/mean": 0.9095694422721863, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5939638614654541, "sampling/sampling_logp_difference/mean": 0.024308089166879654, "step": 111, "step_time": 68.1709001287818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 152.4609375, "completions/mean_terminated_length": 152.4609375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.728966798633337, "epoch": 0.0896, "frac_reward_zero_std": 0.3125, "grad_norm": 0.9370754361152649, "learning_rate": 9.978397192510722e-06, "loss": 0.0414, "num_tokens": 4964865.0, "reward": 3.350630760192871, "reward_std": 0.6231047511100769, "rewards/evaluation_direction_reward/mean": 0.81640625, "rewards/evaluation_direction_reward/std": 0.28665250539779663, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.09375, "rewards/move_legality_reward/std": 0.29262590408325195, "rewards/pv_length_reward/mean": 0.4150838851928711, "rewards/pv_length_reward/std": 0.26967811584472656, "rewards/pv_quality_reward/mean": 0.025390625, "rewards/pv_quality_reward/std": 0.14661483466625214, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9490747451782227, "sampling/importance_sampling_ratio/mean": 1.007587194442749, "sampling/importance_sampling_ratio/min": 0.12170320004224777, "sampling/sampling_logp_difference/max": 0.5531740188598633, "sampling/sampling_logp_difference/mean": 0.024549560621380806, "step": 112, "step_time": 21.38984364271164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 149.78125, "completions/mean_terminated_length": 149.78125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.7177873104810715, "epoch": 0.0904, "frac_reward_zero_std": 0.375, "grad_norm": 0.7542063593864441, "learning_rate": 9.978006486297808e-06, "loss": 0.0223, "num_tokens": 5002645.0, "reward": 3.416796922683716, "reward_std": 0.44363632798194885, "rewards/evaluation_direction_reward/mean": 0.853515625, "rewards/evaluation_direction_reward/std": 0.25970786809921265, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.078125, "rewards/move_legality_reward/std": 0.2694226801395416, "rewards/pv_length_reward/mean": 0.45781251788139343, "rewards/pv_length_reward/std": 0.2349022775888443, "rewards/pv_quality_reward/mean": 0.02734375, "rewards/pv_quality_reward/std": 0.15759754180908203, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8054146766662598, "sampling/importance_sampling_ratio/mean": 0.8916110396385193, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48149609565734863, "sampling/sampling_logp_difference/mean": 0.023990679532289505, "step": 113, "step_time": 21.555673770606518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 152.25, "completions/mean_terminated_length": 152.25, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.7265091091394424, "epoch": 0.0912, "frac_reward_zero_std": 0.375, "grad_norm": 0.71568363904953, "learning_rate": 9.977612286328317e-06, "loss": -0.0332, "num_tokens": 5040517.0, "reward": 3.2799477577209473, "reward_std": 0.6605892777442932, "rewards/evaluation_direction_reward/mean": 0.775390625, "rewards/evaluation_direction_reward/std": 0.31504255533218384, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.109375, "rewards/move_legality_reward/std": 0.31333550810813904, "rewards/pv_length_reward/mean": 0.3541666865348816, "rewards/pv_length_reward/std": 0.2564064860343933, "rewards/pv_quality_reward/mean": 0.041015625, "rewards/pv_quality_reward/std": 0.14972858130931854, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.3795852661132812, "sampling/importance_sampling_ratio/mean": 0.8917587995529175, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5489506721496582, "sampling/sampling_logp_difference/mean": 0.02374156378209591, "step": 114, "step_time": 21.335313268005848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 150.640625, "completions/mean_terminated_length": 150.640625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.7191668525338173, "epoch": 0.092, "frac_reward_zero_std": 0.1875, "grad_norm": 0.8921257257461548, "learning_rate": 9.977214592878917e-06, "loss": -0.047, "num_tokens": 5078599.0, "reward": 3.2749876976013184, "reward_std": 0.6059982180595398, "rewards/evaluation_direction_reward/mean": 0.69140625, "rewards/evaluation_direction_reward/std": 0.3949538767337799, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.09375, "rewards/move_legality_reward/std": 0.29262590408325195, "rewards/pv_length_reward/mean": 0.4195188581943512, "rewards/pv_length_reward/std": 0.17256733775138855, "rewards/pv_quality_reward/mean": 0.0703125, "rewards/pv_quality_reward/std": 0.25085973739624023, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7970969676971436, "sampling/importance_sampling_ratio/mean": 0.9893866181373596, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.739530086517334, "sampling/sampling_logp_difference/mean": 0.024457629770040512, "step": 115, "step_time": 20.89422734081745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 150.4765625, "completions/mean_terminated_length": 150.4765625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.7152369394898415, "epoch": 0.0928, "frac_reward_zero_std": 0.1875, "grad_norm": 0.7684136033058167, "learning_rate": 9.97681340622872e-06, "loss": 0.047, "num_tokens": 5116372.0, "reward": 3.4081473350524902, "reward_std": 0.41730931401252747, "rewards/evaluation_direction_reward/mean": 0.931640625, "rewards/evaluation_direction_reward/std": 0.14278873801231384, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.109375, "rewards/move_legality_reward/std": 0.31333550810813904, "rewards/pv_length_reward/mean": 0.34955358505249023, "rewards/pv_length_reward/std": 0.18722616136074066, "rewards/pv_quality_reward/mean": 0.017578125, "rewards/pv_quality_reward/std": 0.09507349878549576, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9947738647460938, "sampling/importance_sampling_ratio/mean": 0.9571110606193542, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6627942323684692, "sampling/sampling_logp_difference/mean": 0.023607103154063225, "step": 116, "step_time": 20.76751984655857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 149.265625, "completions/mean_terminated_length": 149.265625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.7117926888167858, "epoch": 0.0936, "frac_reward_zero_std": 0.1875, "grad_norm": 0.7990558743476868, "learning_rate": 9.976408726659296e-06, "loss": -0.0258, "num_tokens": 5154006.0, "reward": 3.450409173965454, "reward_std": 0.6727626919746399, "rewards/evaluation_direction_reward/mean": 0.88671875, "rewards/evaluation_direction_reward/std": 0.28900346159935, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1640625, "rewards/move_legality_reward/std": 0.371787428855896, "rewards/pv_length_reward/mean": 0.3683779835700989, "rewards/pv_length_reward/std": 0.28120917081832886, "rewards/pv_quality_reward/mean": 0.03125, "rewards/pv_quality_reward/std": 0.1746762990951538, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9585161209106445, "sampling/importance_sampling_ratio/mean": 0.990655243396759, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5251514911651611, "sampling/sampling_logp_difference/mean": 0.023908747360110283, "step": 117, "step_time": 20.880047895014286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 149.875, "completions/mean_terminated_length": 149.875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.7042017430067062, "epoch": 0.0944, "frac_reward_zero_std": 0.125, "grad_norm": 1.2265547513961792, "learning_rate": 9.976000554454668e-06, "loss": 0.0134, "num_tokens": 5191582.0, "reward": 3.4847655296325684, "reward_std": 0.5934021472930908, "rewards/evaluation_direction_reward/mean": 0.828125, "rewards/evaluation_direction_reward/std": 0.24851924180984497, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.0703125, "rewards/move_legality_reward/std": 0.2566775679588318, "rewards/pv_length_reward/mean": 0.5453125238418579, "rewards/pv_length_reward/std": 0.27204981446266174, "rewards/pv_quality_reward/mean": 0.041015625, "rewards/pv_quality_reward/std": 0.18233326077461243, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9943854808807373, "sampling/importance_sampling_ratio/mean": 0.8703687191009521, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6386337280273438, "sampling/sampling_logp_difference/mean": 0.023936204612255096, "step": 118, "step_time": 33.96797278523445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 149.6328125, "completions/mean_terminated_length": 149.6328125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.6918681673705578, "epoch": 0.0952, "frac_reward_zero_std": 0.125, "grad_norm": 0.9246020913124084, "learning_rate": 9.975588889901302e-06, "loss": 0.0171, "num_tokens": 5229415.0, "reward": 3.518310546875, "reward_std": 0.5064077973365784, "rewards/evaluation_direction_reward/mean": 0.88671875, "rewards/evaluation_direction_reward/std": 0.230241596698761, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1875813752412796, "rewards/move_legality_reward/std": 0.3918078541755676, "rewards/pv_length_reward/mean": 0.4186198115348816, "rewards/pv_length_reward/std": 0.20927342772483826, "rewards/pv_quality_reward/mean": 0.025390625, "rewards/pv_quality_reward/std": 0.12486151605844498, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.938870429992676, "sampling/importance_sampling_ratio/mean": 0.9527101516723633, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6107983589172363, "sampling/sampling_logp_difference/mean": 0.023778092116117477, "step": 119, "step_time": 46.764660343527794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 143.7109375, "completions/mean_terminated_length": 143.7109375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.6988058723509312, "epoch": 0.096, "frac_reward_zero_std": 0.1875, "grad_norm": 0.706233024597168, "learning_rate": 9.975173733288122e-06, "loss": -0.0295, "num_tokens": 5266530.0, "reward": 3.3399815559387207, "reward_std": 0.3982155919075012, "rewards/evaluation_direction_reward/mean": 0.873046875, "rewards/evaluation_direction_reward/std": 0.19840925931930542, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1015625, "rewards/move_legality_reward/std": 0.3032590448856354, "rewards/pv_length_reward/mean": 0.3438875675201416, "rewards/pv_length_reward/std": 0.1918460726737976, "rewards/pv_quality_reward/mean": 0.021484375, "rewards/pv_quality_reward/std": 0.08888716995716095, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.912493944168091, "sampling/importance_sampling_ratio/mean": 0.88442063331604, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.48676884174346924, "sampling/sampling_logp_difference/mean": 0.023965265601873398, "step": 120, "step_time": 21.93833637982607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 143.078125, "completions/mean_terminated_length": 143.078125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.6968801245093346, "epoch": 0.0968, "frac_reward_zero_std": 0.125, "grad_norm": 1.0000108480453491, "learning_rate": 9.974755084906503e-06, "loss": -0.0237, "num_tokens": 5303172.0, "reward": 3.496558904647827, "reward_std": 0.5127706527709961, "rewards/evaluation_direction_reward/mean": 0.923828125, "rewards/evaluation_direction_reward/std": 0.1522747278213501, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.140625, "rewards/move_legality_reward/std": 0.3490002751350403, "rewards/pv_length_reward/mean": 0.3891369104385376, "rewards/pv_length_reward/std": 0.20248863101005554, "rewards/pv_quality_reward/mean": 0.04296875, "rewards/pv_quality_reward/std": 0.17214880883693695, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.572046995162964, "sampling/importance_sampling_ratio/mean": 0.9204471707344055, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5921976566314697, "sampling/sampling_logp_difference/mean": 0.0239044651389122, "step": 121, "step_time": 20.832563556730747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 138.9375, "completions/mean_terminated_length": 138.9375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.6743098124861717, "epoch": 0.0976, "frac_reward_zero_std": 0.25, "grad_norm": 0.7454096674919128, "learning_rate": 9.974332945050263e-06, "loss": 0.05, "num_tokens": 5339676.0, "reward": 3.4913196563720703, "reward_std": 0.6622142195701599, "rewards/evaluation_direction_reward/mean": 0.826171875, "rewards/evaluation_direction_reward/std": 0.2540200352668762, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1640625, "rewards/move_legality_reward/std": 0.371787428855896, "rewards/pv_length_reward/mean": 0.442491352558136, "rewards/pv_length_reward/std": 0.24570044875144958, "rewards/pv_quality_reward/mean": 0.05859375, "rewards/pv_quality_reward/std": 0.21158470213413239, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5976967811584473, "sampling/importance_sampling_ratio/mean": 0.9880890846252441, "sampling/importance_sampling_ratio/min": 0.2036265730857849, "sampling/sampling_logp_difference/max": 0.49436426162719727, "sampling/sampling_logp_difference/mean": 0.023352259770035744, "step": 122, "step_time": 19.42229075729847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 140.671875, "completions/mean_terminated_length": 140.671875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.6988713145256042, "epoch": 0.0984, "frac_reward_zero_std": 0.375, "grad_norm": 0.8302815556526184, "learning_rate": 9.973907314015682e-06, "loss": 0.0723, "num_tokens": 5376394.0, "reward": 3.231901168823242, "reward_std": 0.517932653427124, "rewards/evaluation_direction_reward/mean": 0.728515625, "rewards/evaluation_direction_reward/std": 0.3290815055370331, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.125, "rewards/move_legality_reward/std": 0.3320184051990509, "rewards/pv_length_reward/mean": 0.33541667461395264, "rewards/pv_length_reward/std": 0.15845781564712524, "rewards/pv_quality_reward/mean": 0.04296875, "rewards/pv_quality_reward/std": 0.16926594078540802, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.777151346206665, "sampling/importance_sampling_ratio/mean": 0.936285138130188, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7184453010559082, "sampling/sampling_logp_difference/mean": 0.024476591497659683, "step": 123, "step_time": 20.189803950488567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 141.2421875, "completions/mean_terminated_length": 141.2421875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.6839418634772301, "epoch": 0.0992, "frac_reward_zero_std": 0.375, "grad_norm": 0.896493673324585, "learning_rate": 9.97347819210148e-06, "loss": -0.0547, "num_tokens": 5413041.0, "reward": 3.5218937397003174, "reward_std": 0.4493180513381958, "rewards/evaluation_direction_reward/mean": 0.93359375, "rewards/evaluation_direction_reward/std": 0.1419922113418579, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.140625, "rewards/move_legality_reward/std": 0.3490002751350403, "rewards/pv_length_reward/mean": 0.41056546568870544, "rewards/pv_length_reward/std": 0.22590696811676025, "rewards/pv_quality_reward/mean": 0.037109375, "rewards/pv_quality_reward/std": 0.16328880190849304, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7094931602478027, "sampling/importance_sampling_ratio/mean": 0.9207935333251953, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.869997501373291, "sampling/sampling_logp_difference/mean": 0.024384407326579094, "step": 124, "step_time": 20.094478026032448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 137.1015625, "completions/mean_terminated_length": 137.1015625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.6885227188467979, "epoch": 0.1, "frac_reward_zero_std": 0.0625, "grad_norm": 0.9692038297653198, "learning_rate": 9.973045579608834e-06, "loss": 0.0219, "num_tokens": 5449158.0, "reward": 3.435582160949707, "reward_std": 0.5940046310424805, "rewards/evaluation_direction_reward/mean": 0.822265625, "rewards/evaluation_direction_reward/std": 0.24733290076255798, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.19602271914482117, "rewards/move_legality_reward/std": 0.3977285325527191, "rewards/pv_length_reward/mean": 0.35284093022346497, "rewards/pv_length_reward/std": 0.21488124132156372, "rewards/pv_quality_reward/mean": 0.064453125, "rewards/pv_quality_reward/std": 0.20868469774723053, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8794476985931396, "sampling/importance_sampling_ratio/mean": 0.9407631754875183, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7414846420288086, "sampling/sampling_logp_difference/mean": 0.0251169316470623, "step": 125, "step_time": 21.621751703321934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 137.53125, "completions/mean_terminated_length": 137.53125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.675857350230217, "epoch": 0.1008, "frac_reward_zero_std": 0.3125, "grad_norm": 0.9148245453834534, "learning_rate": 9.972609476841368e-06, "loss": -0.0227, "num_tokens": 5485442.0, "reward": 3.509939193725586, "reward_std": 0.5660897493362427, "rewards/evaluation_direction_reward/mean": 0.8984375, "rewards/evaluation_direction_reward/std": 0.1814136803150177, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1328125, "rewards/move_legality_reward/std": 0.3407054841518402, "rewards/pv_length_reward/mean": 0.4454861283302307, "rewards/pv_length_reward/std": 0.23169831931591034, "rewards/pv_quality_reward/mean": 0.033203125, "rewards/pv_quality_reward/std": 0.17002516984939575, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.903937339782715, "sampling/importance_sampling_ratio/mean": 0.9575800895690918, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4915146827697754, "sampling/sampling_logp_difference/mean": 0.023429501801729202, "step": 126, "step_time": 19.143042966723442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 138.828125, "completions/mean_terminated_length": 138.828125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.6724979244172573, "epoch": 0.1016, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7815170884132385, "learning_rate": 9.972169884105155e-06, "loss": -0.0106, "num_tokens": 5521796.0, "reward": 3.482942581176758, "reward_std": 0.551802396774292, "rewards/evaluation_direction_reward/mean": 0.90625, "rewards/evaluation_direction_reward/std": 0.19592301547527313, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1484375, "rewards/move_legality_reward/std": 0.356930136680603, "rewards/pv_length_reward/mean": 0.38333332538604736, "rewards/pv_length_reward/std": 0.22510811686515808, "rewards/pv_quality_reward/mean": 0.044921875, "rewards/pv_quality_reward/std": 0.18936693668365479, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.78379225730896, "sampling/importance_sampling_ratio/mean": 0.964450478553772, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6980130672454834, "sampling/sampling_logp_difference/mean": 0.02397824637591839, "step": 127, "step_time": 19.350017942488194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 137.078125, "completions/mean_terminated_length": 137.078125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.6801747977733612, "epoch": 0.1024, "frac_reward_zero_std": 0.375, "grad_norm": 0.5935385227203369, "learning_rate": 9.971726801708715e-06, "loss": -0.0209, "num_tokens": 5558102.0, "reward": 3.3433778285980225, "reward_std": 0.5277038216590881, "rewards/evaluation_direction_reward/mean": 0.734375, "rewards/evaluation_direction_reward/std": 0.2730514407157898, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.140625, "rewards/move_legality_reward/std": 0.3490002751350403, "rewards/pv_length_reward/mean": 0.4136904776096344, "rewards/pv_length_reward/std": 0.1747000813484192, "rewards/pv_quality_reward/mean": 0.0546875, "rewards/pv_quality_reward/std": 0.20316758751869202, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.4691810607910156, "sampling/importance_sampling_ratio/mean": 0.8915610313415527, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7377221584320068, "sampling/sampling_logp_difference/mean": 0.023867545649409294, "step": 128, "step_time": 21.67512070387602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 136.3515625, "completions/mean_terminated_length": 136.3515625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.6555752903223038, "epoch": 0.1032, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7923663854598999, "learning_rate": 9.971280229963026e-06, "loss": 0.0529, "num_tokens": 5594059.0, "reward": 3.5606212615966797, "reward_std": 0.6560156345367432, "rewards/evaluation_direction_reward/mean": 0.875, "rewards/evaluation_direction_reward/std": 0.26987895369529724, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1484375, "rewards/move_legality_reward/std": 0.356930136680603, "rewards/pv_length_reward/mean": 0.49226194620132446, "rewards/pv_length_reward/std": 0.24618647992610931, "rewards/pv_quality_reward/mean": 0.044921875, "rewards/pv_quality_reward/std": 0.1970091164112091, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.246464967727661, "sampling/importance_sampling_ratio/mean": 0.8988786935806274, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5421628952026367, "sampling/sampling_logp_difference/mean": 0.02346257120370865, "step": 129, "step_time": 20.3271117284894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 136.34375, "completions/mean_terminated_length": 136.34375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.6645896695554256, "epoch": 0.104, "frac_reward_zero_std": 0.5, "grad_norm": 0.8274310231208801, "learning_rate": 9.970830169181504e-06, "loss": -0.0655, "num_tokens": 5629887.0, "reward": 3.5433778762817383, "reward_std": 0.47869136929512024, "rewards/evaluation_direction_reward/mean": 0.91015625, "rewards/evaluation_direction_reward/std": 0.21447238326072693, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.21875, "rewards/move_legality_reward/std": 0.41502299904823303, "rewards/pv_length_reward/mean": 0.39494049549102783, "rewards/pv_length_reward/std": 0.2406831681728363, "rewards/pv_quality_reward/mean": 0.01953125, "rewards/pv_quality_reward/std": 0.10689504444599152, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8924248218536377, "sampling/importance_sampling_ratio/mean": 1.0761778354644775, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.49838685989379883, "sampling/sampling_logp_difference/mean": 0.023349039256572723, "step": 130, "step_time": 19.544281631708145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 134.0390625, "completions/mean_terminated_length": 134.0390625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.6577984131872654, "epoch": 0.1048, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6586757898330688, "learning_rate": 9.970376619680024e-06, "loss": 0.0656, "num_tokens": 5665644.0, "reward": 3.5259299278259277, "reward_std": 0.5689175128936768, "rewards/evaluation_direction_reward/mean": 0.927734375, "rewards/evaluation_direction_reward/std": 0.21313266456127167, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1328125, "rewards/move_legality_reward/std": 0.3407054841518402, "rewards/pv_length_reward/mean": 0.4282738268375397, "rewards/pv_length_reward/std": 0.23754999041557312, "rewards/pv_quality_reward/mean": 0.037109375, "rewards/pv_quality_reward/std": 0.18584199249744415, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8864800930023193, "sampling/importance_sampling_ratio/mean": 0.9868752956390381, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8393909931182861, "sampling/sampling_logp_difference/mean": 0.023427175357937813, "step": 131, "step_time": 19.582026354968548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 134.9296875, "completions/mean_terminated_length": 134.9296875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.6568439677357674, "epoch": 0.1056, "frac_reward_zero_std": 0.4375, "grad_norm": 0.8743370175361633, "learning_rate": 9.969919581776902e-06, "loss": 0.03, "num_tokens": 5701403.0, "reward": 3.4183034896850586, "reward_std": 0.48088109493255615, "rewards/evaluation_direction_reward/mean": 0.8515625, "rewards/evaluation_direction_reward/std": 0.25378531217575073, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1328125, "rewards/move_legality_reward/std": 0.3407054841518402, "rewards/pv_length_reward/mean": 0.41049110889434814, "rewards/pv_length_reward/std": 0.1860724240541458, "rewards/pv_quality_reward/mean": 0.0234375, "rewards/pv_quality_reward/std": 0.14182965457439423, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9406325817108154, "sampling/importance_sampling_ratio/mean": 0.965819239616394, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4929041862487793, "sampling/sampling_logp_difference/mean": 0.02373977191746235, "step": 132, "step_time": 19.46392973512411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 132.5234375, "completions/mean_terminated_length": 132.5234375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.6507400758564472, "epoch": 0.1064, "frac_reward_zero_std": 0.5, "grad_norm": 0.706211507320404, "learning_rate": 9.969459055792903e-06, "loss": 0.0205, "num_tokens": 5736470.0, "reward": 3.519927978515625, "reward_std": 0.630254328250885, "rewards/evaluation_direction_reward/mean": 0.90234375, "rewards/evaluation_direction_reward/std": 0.25532588362693787, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1953125, "rewards/move_legality_reward/std": 0.3979988098144531, "rewards/pv_length_reward/mean": 0.35195931792259216, "rewards/pv_length_reward/std": 0.19813793897628784, "rewards/pv_quality_reward/mean": 0.0703125, "rewards/pv_quality_reward/std": 0.1880732625722885, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.999856472015381, "sampling/importance_sampling_ratio/mean": 0.914071261882782, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6396687030792236, "sampling/sampling_logp_difference/mean": 0.02383011020720005, "step": 133, "step_time": 18.78777387738228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 130.21875, "completions/mean_terminated_length": 130.21875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.6490738242864609, "epoch": 0.1072, "frac_reward_zero_std": 0.375, "grad_norm": 0.808922529220581, "learning_rate": 9.968995042051244e-06, "loss": -0.0145, "num_tokens": 5771434.0, "reward": 3.426215171813965, "reward_std": 0.49654290080070496, "rewards/evaluation_direction_reward/mean": 0.865234375, "rewards/evaluation_direction_reward/std": 0.21255463361740112, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.140625, "rewards/move_legality_reward/std": 0.3490002751350403, "rewards/pv_length_reward/mean": 0.3637152910232544, "rewards/pv_length_reward/std": 0.16994720697402954, "rewards/pv_quality_reward/mean": 0.056640625, "rewards/pv_quality_reward/std": 0.22451795637607574, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.882488489151001, "sampling/importance_sampling_ratio/mean": 0.9726359844207764, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.664520263671875, "sampling/sampling_logp_difference/mean": 0.024829279631376266, "step": 134, "step_time": 18.684845007956028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 127.2890625, "completions/mean_terminated_length": 127.2890625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.641316544264555, "epoch": 0.108, "frac_reward_zero_std": 0.625, "grad_norm": 0.6524202227592468, "learning_rate": 9.968527540877586e-06, "loss": -0.0055, "num_tokens": 5806263.0, "reward": 3.642131805419922, "reward_std": 0.5345194339752197, "rewards/evaluation_direction_reward/mean": 0.89453125, "rewards/evaluation_direction_reward/std": 0.26916569471359253, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3046875, "rewards/move_legality_reward/std": 0.46208351850509644, "rewards/pv_length_reward/mean": 0.3589285910129547, "rewards/pv_length_reward/std": 0.14276838302612305, "rewards/pv_quality_reward/mean": 0.083984375, "rewards/pv_quality_reward/std": 0.2573283016681671, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.6319081783294678, "sampling/importance_sampling_ratio/mean": 0.9745063781738281, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7087137699127197, "sampling/sampling_logp_difference/mean": 0.02291264198720455, "step": 135, "step_time": 19.017769053578377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 128.8125, "completions/mean_terminated_length": 128.8125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.6497545577585697, "epoch": 0.1088, "frac_reward_zero_std": 0.5, "grad_norm": 0.6616472601890564, "learning_rate": 9.968056552600043e-06, "loss": -0.0001, "num_tokens": 5841159.0, "reward": 3.44392991065979, "reward_std": 0.4692850410938263, "rewards/evaluation_direction_reward/mean": 0.845703125, "rewards/evaluation_direction_reward/std": 0.30758312344551086, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.0859375, "rewards/move_legality_reward/std": 0.2813730239868164, "rewards/pv_length_reward/mean": 0.5005704164505005, "rewards/pv_length_reward/std": 0.24570676684379578, "rewards/pv_quality_reward/mean": 0.01171875, "rewards/pv_quality_reward/std": 0.09338017553091049, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.975161552429199, "sampling/importance_sampling_ratio/mean": 0.9911806583404541, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6892142295837402, "sampling/sampling_logp_difference/mean": 0.023582739755511284, "step": 136, "step_time": 19.066745571792126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 126.4765625, "completions/mean_terminated_length": 126.4765625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.6456937305629253, "epoch": 0.1096, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7766537070274353, "learning_rate": 9.96758207754917e-06, "loss": 0.0351, "num_tokens": 5876004.0, "reward": 3.4069197177886963, "reward_std": 0.47211596369743347, "rewards/evaluation_direction_reward/mean": 0.8359375, "rewards/evaluation_direction_reward/std": 0.22280755639076233, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1171875, "rewards/move_legality_reward/std": 0.322907418012619, "rewards/pv_length_reward/mean": 0.3991071581840515, "rewards/pv_length_reward/std": 0.18175946176052094, "rewards/pv_quality_reward/mean": 0.0546875, "rewards/pv_quality_reward/std": 0.21721550822257996, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9041545391082764, "sampling/importance_sampling_ratio/mean": 0.9957134127616882, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5007877349853516, "sampling/sampling_logp_difference/mean": 0.022414209321141243, "step": 137, "step_time": 18.461584448814392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 124.875, "completions/mean_terminated_length": 124.875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.6433844976127148, "epoch": 0.1104, "frac_reward_zero_std": 0.5, "grad_norm": 0.7345701456069946, "learning_rate": 9.96710411605797e-06, "loss": -0.0397, "num_tokens": 5910356.0, "reward": 3.456249952316284, "reward_std": 0.4790858328342438, "rewards/evaluation_direction_reward/mean": 0.8125, "rewards/evaluation_direction_reward/std": 0.34652379155158997, "rewards/format_reward/mean": 0.9984375238418579, "rewards/format_reward/std": 0.0176776684820652, "rewards/move_legality_reward/mean": 0.1875, "rewards/move_legality_reward/std": 0.39184603095054626, "rewards/pv_length_reward/mean": 0.41093751788139343, "rewards/pv_length_reward/std": 0.16367976367473602, "rewards/pv_quality_reward/mean": 0.046875, "rewards/pv_quality_reward/std": 0.21220162510871887, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.942439317703247, "sampling/importance_sampling_ratio/mean": 0.9850678443908691, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6201164722442627, "sampling/sampling_logp_difference/mean": 0.023150654509663582, "step": 138, "step_time": 18.98353374749422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 124.34375, "completions/mean_terminated_length": 124.34375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.6390024200081825, "epoch": 0.1112, "frac_reward_zero_std": 0.375, "grad_norm": 0.7512683272361755, "learning_rate": 9.966622668461899e-06, "loss": -0.0041, "num_tokens": 5944704.0, "reward": 3.5736048221588135, "reward_std": 0.6946055889129639, "rewards/evaluation_direction_reward/mean": 0.841796875, "rewards/evaluation_direction_reward/std": 0.2467726618051529, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.203125, "rewards/move_legality_reward/std": 0.40390563011169434, "rewards/pv_length_reward/mean": 0.423214316368103, "rewards/pv_length_reward/std": 0.2192830741405487, "rewards/pv_quality_reward/mean": 0.10546875, "rewards/pv_quality_reward/std": 0.2673310935497284, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.1533432006835938, "sampling/importance_sampling_ratio/mean": 0.8683414459228516, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6805146336555481, "sampling/sampling_logp_difference/mean": 0.02492273598909378, "step": 139, "step_time": 18.513176888227463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 124.3828125, "completions/mean_terminated_length": 124.3828125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6288966499269009, "epoch": 0.112, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7769875526428223, "learning_rate": 9.966137735098853e-06, "loss": -0.0487, "num_tokens": 5979105.0, "reward": 3.645963668823242, "reward_std": 0.5484678149223328, "rewards/evaluation_direction_reward/mean": 0.912109375, "rewards/evaluation_direction_reward/std": 0.1735168993473053, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.234375, "rewards/move_legality_reward/std": 0.42527204751968384, "rewards/pv_length_reward/mean": 0.39791667461395264, "rewards/pv_length_reward/std": 0.2104758620262146, "rewards/pv_quality_reward/mean": 0.1015625, "rewards/pv_quality_reward/std": 0.2651650309562683, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7677080631256104, "sampling/importance_sampling_ratio/mean": 1.005407691001892, "sampling/importance_sampling_ratio/min": 0.2095126062631607, "sampling/sampling_logp_difference/max": 0.6234478950500488, "sampling/sampling_logp_difference/mean": 0.023359837010502815, "step": 140, "step_time": 18.66738321632147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 119.9765625, "completions/mean_terminated_length": 119.9765625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.6236050054430962, "epoch": 0.1128, "frac_reward_zero_std": 0.8125, "grad_norm": 0.41500380635261536, "learning_rate": 9.965649316309178e-06, "loss": -0.0121, "num_tokens": 6013030.0, "reward": 3.323394298553467, "reward_std": 0.5439820885658264, "rewards/evaluation_direction_reward/mean": 0.69140625, "rewards/evaluation_direction_reward/std": 0.3810010254383087, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.0546875, "rewards/move_legality_reward/std": 0.22826264798641205, "rewards/pv_length_reward/mean": 0.5519096851348877, "rewards/pv_length_reward/std": 0.2126188725233078, "rewards/pv_quality_reward/mean": 0.025390625, "rewards/pv_quality_reward/std": 0.11671286821365356, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.575333595275879, "sampling/importance_sampling_ratio/mean": 0.9637267589569092, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7377095222473145, "sampling/sampling_logp_difference/mean": 0.02398744970560074, "step": 141, "step_time": 17.884829573333263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 122.8828125, "completions/mean_terminated_length": 122.8828125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6255802549421787, "epoch": 0.1136, "frac_reward_zero_std": 0.375, "grad_norm": 0.8704269528388977, "learning_rate": 9.965157412435663e-06, "loss": 0.0276, "num_tokens": 6047287.0, "reward": 3.7525546550750732, "reward_std": 0.5905358791351318, "rewards/evaluation_direction_reward/mean": 0.87890625, "rewards/evaluation_direction_reward/std": 0.2008455991744995, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3671875, "rewards/move_legality_reward/std": 0.4839322865009308, "rewards/pv_length_reward/mean": 0.4009920656681061, "rewards/pv_length_reward/std": 0.16952554881572723, "rewards/pv_quality_reward/mean": 0.10546875, "rewards/pv_quality_reward/std": 0.2673310935497284, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.589761734008789, "sampling/importance_sampling_ratio/mean": 0.9648653268814087, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9361478090286255, "sampling/sampling_logp_difference/mean": 0.023168936371803284, "step": 142, "step_time": 19.148284696042538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 121.28125, "completions/mean_terminated_length": 121.28125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6096518598496914, "epoch": 0.1144, "frac_reward_zero_std": 0.25, "grad_norm": 0.9303181171417236, "learning_rate": 9.964662023823548e-06, "loss": -0.0109, "num_tokens": 6081691.0, "reward": 3.708367347717285, "reward_std": 0.7065202593803406, "rewards/evaluation_direction_reward/mean": 0.7890625, "rewards/evaluation_direction_reward/std": 0.2848495543003082, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.41590073704719543, "rewards/move_legality_reward/std": 0.49337953329086304, "rewards/pv_length_reward/mean": 0.34129464626312256, "rewards/pv_length_reward/std": 0.18322056531906128, "rewards/pv_quality_reward/mean": 0.162109375, "rewards/pv_quality_reward/std": 0.29528751969337463, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.613435745239258, "sampling/importance_sampling_ratio/mean": 0.9661359190940857, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0369436740875244, "sampling/sampling_logp_difference/mean": 0.023229999467730522, "step": 143, "step_time": 22.24037242680788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 122.234375, "completions/mean_terminated_length": 122.234375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.6293835379183292, "epoch": 0.1152, "frac_reward_zero_std": 0.5, "grad_norm": 0.9787125587463379, "learning_rate": 9.964163150820512e-06, "loss": -0.015, "num_tokens": 6116033.0, "reward": 3.603980541229248, "reward_std": 0.5131587386131287, "rewards/evaluation_direction_reward/mean": 0.8515625, "rewards/evaluation_direction_reward/std": 0.3080889582633972, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.2578125, "rewards/move_legality_reward/std": 0.43914902210235596, "rewards/pv_length_reward/mean": 0.4360119104385376, "rewards/pv_length_reward/std": 0.2303081750869751, "rewards/pv_quality_reward/mean": 0.05859375, "rewards/pv_quality_reward/std": 0.19207829236984253, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.6151556968688965, "sampling/importance_sampling_ratio/mean": 0.945966362953186, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7523622512817383, "sampling/sampling_logp_difference/mean": 0.02405453845858574, "step": 144, "step_time": 18.50129372626543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 120.1640625, "completions/mean_terminated_length": 120.1640625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.6062077470123768, "epoch": 0.116, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6885445713996887, "learning_rate": 9.963660793776689e-06, "loss": -0.0141, "num_tokens": 6149998.0, "reward": 3.6368861198425293, "reward_std": 0.746364176273346, "rewards/evaluation_direction_reward/mean": 0.822265625, "rewards/evaluation_direction_reward/std": 0.32926836609840393, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3125, "rewards/move_legality_reward/std": 0.4653336703777313, "rewards/pv_length_reward/mean": 0.3732143044471741, "rewards/pv_length_reward/std": 0.19390150904655457, "rewards/pv_quality_reward/mean": 0.12890625, "rewards/pv_quality_reward/std": 0.2976025640964508, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5783751010894775, "sampling/importance_sampling_ratio/mean": 0.9250954389572144, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6229867935180664, "sampling/sampling_logp_difference/mean": 0.02323579229414463, "step": 145, "step_time": 19.36598525941372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 116.0625, "completions/mean_terminated_length": 116.0625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.5959313251078129, "epoch": 0.1168, "frac_reward_zero_std": 0.625, "grad_norm": 0.48281052708625793, "learning_rate": 9.963154953044646e-06, "loss": 0.0234, "num_tokens": 6183326.0, "reward": 3.6972098350524902, "reward_std": 0.7388455867767334, "rewards/evaluation_direction_reward/mean": 0.9375, "rewards/evaluation_direction_reward/std": 0.23685936629772186, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.39642858505249023, "rewards/pv_length_reward/std": 0.2665782868862152, "rewards/pv_quality_reward/mean": 0.11328125, "rewards/pv_quality_reward/std": 0.2768270969390869, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.53680157661438, "sampling/importance_sampling_ratio/mean": 0.9708071947097778, "sampling/importance_sampling_ratio/min": 0.3212238550186157, "sampling/sampling_logp_difference/max": 0.9696173667907715, "sampling/sampling_logp_difference/mean": 0.02313910983502865, "step": 146, "step_time": 17.81457383930683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 116.5, "completions/mean_terminated_length": 116.5, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.6007821746170521, "epoch": 0.1176, "frac_reward_zero_std": 0.5, "grad_norm": 0.7271394729614258, "learning_rate": 9.962645628979406e-06, "loss": 0.0226, "num_tokens": 6216958.0, "reward": 3.371180534362793, "reward_std": 0.8821466565132141, "rewards/evaluation_direction_reward/mean": 0.763671875, "rewards/evaluation_direction_reward/std": 0.332475483417511, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1640625, "rewards/move_legality_reward/std": 0.371787428855896, "rewards/pv_length_reward/mean": 0.35555556416511536, "rewards/pv_length_reward/std": 0.226159006357193, "rewards/pv_quality_reward/mean": 0.087890625, "rewards/pv_quality_reward/std": 0.2501767873764038, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7128312587738037, "sampling/importance_sampling_ratio/mean": 1.001535415649414, "sampling/importance_sampling_ratio/min": 0.23059700429439545, "sampling/sampling_logp_difference/max": 0.5569438934326172, "sampling/sampling_logp_difference/mean": 0.022841187193989754, "step": 147, "step_time": 17.77836848050356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 113.2109375, "completions/mean_terminated_length": 113.2109375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.5739432722330093, "epoch": 0.1184, "frac_reward_zero_std": 0.5625, "grad_norm": 0.4810524582862854, "learning_rate": 9.96213282193843e-06, "loss": -0.0175, "num_tokens": 6250049.0, "reward": 3.7383246421813965, "reward_std": 0.6273334622383118, "rewards/evaluation_direction_reward/mean": 0.869140625, "rewards/evaluation_direction_reward/std": 0.24495869874954224, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.4609375, "rewards/move_legality_reward/std": 0.5004304051399231, "rewards/pv_length_reward/mean": 0.3652777671813965, "rewards/pv_length_reward/std": 0.16431410610675812, "rewards/pv_quality_reward/mean": 0.04296875, "rewards/pv_quality_reward/std": 0.16926594078540802, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.6545512676239014, "sampling/importance_sampling_ratio/mean": 0.9242766499519348, "sampling/importance_sampling_ratio/min": 0.21174883842468262, "sampling/sampling_logp_difference/max": 1.076223611831665, "sampling/sampling_logp_difference/mean": 0.02248041331768036, "step": 148, "step_time": 18.046379409730434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 111.734375, "completions/mean_terminated_length": 111.734375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.5715909935534, "epoch": 0.1192, "frac_reward_zero_std": 0.5625, "grad_norm": 0.8434132933616638, "learning_rate": 9.961616532281626e-06, "loss": 0.0135, "num_tokens": 6282647.0, "reward": 3.6305618286132812, "reward_std": 0.5586704611778259, "rewards/evaluation_direction_reward/mean": 0.875, "rewards/evaluation_direction_reward/std": 0.3089861571788788, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.34375, "rewards/move_legality_reward/std": 0.47682511806488037, "rewards/pv_length_reward/mean": 0.3434523940086365, "rewards/pv_length_reward/std": 0.1422773003578186, "rewards/pv_quality_reward/mean": 0.068359375, "rewards/pv_quality_reward/std": 0.2320629060268402, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.768885850906372, "sampling/importance_sampling_ratio/mean": 0.9073194265365601, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9364643096923828, "sampling/sampling_logp_difference/mean": 0.0226159505546093, "step": 149, "step_time": 18.16509371250868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 108.9296875, "completions/mean_terminated_length": 108.9296875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.5649516545236111, "epoch": 0.12, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6109444499015808, "learning_rate": 9.961096760371349e-06, "loss": -0.028, "num_tokens": 6315222.0, "reward": 3.8424477577209473, "reward_std": 0.6683928966522217, "rewards/evaluation_direction_reward/mean": 0.7734375, "rewards/evaluation_direction_reward/std": 0.3442976176738739, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.375, "rewards/move_legality_reward/std": 0.4860251843929291, "rewards/pv_length_reward/mean": 0.4166666865348816, "rewards/pv_length_reward/std": 0.21751801669597626, "rewards/pv_quality_reward/mean": 0.27734375, "rewards/pv_quality_reward/std": 0.4093330204486847, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.626276731491089, "sampling/importance_sampling_ratio/mean": 0.8639478087425232, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9508609771728516, "sampling/sampling_logp_difference/mean": 0.022438453510403633, "step": 150, "step_time": 17.946591958403587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 108.9765625, "completions/mean_terminated_length": 108.9765625, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.5504528023302555, "epoch": 0.1208, "frac_reward_zero_std": 0.8125, "grad_norm": 0.2375236600637436, "learning_rate": 9.960573506572391e-06, "loss": -0.0125, "num_tokens": 6347627.0, "reward": 3.9634857177734375, "reward_std": 0.5572640895843506, "rewards/evaluation_direction_reward/mean": 0.859375, "rewards/evaluation_direction_reward/std": 0.28016796708106995, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.4296875, "rewards/move_legality_reward/std": 0.4969765841960907, "rewards/pv_length_reward/mean": 0.500595211982727, "rewards/pv_length_reward/std": 0.22847063839435577, "rewards/pv_quality_reward/mean": 0.181640625, "rewards/pv_quality_reward/std": 0.35731232166290283, "rewards/verbosity_reward/mean": 0.9921875, "rewards/verbosity_reward/std": 0.0883883461356163, "sampling/importance_sampling_ratio/max": 2.593761444091797, "sampling/importance_sampling_ratio/mean": 0.9250300526618958, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5665156841278076, "sampling/sampling_logp_difference/mean": 0.021496471017599106, "step": 151, "step_time": 17.444189988076687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 107.8125, "completions/mean_terminated_length": 107.8125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.5613237954676151, "epoch": 0.1216, "frac_reward_zero_std": 0.9375, "grad_norm": 0.17788714170455933, "learning_rate": 9.960046771251991e-06, "loss": 0.0047, "num_tokens": 6379739.0, "reward": 3.834709644317627, "reward_std": 0.548153281211853, "rewards/evaluation_direction_reward/mean": 0.95703125, "rewards/evaluation_direction_reward/std": 0.12976877391338348, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.375, "rewards/move_legality_reward/std": 0.4860251843929291, "rewards/pv_length_reward/mean": 0.4089285731315613, "rewards/pv_length_reward/std": 0.1627330631017685, "rewards/pv_quality_reward/mean": 0.09375, "rewards/pv_quality_reward/std": 0.19592301547527313, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.553135395050049, "sampling/importance_sampling_ratio/mean": 0.8761867880821228, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9442768096923828, "sampling/sampling_logp_difference/mean": 0.023310033604502678, "step": 152, "step_time": 17.765996269881725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 107.6015625, "completions/mean_terminated_length": 107.6015625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.564702957868576, "epoch": 0.1224, "frac_reward_zero_std": 0.625, "grad_norm": 0.8282121419906616, "learning_rate": 9.959516554779838e-06, "loss": -0.0214, "num_tokens": 6412104.0, "reward": 3.6242809295654297, "reward_std": 0.6350694894790649, "rewards/evaluation_direction_reward/mean": 0.875, "rewards/evaluation_direction_reward/std": 0.18293322622776031, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3046875, "rewards/move_legality_reward/std": 0.46208351850509644, "rewards/pv_length_reward/mean": 0.30396825075149536, "rewards/pv_length_reward/std": 0.21338964998722076, "rewards/pv_quality_reward/mean": 0.140625, "rewards/pv_quality_reward/std": 0.2938845157623291, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.848827362060547, "sampling/importance_sampling_ratio/mean": 0.9873177409172058, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7436319589614868, "sampling/sampling_logp_difference/mean": 0.02303829975426197, "step": 153, "step_time": 18.156866133213043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 106.5390625, "completions/mean_terminated_length": 106.5390625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.5544529594480991, "epoch": 0.1232, "frac_reward_zero_std": 0.625, "grad_norm": 0.6166614294052124, "learning_rate": 9.958982857528053e-06, "loss": -0.0382, "num_tokens": 6444341.0, "reward": 3.3638391494750977, "reward_std": 0.47027361392974854, "rewards/evaluation_direction_reward/mean": 0.830078125, "rewards/evaluation_direction_reward/std": 0.33487215638160706, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.109375, "rewards/move_legality_reward/std": 0.31333550810813904, "rewards/pv_length_reward/mean": 0.4107142984867096, "rewards/pv_length_reward/std": 0.2105066031217575, "rewards/pv_quality_reward/mean": 0.013671875, "rewards/pv_quality_reward/std": 0.07879877090454102, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.823228359222412, "sampling/importance_sampling_ratio/mean": 0.9192638397216797, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7969920635223389, "sampling/sampling_logp_difference/mean": 0.023001158609986305, "step": 154, "step_time": 17.30050053447485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 106.109375, "completions/mean_terminated_length": 106.109375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.5615308657288551, "epoch": 0.124, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6792736053466797, "learning_rate": 9.958445679871204e-06, "loss": 0.0452, "num_tokens": 6476059.0, "reward": 3.6134424209594727, "reward_std": 0.4966457188129425, "rewards/evaluation_direction_reward/mean": 0.857421875, "rewards/evaluation_direction_reward/std": 0.3285670280456543, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1953125, "rewards/move_legality_reward/std": 0.3979988098144531, "rewards/pv_length_reward/mean": 0.42594245076179504, "rewards/pv_length_reward/std": 0.3010059893131256, "rewards/pv_quality_reward/mean": 0.134765625, "rewards/pv_quality_reward/std": 0.3406547009944916, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.665247917175293, "sampling/importance_sampling_ratio/mean": 0.9417818784713745, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8130229711532593, "sampling/sampling_logp_difference/mean": 0.023019785061478615, "step": 155, "step_time": 17.5110786780715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 107.984375, "completions/mean_terminated_length": 107.984375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.5636128336191177, "epoch": 0.1248, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6308668851852417, "learning_rate": 9.957905022186309e-06, "loss": 0.024, "num_tokens": 6508665.0, "reward": 3.5147879123687744, "reward_std": 0.43420496582984924, "rewards/evaluation_direction_reward/mean": 0.859375, "rewards/evaluation_direction_reward/std": 0.2445266991853714, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.125, "rewards/move_legality_reward/std": 0.3320184051990509, "rewards/pv_length_reward/mean": 0.5089285373687744, "rewards/pv_length_reward/std": 0.16912062466144562, "rewards/pv_quality_reward/mean": 0.021484375, "rewards/pv_quality_reward/std": 0.0703432559967041, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.3215010166168213, "sampling/importance_sampling_ratio/mean": 0.8671256899833679, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9216597080230713, "sampling/sampling_logp_difference/mean": 0.023285770788788795, "step": 156, "step_time": 17.44320084899664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 106.4921875, "completions/mean_terminated_length": 106.4921875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.5654417015612125, "epoch": 0.1256, "frac_reward_zero_std": 0.6875, "grad_norm": 1.0366135835647583, "learning_rate": 9.957360884852819e-06, "loss": -0.0101, "num_tokens": 6540560.0, "reward": 3.845517158508301, "reward_std": 0.8522264361381531, "rewards/evaluation_direction_reward/mean": 0.810546875, "rewards/evaluation_direction_reward/std": 0.3133293688297272, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3984375, "rewards/move_legality_reward/std": 0.4915000796318054, "rewards/pv_length_reward/mean": 0.409970223903656, "rewards/pv_length_reward/std": 0.22624458372592926, "rewards/pv_quality_reward/mean": 0.2265625, "rewards/pv_quality_reward/std": 0.37442541122436523, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7511324882507324, "sampling/importance_sampling_ratio/mean": 0.9561112523078918, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4805213212966919, "sampling/sampling_logp_difference/mean": 0.022579701617360115, "step": 157, "step_time": 17.449221082031727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 105.3125, "completions/mean_terminated_length": 105.3125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.5548792332410812, "epoch": 0.1264, "frac_reward_zero_std": 0.6875, "grad_norm": 0.581157386302948, "learning_rate": 9.95681326825263e-06, "loss": -0.003, "num_tokens": 6572672.0, "reward": 3.649348735809326, "reward_std": 0.715802013874054, "rewards/evaluation_direction_reward/mean": 0.798828125, "rewards/evaluation_direction_reward/std": 0.360525906085968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.359375, "rewards/move_legality_reward/std": 0.481702595949173, "rewards/pv_length_reward/mean": 0.3895833492279053, "rewards/pv_length_reward/std": 0.15861110389232635, "rewards/pv_quality_reward/mean": 0.1015625, "rewards/pv_quality_reward/std": 0.25378531217575073, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.73429274559021, "sampling/importance_sampling_ratio/mean": 0.9386183023452759, "sampling/importance_sampling_ratio/min": 0.186435729265213, "sampling/sampling_logp_difference/max": 0.6272387504577637, "sampling/sampling_logp_difference/mean": 0.02261841483414173, "step": 158, "step_time": 17.68643167614937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 105.6796875, "completions/mean_terminated_length": 105.6796875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.5472441129386425, "epoch": 0.1272, "frac_reward_zero_std": 0.8125, "grad_norm": 0.358713299036026, "learning_rate": 9.956262172770082e-06, "loss": 0.005, "num_tokens": 6604687.0, "reward": 3.28322172164917, "reward_std": 0.4894873797893524, "rewards/evaluation_direction_reward/mean": 0.8125, "rewards/evaluation_direction_reward/std": 0.3073893189430237, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.0703125, "rewards/move_legality_reward/std": 0.2566775679588318, "rewards/pv_length_reward/mean": 0.3886904716491699, "rewards/pv_length_reward/std": 0.2653396427631378, "rewards/pv_quality_reward/mean": 0.01171875, "rewards/pv_quality_reward/std": 0.07594143599271774, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.3907558917999268, "sampling/importance_sampling_ratio/mean": 0.9182955026626587, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6980547904968262, "sampling/sampling_logp_difference/mean": 0.02226240560412407, "step": 159, "step_time": 19.1806860268116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 104.40625, "completions/mean_terminated_length": 104.40625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.5600834600627422, "epoch": 0.128, "frac_reward_zero_std": 0.75, "grad_norm": 0.6424727439880371, "learning_rate": 9.955707598791952e-06, "loss": -0.0338, "num_tokens": 6636435.0, "reward": 3.5522446632385254, "reward_std": 0.6902234554290771, "rewards/evaluation_direction_reward/mean": 0.869140625, "rewards/evaluation_direction_reward/std": 0.25672996044158936, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.265625, "rewards/move_legality_reward/std": 0.44340085983276367, "rewards/pv_length_reward/mean": 0.33740079402923584, "rewards/pv_length_reward/std": 0.2110716700553894, "rewards/pv_quality_reward/mean": 0.080078125, "rewards/pv_quality_reward/std": 0.21262697875499725, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8612420558929443, "sampling/importance_sampling_ratio/mean": 0.9719337821006775, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6551001071929932, "sampling/sampling_logp_difference/mean": 0.023226531222462654, "step": 160, "step_time": 17.384050108492374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 104.8828125, "completions/mean_terminated_length": 104.8828125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.5571412444114685, "epoch": 0.1288, "frac_reward_zero_std": 0.8125, "grad_norm": 0.5592122673988342, "learning_rate": 9.955149546707465e-06, "loss": -0.0417, "num_tokens": 6668172.0, "reward": 3.588820695877075, "reward_std": 0.5910348892211914, "rewards/evaluation_direction_reward/mean": 0.76953125, "rewards/evaluation_direction_reward/std": 0.3239297568798065, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.5048363208770752, "rewards/pv_length_reward/std": 0.20747025310993195, "rewards/pv_quality_reward/mean": 0.064453125, "rewards/pv_quality_reward/std": 0.243510439991951, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8970260620117188, "sampling/importance_sampling_ratio/mean": 0.989634096622467, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9575749635696411, "sampling/sampling_logp_difference/mean": 0.023756422102451324, "step": 161, "step_time": 17.596826799213886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 106.7265625, "completions/mean_terminated_length": 106.7265625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.5564213059842587, "epoch": 0.1296, "frac_reward_zero_std": 0.75, "grad_norm": 0.7834507822990417, "learning_rate": 9.95458801690828e-06, "loss": -0.0095, "num_tokens": 6700561.0, "reward": 3.651116371154785, "reward_std": 0.6690790057182312, "rewards/evaluation_direction_reward/mean": 0.794921875, "rewards/evaluation_direction_reward/std": 0.3096759617328644, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.375, "rewards/move_legality_reward/std": 0.4860251843929291, "rewards/pv_length_reward/mean": 0.4089285731315613, "rewards/pv_length_reward/std": 0.18374204635620117, "rewards/pv_quality_reward/mean": 0.072265625, "rewards/pv_quality_reward/std": 0.24533510208129883, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.2791295051574707, "sampling/importance_sampling_ratio/mean": 0.9321703910827637, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7331085801124573, "sampling/sampling_logp_difference/mean": 0.022576220333576202, "step": 162, "step_time": 18.205911807715893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 105.15625, "completions/mean_terminated_length": 105.15625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.559047557413578, "epoch": 0.1304, "frac_reward_zero_std": 0.875, "grad_norm": 0.3011290729045868, "learning_rate": 9.954023009788505e-06, "loss": 0.002, "num_tokens": 6732501.0, "reward": 3.48205828666687, "reward_std": 0.5175552368164062, "rewards/evaluation_direction_reward/mean": 0.818359375, "rewards/evaluation_direction_reward/std": 0.2887572646141052, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1875, "rewards/move_legality_reward/std": 0.39184603095054626, "rewards/pv_length_reward/mean": 0.4312770664691925, "rewards/pv_length_reward/std": 0.25089961290359497, "rewards/pv_quality_reward/mean": 0.044921875, "rewards/pv_quality_reward/std": 0.13099254667758942, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.301983594894409, "sampling/importance_sampling_ratio/mean": 0.9154298305511475, "sampling/importance_sampling_ratio/min": 0.23549729585647583, "sampling/sampling_logp_difference/max": 0.6467527151107788, "sampling/sampling_logp_difference/mean": 0.023160556331276894, "step": 163, "step_time": 17.07457572966814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 106.1875, "completions/mean_terminated_length": 106.1875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.5637382715940475, "epoch": 0.1312, "frac_reward_zero_std": 0.625, "grad_norm": 0.8488302826881409, "learning_rate": 9.95345452574468e-06, "loss": -0.0527, "num_tokens": 6764581.0, "reward": 3.7197482585906982, "reward_std": 0.6171249151229858, "rewards/evaluation_direction_reward/mean": 0.83203125, "rewards/evaluation_direction_reward/std": 0.3262006640434265, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3671875, "rewards/move_legality_reward/std": 0.4839322865009308, "rewards/pv_length_reward/mean": 0.40138888359069824, "rewards/pv_length_reward/std": 0.19840426743030548, "rewards/pv_quality_reward/mean": 0.119140625, "rewards/pv_quality_reward/std": 0.26798468828201294, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.731586217880249, "sampling/importance_sampling_ratio/mean": 0.9906464219093323, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6083393096923828, "sampling/sampling_logp_difference/mean": 0.022500580176711082, "step": 164, "step_time": 17.919625744223595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 104.84375, "completions/mean_terminated_length": 104.84375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.5582937635481358, "epoch": 0.132, "frac_reward_zero_std": 0.625, "grad_norm": 0.5593171119689941, "learning_rate": 9.952882565175788e-06, "loss": -0.0163, "num_tokens": 6796417.0, "reward": 3.918433666229248, "reward_std": 0.6829463839530945, "rewards/evaluation_direction_reward/mean": 0.86328125, "rewards/evaluation_direction_reward/std": 0.23863822221755981, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.484375, "rewards/move_legality_reward/std": 0.5017194747924805, "rewards/pv_length_reward/mean": 0.4047619104385376, "rewards/pv_length_reward/std": 0.18915176391601562, "rewards/pv_quality_reward/mean": 0.166015625, "rewards/pv_quality_reward/std": 0.3218997120857239, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.1419692039489746, "sampling/importance_sampling_ratio/mean": 0.8941982984542847, "sampling/importance_sampling_ratio/min": 0.19038183987140656, "sampling/sampling_logp_difference/max": 0.7160007953643799, "sampling/sampling_logp_difference/mean": 0.023571936413645744, "step": 165, "step_time": 18.00354442745447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 107.0390625, "completions/mean_terminated_length": 107.0390625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.5599941536784172, "epoch": 0.1328, "frac_reward_zero_std": 0.75, "grad_norm": 0.5110878944396973, "learning_rate": 9.952307128483257e-06, "loss": 0.0095, "num_tokens": 6828726.0, "reward": 3.3946614265441895, "reward_std": 0.5754488110542297, "rewards/evaluation_direction_reward/mean": 0.818359375, "rewards/evaluation_direction_reward/std": 0.33307626843452454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1328125, "rewards/move_legality_reward/std": 0.3407054841518402, "rewards/pv_length_reward/mean": 0.42395833134651184, "rewards/pv_length_reward/std": 0.25867074728012085, "rewards/pv_quality_reward/mean": 0.01953125, "rewards/pv_quality_reward/std": 0.10218755900859833, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.798895835876465, "sampling/importance_sampling_ratio/mean": 0.935349702835083, "sampling/importance_sampling_ratio/min": 0.29385635256767273, "sampling/sampling_logp_difference/max": 0.976611852645874, "sampling/sampling_logp_difference/mean": 0.02360164187848568, "step": 166, "step_time": 17.19639215618372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 103.1328125, "completions/mean_terminated_length": 103.1328125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.5553920567035675, "epoch": 0.1336, "frac_reward_zero_std": 0.625, "grad_norm": 0.9265041947364807, "learning_rate": 9.951728216070949e-06, "loss": 0.0251, "num_tokens": 6860527.0, "reward": 3.5316779613494873, "reward_std": 0.5000042915344238, "rewards/evaluation_direction_reward/mean": 0.8515625, "rewards/evaluation_direction_reward/std": 0.2114756554365158, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.2421875, "rewards/move_legality_reward/std": 0.4300905168056488, "rewards/pv_length_reward/mean": 0.37738096714019775, "rewards/pv_length_reward/std": 0.22125202417373657, "rewards/pv_quality_reward/mean": 0.060546875, "rewards/pv_quality_reward/std": 0.16525480151176453, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.709549903869629, "sampling/importance_sampling_ratio/mean": 0.9130164384841919, "sampling/importance_sampling_ratio/min": 0.17163680493831635, "sampling/sampling_logp_difference/max": 0.6682069301605225, "sampling/sampling_logp_difference/mean": 0.022582517936825752, "step": 167, "step_time": 16.891005404293537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 108.109375, "completions/mean_terminated_length": 108.109375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.5724524743855, "epoch": 0.1344, "frac_reward_zero_std": 0.6875, "grad_norm": 0.8069501519203186, "learning_rate": 9.951145828345163e-06, "loss": 0.0322, "num_tokens": 6892965.0, "reward": 3.7048161029815674, "reward_std": 0.5687907338142395, "rewards/evaluation_direction_reward/mean": 0.77734375, "rewards/evaluation_direction_reward/std": 0.2896413207054138, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.375, "rewards/move_legality_reward/std": 0.4860251843929291, "rewards/pv_length_reward/mean": 0.4235660433769226, "rewards/pv_length_reward/std": 0.20054364204406738, "rewards/pv_quality_reward/mean": 0.12890625, "rewards/pv_quality_reward/std": 0.2567674219608307, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.687638998031616, "sampling/importance_sampling_ratio/mean": 0.96921706199646, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7089719772338867, "sampling/sampling_logp_difference/mean": 0.023514650762081146, "step": 168, "step_time": 17.790646754205227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 104.921875, "completions/mean_terminated_length": 104.921875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.5529778003692627, "epoch": 0.1352, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6213428378105164, "learning_rate": 9.950559965714647e-06, "loss": -0.0295, "num_tokens": 6924771.0, "reward": 3.600192070007324, "reward_std": 0.768799364566803, "rewards/evaluation_direction_reward/mean": 0.919921875, "rewards/evaluation_direction_reward/std": 0.18004153668880463, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1953125, "rewards/move_legality_reward/std": 0.3979988098144531, "rewards/pv_length_reward/mean": 0.35605159401893616, "rewards/pv_length_reward/std": 0.2336598038673401, "rewards/pv_quality_reward/mean": 0.12890625, "rewards/pv_quality_reward/std": 0.33199524879455566, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8573246002197266, "sampling/importance_sampling_ratio/mean": 0.9422265291213989, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8667364120483398, "sampling/sampling_logp_difference/mean": 0.022742299363017082, "step": 169, "step_time": 17.452941112220287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 104.8984375, "completions/mean_terminated_length": 104.8984375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.5526913814246655, "epoch": 0.136, "frac_reward_zero_std": 0.8125, "grad_norm": 0.6080096364021301, "learning_rate": 9.94997062859058e-06, "loss": 0.0097, "num_tokens": 6956990.0, "reward": 3.7073745727539062, "reward_std": 0.46168726682662964, "rewards/evaluation_direction_reward/mean": 0.916015625, "rewards/evaluation_direction_reward/std": 0.1754559427499771, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1796875, "rewards/move_legality_reward/std": 0.3854354918003082, "rewards/pv_length_reward/mean": 0.5452651381492615, "rewards/pv_length_reward/std": 0.2423442155122757, "rewards/pv_quality_reward/mean": 0.06640625, "rewards/pv_quality_reward/std": 0.24399152398109436, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8037397861480713, "sampling/importance_sampling_ratio/mean": 0.9787135720252991, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6654542684555054, "sampling/sampling_logp_difference/mean": 0.022223224863409996, "step": 170, "step_time": 17.29078072309494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 105.6484375, "completions/mean_terminated_length": 105.6484375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.5649156719446182, "epoch": 0.1368, "frac_reward_zero_std": 0.625, "grad_norm": 0.6106797456741333, "learning_rate": 9.94937781738658e-06, "loss": -0.0094, "num_tokens": 6988881.0, "reward": 3.363715171813965, "reward_std": 0.461304247379303, "rewards/evaluation_direction_reward/mean": 0.82421875, "rewards/evaluation_direction_reward/std": 0.28471454977989197, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.0625, "rewards/move_legality_reward/std": 0.24301259219646454, "rewards/pv_length_reward/mean": 0.4262152910232544, "rewards/pv_length_reward/std": 0.1555539071559906, "rewards/pv_quality_reward/mean": 0.05078125, "rewards/pv_quality_reward/std": 0.19930829107761383, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.571122884750366, "sampling/importance_sampling_ratio/mean": 0.9493281245231628, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.596631646156311, "sampling/sampling_logp_difference/mean": 0.02225419506430626, "step": 171, "step_time": 17.29028182476759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 105.5, "completions/mean_terminated_length": 105.5, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.5621819980442524, "epoch": 0.1376, "frac_reward_zero_std": 0.8125, "grad_norm": 0.5356910824775696, "learning_rate": 9.948781532518706e-06, "loss": -0.0077, "num_tokens": 7020985.0, "reward": 3.483910083770752, "reward_std": 0.6563894152641296, "rewards/evaluation_direction_reward/mean": 0.728515625, "rewards/evaluation_direction_reward/std": 0.35219690203666687, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.4389880895614624, "rewards/pv_length_reward/std": 0.22660206258296967, "rewards/pv_quality_reward/mean": 0.06640625, "rewards/pv_quality_reward/std": 0.17880941927433014, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8378660678863525, "sampling/importance_sampling_ratio/mean": 0.9796107411384583, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.790198564529419, "sampling/sampling_logp_difference/mean": 0.022769639268517494, "step": 172, "step_time": 17.40231452137232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 104.265625, "completions/mean_terminated_length": 104.265625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.5544756464660168, "epoch": 0.1384, "frac_reward_zero_std": 0.75, "grad_norm": 0.6398528814315796, "learning_rate": 9.948181774405453e-06, "loss": -0.0027, "num_tokens": 7052747.0, "reward": 3.7259860038757324, "reward_std": 0.7076936364173889, "rewards/evaluation_direction_reward/mean": 0.88671875, "rewards/evaluation_direction_reward/std": 0.24674928188323975, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.375, "rewards/move_legality_reward/std": 0.4860251843929291, "rewards/pv_length_reward/mean": 0.34122025966644287, "rewards/pv_length_reward/std": 0.18137365579605103, "rewards/pv_quality_reward/mean": 0.123046875, "rewards/pv_quality_reward/std": 0.20087429881095886, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7164742946624756, "sampling/importance_sampling_ratio/mean": 1.0083839893341064, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5640368461608887, "sampling/sampling_logp_difference/mean": 0.022496085613965988, "step": 173, "step_time": 17.22868785262108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 106.984375, "completions/mean_terminated_length": 106.984375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.5621345788240433, "epoch": 0.1392, "frac_reward_zero_std": 0.875, "grad_norm": 0.2836557924747467, "learning_rate": 9.947578543467755e-06, "loss": 0.0055, "num_tokens": 7085001.0, "reward": 3.329631805419922, "reward_std": 0.5376916527748108, "rewards/evaluation_direction_reward/mean": 0.798828125, "rewards/evaluation_direction_reward/std": 0.360525906085968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.125, "rewards/move_legality_reward/std": 0.3320184051990509, "rewards/pv_length_reward/mean": 0.4058035612106323, "rewards/pv_length_reward/std": 0.20633454620838165, "rewards/pv_quality_reward/mean": 0.0, "rewards/pv_quality_reward/std": 0.0, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7499806880950928, "sampling/importance_sampling_ratio/mean": 0.9680594205856323, "sampling/importance_sampling_ratio/min": 0.13813014328479767, "sampling/sampling_logp_difference/max": 0.9380855560302734, "sampling/sampling_logp_difference/mean": 0.023233771324157715, "step": 174, "step_time": 17.456972628831863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 106.75, "completions/mean_terminated_length": 106.75, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.5597610622644424, "epoch": 0.14, "frac_reward_zero_std": 0.75, "grad_norm": 0.5446375012397766, "learning_rate": 9.946971840128982e-06, "loss": 0.0023, "num_tokens": 7117233.0, "reward": 3.4210939407348633, "reward_std": 0.49565696716308594, "rewards/evaluation_direction_reward/mean": 0.814453125, "rewards/evaluation_direction_reward/std": 0.33825352787971497, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.125, "rewards/move_legality_reward/std": 0.3320184051990509, "rewards/pv_length_reward/mean": 0.45429688692092896, "rewards/pv_length_reward/std": 0.21295782923698425, "rewards/pv_quality_reward/mean": 0.02734375, "rewards/pv_quality_reward/std": 0.11413132399320602, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.440307378768921, "sampling/importance_sampling_ratio/mean": 1.0100600719451904, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.704348087310791, "sampling/sampling_logp_difference/mean": 0.022500405088067055, "step": 175, "step_time": 17.750726744532585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 106.953125, "completions/mean_terminated_length": 106.953125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.5679527297616005, "epoch": 0.1408, "frac_reward_zero_std": 0.75, "grad_norm": 0.6395970582962036, "learning_rate": 9.946361664814942e-06, "loss": 0.0006, "num_tokens": 7148987.0, "reward": 3.5812995433807373, "reward_std": 0.3904765546321869, "rewards/evaluation_direction_reward/mean": 0.921875, "rewards/evaluation_direction_reward/std": 0.1739705353975296, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.171875, "rewards/move_legality_reward/std": 0.3787541687488556, "rewards/pv_length_reward/mean": 0.4406746029853821, "rewards/pv_length_reward/std": 0.2190672904253006, "rewards/pv_quality_reward/mean": 0.046875, "rewards/pv_quality_reward/std": 0.1822594404220581, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9676342010498047, "sampling/importance_sampling_ratio/mean": 0.9778540730476379, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6778785586357117, "sampling/sampling_logp_difference/mean": 0.022821398451924324, "step": 176, "step_time": 17.25733708590269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 106.0703125, "completions/mean_terminated_length": 106.0703125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.5657963007688522, "epoch": 0.1416, "frac_reward_zero_std": 0.875, "grad_norm": 0.7542115449905396, "learning_rate": 9.94574801795388e-06, "loss": 0.0187, "num_tokens": 7181396.0, "reward": 3.8877604007720947, "reward_std": 0.5708447694778442, "rewards/evaluation_direction_reward/mean": 0.90625, "rewards/evaluation_direction_reward/std": 0.1746762990951538, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.421875, "rewards/move_legality_reward/std": 0.4957992732524872, "rewards/pv_length_reward/mean": 0.3760416805744171, "rewards/pv_length_reward/std": 0.18800130486488342, "rewards/pv_quality_reward/mean": 0.18359375, "rewards/pv_quality_reward/std": 0.265252023935318, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.6709601879119873, "sampling/importance_sampling_ratio/mean": 0.9732141494750977, "sampling/importance_sampling_ratio/min": 0.1372266560792923, "sampling/sampling_logp_difference/max": 0.9627337455749512, "sampling/sampling_logp_difference/mean": 0.02274094521999359, "step": 177, "step_time": 19.33628825098276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 107.4140625, "completions/mean_terminated_length": 107.4140625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.5571431331336498, "epoch": 0.1424, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6299512982368469, "learning_rate": 9.945130899976477e-06, "loss": -0.0102, "num_tokens": 7213881.0, "reward": 3.516368865966797, "reward_std": 0.7321932315826416, "rewards/evaluation_direction_reward/mean": 0.830078125, "rewards/evaluation_direction_reward/std": 0.2604765295982361, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.2976190447807312, "rewards/pv_length_reward/std": 0.21025200188159943, "rewards/pv_quality_reward/mean": 0.138671875, "rewards/pv_quality_reward/std": 0.2749527394771576, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.993873357772827, "sampling/importance_sampling_ratio/mean": 0.9990828037261963, "sampling/importance_sampling_ratio/min": 0.26808249950408936, "sampling/sampling_logp_difference/max": 0.5591051578521729, "sampling/sampling_logp_difference/mean": 0.022503292188048363, "step": 178, "step_time": 18.242841839790344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 106.1015625, "completions/mean_terminated_length": 106.1015625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.5633266866207123, "epoch": 0.1432, "frac_reward_zero_std": 0.75, "grad_norm": 0.45144641399383545, "learning_rate": 9.94451031131585e-06, "loss": -0.0217, "num_tokens": 7245862.0, "reward": 3.521949529647827, "reward_std": 0.5546077489852905, "rewards/evaluation_direction_reward/mean": 0.875, "rewards/evaluation_direction_reward/std": 0.2716963291168213, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.2421875, "rewards/move_legality_reward/std": 0.4300905168056488, "rewards/pv_length_reward/mean": 0.4047619104385376, "rewards/pv_length_reward/std": 0.19783318042755127, "rewards/pv_quality_reward/mean": 0.0, "rewards/pv_quality_reward/std": 0.0, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.444666624069214, "sampling/importance_sampling_ratio/mean": 1.0111262798309326, "sampling/importance_sampling_ratio/min": 0.23065423965454102, "sampling/sampling_logp_difference/max": 0.5609273910522461, "sampling/sampling_logp_difference/mean": 0.021906377747654915, "step": 179, "step_time": 16.8648212403059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 105.9140625, "completions/mean_terminated_length": 105.9140625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.5559212788939476, "epoch": 0.144, "frac_reward_zero_std": 0.8125, "grad_norm": 0.5583869814872742, "learning_rate": 9.943886252407551e-06, "loss": 0.0088, "num_tokens": 7277731.0, "reward": 3.3839471340179443, "reward_std": 0.554033100605011, "rewards/evaluation_direction_reward/mean": 0.748046875, "rewards/evaluation_direction_reward/std": 0.37318602204322815, "rewards/format_reward/mean": 0.9976562261581421, "rewards/format_reward/std": 0.02651650458574295, "rewards/move_legality_reward/mean": 0.1796875, "rewards/move_legality_reward/std": 0.3854354918003082, "rewards/pv_length_reward/mean": 0.42730656266212463, "rewards/pv_length_reward/std": 0.23884059488773346, "rewards/pv_quality_reward/mean": 0.03125, "rewards/pv_quality_reward/std": 0.12150629609823227, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8952369689941406, "sampling/importance_sampling_ratio/mean": 0.9812285900115967, "sampling/importance_sampling_ratio/min": 0.18206524848937988, "sampling/sampling_logp_difference/max": 0.5646779537200928, "sampling/sampling_logp_difference/mean": 0.022296246141195297, "step": 180, "step_time": 16.95820613950491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 105.796875, "completions/mean_terminated_length": 105.796875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.5675275921821594, "epoch": 0.1448, "frac_reward_zero_std": 0.8125, "grad_norm": 0.4850504398345947, "learning_rate": 9.94325872368957e-06, "loss": -0.009, "num_tokens": 7309753.0, "reward": 3.7679128646850586, "reward_std": 0.6641586422920227, "rewards/evaluation_direction_reward/mean": 0.87890625, "rewards/evaluation_direction_reward/std": 0.21274447441101074, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3046875, "rewards/move_legality_reward/std": 0.46208351850509644, "rewards/pv_length_reward/mean": 0.49642857909202576, "rewards/pv_length_reward/std": 0.23449090123176575, "rewards/pv_quality_reward/mean": 0.087890625, "rewards/pv_quality_reward/std": 0.25601011514663696, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8980162143707275, "sampling/importance_sampling_ratio/mean": 0.9313772916793823, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8153598308563232, "sampling/sampling_logp_difference/mean": 0.022556111216545105, "step": 181, "step_time": 17.89696927368641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 106.59375, "completions/mean_terminated_length": 106.59375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.5728050097823143, "epoch": 0.1456, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5611481070518494, "learning_rate": 9.942627725602332e-06, "loss": -0.0237, "num_tokens": 7341677.0, "reward": 3.788268804550171, "reward_std": 0.5842961668968201, "rewards/evaluation_direction_reward/mean": 0.95703125, "rewards/evaluation_direction_reward/std": 0.1219484806060791, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.375, "rewards/move_legality_reward/std": 0.4860251843929291, "rewards/pv_length_reward/mean": 0.3742063641548157, "rewards/pv_length_reward/std": 0.22277042269706726, "rewards/pv_quality_reward/mean": 0.08203125, "rewards/pv_quality_reward/std": 0.1777743399143219, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8359181880950928, "sampling/importance_sampling_ratio/mean": 0.9917862415313721, "sampling/importance_sampling_ratio/min": 0.299421101808548, "sampling/sampling_logp_difference/max": 0.6487910747528076, "sampling/sampling_logp_difference/mean": 0.023485254496335983, "step": 182, "step_time": 17.385075256228447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 105.9921875, "completions/mean_terminated_length": 105.9921875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.5631194710731506, "epoch": 0.1464, "frac_reward_zero_std": 0.8125, "grad_norm": 0.4433145523071289, "learning_rate": 9.941993258588691e-06, "loss": 0.002, "num_tokens": 7373852.0, "reward": 3.716796875, "reward_std": 0.4550604224205017, "rewards/evaluation_direction_reward/mean": 0.931640625, "rewards/evaluation_direction_reward/std": 0.17101722955703735, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3046875, "rewards/move_legality_reward/std": 0.46208351850509644, "rewards/pv_length_reward/mean": 0.390625, "rewards/pv_length_reward/std": 0.17923417687416077, "rewards/pv_quality_reward/mean": 0.08984375, "rewards/pv_quality_reward/std": 0.20747444033622742, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.650259256362915, "sampling/importance_sampling_ratio/mean": 0.9530683159828186, "sampling/importance_sampling_ratio/min": 0.21120592951774597, "sampling/sampling_logp_difference/max": 0.7153493165969849, "sampling/sampling_logp_difference/mean": 0.023369072005152702, "step": 183, "step_time": 17.851952828466892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 105.0859375, "completions/mean_terminated_length": 105.0859375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.5489344969391823, "epoch": 0.1472, "frac_reward_zero_std": 0.75, "grad_norm": 0.42139583826065063, "learning_rate": 9.941355323093944e-06, "loss": -0.0486, "num_tokens": 7405847.0, "reward": 3.3617746829986572, "reward_std": 0.5989161133766174, "rewards/evaluation_direction_reward/mean": 0.736328125, "rewards/evaluation_direction_reward/std": 0.333952397108078, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.125, "rewards/move_legality_reward/std": 0.3320184051990509, "rewards/pv_length_reward/mean": 0.4379464387893677, "rewards/pv_length_reward/std": 0.18339279294013977, "rewards/pv_quality_reward/mean": 0.0625, "rewards/pv_quality_reward/std": 0.24301259219646454, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.2484259605407715, "sampling/importance_sampling_ratio/mean": 0.9509456157684326, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7091975212097168, "sampling/sampling_logp_difference/mean": 0.021926086395978928, "step": 184, "step_time": 18.096101112663746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 102.8203125, "completions/mean_terminated_length": 102.8203125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.5514926947653294, "epoch": 0.148, "frac_reward_zero_std": 0.75, "grad_norm": 0.6903696060180664, "learning_rate": 9.940713919565819e-06, "loss": -0.0062, "num_tokens": 7437536.0, "reward": 3.506063938140869, "reward_std": 0.5921996235847473, "rewards/evaluation_direction_reward/mean": 0.814453125, "rewards/evaluation_direction_reward/std": 0.3233773708343506, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1875, "rewards/move_legality_reward/std": 0.39184603095054626, "rewards/pv_length_reward/mean": 0.4162202477455139, "rewards/pv_length_reward/std": 0.2187967747449875, "rewards/pv_quality_reward/mean": 0.087890625, "rewards/pv_quality_reward/std": 0.20713134109973907, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.896007537841797, "sampling/importance_sampling_ratio/mean": 1.0009047985076904, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.47212719917297363, "sampling/sampling_logp_difference/mean": 0.02198231965303421, "step": 185, "step_time": 17.08118373155594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 103.8359375, "completions/mean_terminated_length": 103.8359375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.5532329119741917, "epoch": 0.1488, "frac_reward_zero_std": 0.875, "grad_norm": 0.6309822797775269, "learning_rate": 9.940069048454478e-06, "loss": 0.037, "num_tokens": 7469331.0, "reward": 3.6109561920166016, "reward_std": 0.573309063911438, "rewards/evaluation_direction_reward/mean": 0.923828125, "rewards/evaluation_direction_reward/std": 0.20705707371234894, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.3746279776096344, "rewards/pv_length_reward/std": 0.2540659010410309, "rewards/pv_quality_reward/mean": 0.0625, "rewards/pv_quality_reward/std": 0.24301259219646454, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.71579909324646, "sampling/importance_sampling_ratio/mean": 0.9548814296722412, "sampling/importance_sampling_ratio/min": 0.20091311633586884, "sampling/sampling_logp_difference/max": 0.6205998659133911, "sampling/sampling_logp_difference/mean": 0.022699905559420586, "step": 186, "step_time": 17.663258962333202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 103.234375, "completions/mean_terminated_length": 103.234375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.5433066338300705, "epoch": 0.1496, "frac_reward_zero_std": 0.875, "grad_norm": 0.4652441442012787, "learning_rate": 9.939420710212511e-06, "loss": 0.007, "num_tokens": 7500969.0, "reward": 3.6004278659820557, "reward_std": 0.5650128722190857, "rewards/evaluation_direction_reward/mean": 0.888671875, "rewards/evaluation_direction_reward/std": 0.20041441917419434, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.36800599098205566, "rewards/pv_length_reward/std": 0.15879806876182556, "rewards/pv_quality_reward/mean": 0.09375, "rewards/pv_quality_reward/std": 0.26435181498527527, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.6764185428619385, "sampling/importance_sampling_ratio/mean": 0.9610778093338013, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5218560695648193, "sampling/sampling_logp_difference/mean": 0.022050349041819572, "step": 187, "step_time": 17.40953340381384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 102.1953125, "completions/mean_terminated_length": 102.1953125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.5397511385381222, "epoch": 0.1504, "frac_reward_zero_std": 0.75, "grad_norm": 0.7369158864021301, "learning_rate": 9.938768905294954e-06, "loss": 0.0471, "num_tokens": 7532650.0, "reward": 3.5918984413146973, "reward_std": 0.5889501571655273, "rewards/evaluation_direction_reward/mean": 0.890625, "rewards/evaluation_direction_reward/std": 0.19529405236244202, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3125, "rewards/move_legality_reward/std": 0.4653336703777313, "rewards/pv_length_reward/mean": 0.3360389769077301, "rewards/pv_length_reward/std": 0.18596291542053223, "rewards/pv_quality_reward/mean": 0.052734375, "rewards/pv_quality_reward/std": 0.1493171602487564, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7131783962249756, "sampling/importance_sampling_ratio/mean": 0.9174181222915649, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5870134830474854, "sampling/sampling_logp_difference/mean": 0.022436682134866714, "step": 188, "step_time": 18.629171803593636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 101.171875, "completions/mean_terminated_length": 101.171875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.5387878082692623, "epoch": 0.1512, "frac_reward_zero_std": 0.5625, "grad_norm": 0.8380022048950195, "learning_rate": 9.938113634159266e-06, "loss": 0.0613, "num_tokens": 7563864.0, "reward": 3.6798176765441895, "reward_std": 0.5449265837669373, "rewards/evaluation_direction_reward/mean": 0.896484375, "rewards/evaluation_direction_reward/std": 0.17333954572677612, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3125, "rewards/move_legality_reward/std": 0.4653336703777313, "rewards/pv_length_reward/mean": 0.40833336114883423, "rewards/pv_length_reward/std": 0.16372737288475037, "rewards/pv_quality_reward/mean": 0.0625, "rewards/pv_quality_reward/std": 0.14376819133758545, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9572815895080566, "sampling/importance_sampling_ratio/mean": 0.9866320490837097, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.58510422706604, "sampling/sampling_logp_difference/mean": 0.021635601297020912, "step": 189, "step_time": 17.45724617689848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 101.6796875, "completions/mean_terminated_length": 101.6796875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.5269472468644381, "epoch": 0.152, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6876732110977173, "learning_rate": 9.937454897265338e-06, "loss": 0.0062, "num_tokens": 7595567.0, "reward": 3.6542844772338867, "reward_std": 0.5664248466491699, "rewards/evaluation_direction_reward/mean": 0.873046875, "rewards/evaluation_direction_reward/std": 0.2430046796798706, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3125, "rewards/move_legality_reward/std": 0.4653336703777313, "rewards/pv_length_reward/mean": 0.3867063522338867, "rewards/pv_length_reward/std": 0.17866551876068115, "rewards/pv_quality_reward/mean": 0.08203125, "rewards/pv_quality_reward/std": 0.185893714427948, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9186556339263916, "sampling/importance_sampling_ratio/mean": 0.9718688130378723, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8393466472625732, "sampling/sampling_logp_difference/mean": 0.0214095301926136, "step": 190, "step_time": 17.754529558122158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 97.7421875, "completions/mean_terminated_length": 97.7421875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.5440040174871683, "epoch": 0.1528, "frac_reward_zero_std": 0.6875, "grad_norm": 0.481954962015152, "learning_rate": 9.936792695075502e-06, "loss": -0.0139, "num_tokens": 7626206.0, "reward": 3.6537373065948486, "reward_std": 0.6429774165153503, "rewards/evaluation_direction_reward/mean": 0.916015625, "rewards/evaluation_direction_reward/std": 0.18633785843849182, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.375, "rewards/move_legality_reward/std": 0.4860251843929291, "rewards/pv_length_reward/mean": 0.2982684075832367, "rewards/pv_length_reward/std": 0.17314818501472473, "rewards/pv_quality_reward/mean": 0.064453125, "rewards/pv_quality_reward/std": 0.1667371392250061, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.6048381328582764, "sampling/importance_sampling_ratio/mean": 0.9665162563323975, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9910125732421875, "sampling/sampling_logp_difference/mean": 0.022327639162540436, "step": 191, "step_time": 17.304198570549488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 96.3359375, "completions/mean_terminated_length": 96.3359375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.4994178880006075, "epoch": 0.1536, "frac_reward_zero_std": 0.75, "grad_norm": 1.2280173301696777, "learning_rate": 9.936127028054516e-06, "loss": 0.0112, "num_tokens": 7657105.0, "reward": 3.859189033508301, "reward_std": 0.6942920684814453, "rewards/evaluation_direction_reward/mean": 0.78125, "rewards/evaluation_direction_reward/std": 0.32147544622421265, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.5, "rewards/move_legality_reward/std": 0.5019646286964417, "rewards/pv_length_reward/mean": 0.394345223903656, "rewards/pv_length_reward/std": 0.16858211159706116, "rewards/pv_quality_reward/mean": 0.18359375, "rewards/pv_quality_reward/std": 0.2934393584728241, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9024710655212402, "sampling/importance_sampling_ratio/mean": 1.025327205657959, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7498941421508789, "sampling/sampling_logp_difference/mean": 0.02057693712413311, "step": 192, "step_time": 17.56568543612957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 95.125, "completions/mean_terminated_length": 95.125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.5106205958873034, "epoch": 0.1544, "frac_reward_zero_std": 0.8125, "grad_norm": 0.4225583076477051, "learning_rate": 9.935457896669568e-06, "loss": -0.0396, "num_tokens": 7687545.0, "reward": 3.425520896911621, "reward_std": 0.5610963106155396, "rewards/evaluation_direction_reward/mean": 0.7421875, "rewards/evaluation_direction_reward/std": 0.3259412348270416, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1875, "rewards/move_legality_reward/std": 0.39184603095054626, "rewards/pv_length_reward/mean": 0.4333333373069763, "rewards/pv_length_reward/std": 0.19691407680511475, "rewards/pv_quality_reward/mean": 0.0625, "rewards/pv_quality_reward/std": 0.16600920259952545, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9908363819122314, "sampling/importance_sampling_ratio/mean": 0.9754372835159302, "sampling/importance_sampling_ratio/min": 0.2748029828071594, "sampling/sampling_logp_difference/max": 0.7484798431396484, "sampling/sampling_logp_difference/mean": 0.02169005759060383, "step": 193, "step_time": 17.034854620695114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 94.7265625, "completions/mean_terminated_length": 94.7265625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.5020957458764315, "epoch": 0.1552, "frac_reward_zero_std": 0.8125, "grad_norm": 0.48397138714790344, "learning_rate": 9.934785301390282e-06, "loss": -0.0188, "num_tokens": 7718190.0, "reward": 3.6909596920013428, "reward_std": 0.7089264988899231, "rewards/evaluation_direction_reward/mean": 0.90625, "rewards/evaluation_direction_reward/std": 0.18823674321174622, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.4526785910129547, "rewards/pv_length_reward/std": 0.22079943120479584, "rewards/pv_quality_reward/mean": 0.08203125, "rewards/pv_quality_reward/std": 0.17214880883693695, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7807812690734863, "sampling/importance_sampling_ratio/mean": 0.8962146043777466, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7177703380584717, "sampling/sampling_logp_difference/mean": 0.022578811272978783, "step": 194, "step_time": 16.694207213819027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 95.546875, "completions/mean_terminated_length": 95.546875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.523994192481041, "epoch": 0.156, "frac_reward_zero_std": 0.75, "grad_norm": 0.6312493681907654, "learning_rate": 9.934109242688712e-06, "loss": 0.0435, "num_tokens": 7748764.0, "reward": 3.6042370796203613, "reward_std": 0.814193069934845, "rewards/evaluation_direction_reward/mean": 0.8984375, "rewards/evaluation_direction_reward/std": 0.1893770843744278, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1875, "rewards/move_legality_reward/std": 0.39184603095054626, "rewards/pv_length_reward/mean": 0.42454975843429565, "rewards/pv_length_reward/std": 0.25442826747894287, "rewards/pv_quality_reward/mean": 0.09375, "rewards/pv_quality_reward/std": 0.26435181498527527, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.886199712753296, "sampling/importance_sampling_ratio/mean": 1.004776120185852, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6160609722137451, "sampling/sampling_logp_difference/mean": 0.02200045995414257, "step": 195, "step_time": 16.637334309518337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 93.15625, "completions/mean_terminated_length": 93.15625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.49758128449320793, "epoch": 0.1568, "frac_reward_zero_std": 0.8125, "grad_norm": 0.5702590942382812, "learning_rate": 9.93342972103934e-06, "loss": 0.0106, "num_tokens": 7779360.0, "reward": 3.612332582473755, "reward_std": 0.6428152918815613, "rewards/evaluation_direction_reward/mean": 0.8515625, "rewards/evaluation_direction_reward/std": 0.3080889582633972, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3125, "rewards/move_legality_reward/std": 0.4653336703777313, "rewards/pv_length_reward/mean": 0.3642857074737549, "rewards/pv_length_reward/std": 0.19671812653541565, "rewards/pv_quality_reward/mean": 0.083984375, "rewards/pv_quality_reward/std": 0.23325271904468536, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.827693223953247, "sampling/importance_sampling_ratio/mean": 0.985292911529541, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7441859245300293, "sampling/sampling_logp_difference/mean": 0.021757176145911217, "step": 196, "step_time": 17.011983945965767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 93.34375, "completions/mean_terminated_length": 93.34375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.5036126412451267, "epoch": 0.1576, "frac_reward_zero_std": 0.875, "grad_norm": 0.4041709303855896, "learning_rate": 9.932746736919084e-06, "loss": 0.0161, "num_tokens": 7810004.0, "reward": 3.493880271911621, "reward_std": 0.5657932162284851, "rewards/evaluation_direction_reward/mean": 0.85546875, "rewards/evaluation_direction_reward/std": 0.25021520256996155, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1875, "rewards/move_legality_reward/std": 0.39184603095054626, "rewards/pv_length_reward/mean": 0.3708333373069763, "rewards/pv_length_reward/std": 0.15099553763866425, "rewards/pv_quality_reward/mean": 0.080078125, "rewards/pv_quality_reward/std": 0.21946066617965698, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.925686836242676, "sampling/importance_sampling_ratio/mean": 1.011624813079834, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7596330642700195, "sampling/sampling_logp_difference/mean": 0.021813079714775085, "step": 197, "step_time": 16.88676591962576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 91.8515625, "completions/mean_terminated_length": 91.8515625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.5057928431779146, "epoch": 0.1584, "frac_reward_zero_std": 0.8125, "grad_norm": 0.4539107084274292, "learning_rate": 9.932060290807283e-06, "loss": 0.0042, "num_tokens": 7840273.0, "reward": 3.573195695877075, "reward_std": 0.7329097986221313, "rewards/evaluation_direction_reward/mean": 0.888671875, "rewards/evaluation_direction_reward/std": 0.2785094380378723, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.125, "rewards/move_legality_reward/std": 0.3320184051990509, "rewards/pv_length_reward/mean": 0.4970238208770752, "rewards/pv_length_reward/std": 0.2425006777048111, "rewards/pv_quality_reward/mean": 0.0625, "rewards/pv_quality_reward/std": 0.24301259219646454, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.6748270988464355, "sampling/importance_sampling_ratio/mean": 0.9081436991691589, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8463406562805176, "sampling/sampling_logp_difference/mean": 0.02121604047715664, "step": 198, "step_time": 16.42266035079956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 119.0, "completions/max_terminated_length": 119.0, "completions/mean_length": 91.8046875, "completions/mean_terminated_length": 91.8046875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.49744176119565964, "epoch": 0.1592, "frac_reward_zero_std": 0.625, "grad_norm": 0.7946390509605408, "learning_rate": 9.931370383185717e-06, "loss": -0.0234, "num_tokens": 7870648.0, "reward": 3.424814224243164, "reward_std": 0.6656935214996338, "rewards/evaluation_direction_reward/mean": 0.732421875, "rewards/evaluation_direction_reward/std": 0.2861356735229492, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1875, "rewards/move_legality_reward/std": 0.39184603095054626, "rewards/pv_length_reward/mean": 0.39551711082458496, "rewards/pv_length_reward/std": 0.18057496845722198, "rewards/pv_quality_reward/mean": 0.109375, "rewards/pv_quality_reward/std": 0.2657443881034851, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.454230785369873, "sampling/importance_sampling_ratio/mean": 1.0400601625442505, "sampling/importance_sampling_ratio/min": 0.2406150996685028, "sampling/sampling_logp_difference/max": 0.7002571821212769, "sampling/sampling_logp_difference/mean": 0.020306896418333054, "step": 199, "step_time": 16.991892993450165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 89.9140625, "completions/mean_terminated_length": 89.9140625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.4959145560860634, "epoch": 0.16, "frac_reward_zero_std": 0.5625, "grad_norm": 0.9086190462112427, "learning_rate": 9.930677014538587e-06, "loss": -0.0335, "num_tokens": 7900813.0, "reward": 3.5365328788757324, "reward_std": 0.8167019486427307, "rewards/evaluation_direction_reward/mean": 0.68359375, "rewards/evaluation_direction_reward/std": 0.2849305272102356, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3125, "rewards/move_legality_reward/std": 0.4653336703777313, "rewards/pv_length_reward/mean": 0.33340775966644287, "rewards/pv_length_reward/std": 0.14213789999485016, "rewards/pv_quality_reward/mean": 0.20703125, "rewards/pv_quality_reward/std": 0.3673556447029114, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.749330759048462, "sampling/importance_sampling_ratio/mean": 1.012747049331665, "sampling/importance_sampling_ratio/min": 0.2762693762779236, "sampling/sampling_logp_difference/max": 0.6478204727172852, "sampling/sampling_logp_difference/mean": 0.020439568907022476, "step": 200, "step_time": 17.443434849381447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 89.796875, "completions/mean_terminated_length": 89.796875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.48098448663949966, "epoch": 0.1608, "frac_reward_zero_std": 0.625, "grad_norm": 0.976069450378418, "learning_rate": 9.929980185352525e-06, "loss": 0.0323, "num_tokens": 7930995.0, "reward": 3.3753533363342285, "reward_std": 0.7001367807388306, "rewards/evaluation_direction_reward/mean": 0.716796875, "rewards/evaluation_direction_reward/std": 0.3540695309638977, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.3773065507411957, "rewards/pv_length_reward/std": 0.17400625348091125, "rewards/pv_quality_reward/mean": 0.03125, "rewards/pv_quality_reward/std": 0.12150629609823227, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.894320249557495, "sampling/importance_sampling_ratio/mean": 0.9629106521606445, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8189761638641357, "sampling/sampling_logp_difference/mean": 0.021492721512913704, "step": 201, "step_time": 17.097130052745342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 90.0859375, "completions/mean_terminated_length": 90.0859375, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.5052812527865171, "epoch": 0.1616, "frac_reward_zero_std": 0.875, "grad_norm": 0.6507732272148132, "learning_rate": 9.929279896116595e-06, "loss": 0.0136, "num_tokens": 7961182.0, "reward": 3.6883556842803955, "reward_std": 0.623706579208374, "rewards/evaluation_direction_reward/mean": 0.859375, "rewards/evaluation_direction_reward/std": 0.3316476345062256, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3046875, "rewards/move_legality_reward/std": 0.46208351850509644, "rewards/pv_length_reward/mean": 0.3797619044780731, "rewards/pv_length_reward/std": 0.2647717595100403, "rewards/pv_quality_reward/mean": 0.14453125, "rewards/pv_quality_reward/std": 0.31624236702919006, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7747654914855957, "sampling/importance_sampling_ratio/mean": 0.9195259809494019, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6850378513336182, "sampling/sampling_logp_difference/mean": 0.022275423631072044, "step": 202, "step_time": 16.233024388551712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 90.0625, "completions/mean_terminated_length": 90.0625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.49305158108472824, "epoch": 0.1624, "frac_reward_zero_std": 0.6875, "grad_norm": 0.8178473114967346, "learning_rate": 9.928576147322283e-06, "loss": 0.0069, "num_tokens": 7991454.0, "reward": 3.4859375953674316, "reward_std": 0.45043203234672546, "rewards/evaluation_direction_reward/mean": 0.875, "rewards/evaluation_direction_reward/std": 0.20331890881061554, "rewards/format_reward/mean": 0.9984375238418579, "rewards/format_reward/std": 0.0176776684820652, "rewards/move_legality_reward/mean": 0.1875, "rewards/move_legality_reward/std": 0.39184603095054626, "rewards/pv_length_reward/mean": 0.42500001192092896, "rewards/pv_length_reward/std": 0.20713435113430023, "rewards/pv_quality_reward/mean": 0.0, "rewards/pv_quality_reward/std": 0.0, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7951643466949463, "sampling/importance_sampling_ratio/mean": 0.9858810305595398, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.722325325012207, "sampling/sampling_logp_difference/mean": 0.021333681419491768, "step": 203, "step_time": 16.193083599209785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 90.375, "completions/mean_terminated_length": 90.375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.4934678543359041, "epoch": 0.1632, "frac_reward_zero_std": 0.9375, "grad_norm": 0.21664094924926758, "learning_rate": 9.927868939463511e-06, "loss": -0.0099, "num_tokens": 8021422.0, "reward": 3.642838716506958, "reward_std": 0.7906100749969482, "rewards/evaluation_direction_reward/mean": 0.921875, "rewards/evaluation_direction_reward/std": 0.17111836373806, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.3947916626930237, "rewards/pv_length_reward/std": 0.27491945028305054, "rewards/pv_quality_reward/mean": 0.076171875, "rewards/pv_quality_reward/std": 0.24614867568016052, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9711945056915283, "sampling/importance_sampling_ratio/mean": 1.003018856048584, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7460694313049316, "sampling/sampling_logp_difference/mean": 0.02149534411728382, "step": 204, "step_time": 16.04449862241745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 119.0, "completions/max_terminated_length": 119.0, "completions/mean_length": 90.6953125, "completions/mean_terminated_length": 90.6953125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.5039780214428902, "epoch": 0.164, "frac_reward_zero_std": 0.6875, "grad_norm": 0.8517963290214539, "learning_rate": 9.927158273036624e-06, "loss": 0.0052, "num_tokens": 8051263.0, "reward": 3.541796922683716, "reward_std": 0.5308917760848999, "rewards/evaluation_direction_reward/mean": 0.91796875, "rewards/evaluation_direction_reward/std": 0.1805213838815689, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.125, "rewards/move_legality_reward/std": 0.3320184051990509, "rewards/pv_length_reward/mean": 0.46562501788139343, "rewards/pv_length_reward/std": 0.22043214738368988, "rewards/pv_quality_reward/mean": 0.033203125, "rewards/pv_quality_reward/std": 0.13075754046440125, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.6745927333831787, "sampling/importance_sampling_ratio/mean": 1.029658317565918, "sampling/importance_sampling_ratio/min": 0.3531336784362793, "sampling/sampling_logp_difference/max": 0.9518411159515381, "sampling/sampling_logp_difference/mean": 0.021799229085445404, "step": 205, "step_time": 15.957839086651802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/max_terminated_length": 111.0, "completions/mean_length": 89.46875, "completions/mean_terminated_length": 89.46875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.48822643607854843, "epoch": 0.1648, "frac_reward_zero_std": 0.6875, "grad_norm": 0.8201076984405518, "learning_rate": 9.926444148540394e-06, "loss": -0.0079, "num_tokens": 8081283.0, "reward": 3.77487850189209, "reward_std": 0.5926098227500916, "rewards/evaluation_direction_reward/mean": 0.849609375, "rewards/evaluation_direction_reward/std": 0.3063305914402008, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.4375, "rewards/move_legality_reward/std": 0.49802759289741516, "rewards/pv_length_reward/mean": 0.347222238779068, "rewards/pv_length_reward/std": 0.14107036590576172, "rewards/pv_quality_reward/mean": 0.140625, "rewards/pv_quality_reward/std": 0.30698880553245544, "rewards/verbosity_reward/mean": 0.9999218583106995, "rewards/verbosity_reward/std": 0.0008838826324790716, "sampling/importance_sampling_ratio/max": 2.507277727127075, "sampling/importance_sampling_ratio/mean": 1.005576252937317, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7485520839691162, "sampling/sampling_logp_difference/mean": 0.020821532234549522, "step": 206, "step_time": 16.649166151881218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 93.6640625, "completions/mean_terminated_length": 93.6640625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.5039138104766607, "epoch": 0.1656, "frac_reward_zero_std": 0.625, "grad_norm": 1.087353229522705, "learning_rate": 9.925726566476021e-06, "loss": -0.0491, "num_tokens": 8112104.0, "reward": 3.6900856494903564, "reward_std": 0.6850990653038025, "rewards/evaluation_direction_reward/mean": 0.865234375, "rewards/evaluation_direction_reward/std": 0.19314619898796082, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3125, "rewards/move_legality_reward/std": 0.4653336703777313, "rewards/pv_length_reward/mean": 0.4186011850833893, "rewards/pv_length_reward/std": 0.20587237179279327, "rewards/pv_quality_reward/mean": 0.09375, "rewards/pv_quality_reward/std": 0.24901379644870758, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.872396469116211, "sampling/importance_sampling_ratio/mean": 1.0431087017059326, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6375930309295654, "sampling/sampling_logp_difference/mean": 0.02164786122739315, "step": 207, "step_time": 16.10090158134699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 94.6328125, "completions/mean_terminated_length": 94.6328125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.5144986640661955, "epoch": 0.1664, "frac_reward_zero_std": 0.9375, "grad_norm": 0.30063652992248535, "learning_rate": 9.925005527347132e-06, "loss": 0.001, "num_tokens": 8142833.0, "reward": 3.688746213912964, "reward_std": 0.5201935768127441, "rewards/evaluation_direction_reward/mean": 0.865234375, "rewards/evaluation_direction_reward/std": 0.2804354429244995, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3125, "rewards/move_legality_reward/std": 0.4653336703777313, "rewards/pv_length_reward/mean": 0.38601189851760864, "rewards/pv_length_reward/std": 0.27515271306037903, "rewards/pv_quality_reward/mean": 0.125, "rewards/pv_quality_reward/std": 0.3320184051990509, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.434133291244507, "sampling/importance_sampling_ratio/mean": 0.9665261507034302, "sampling/importance_sampling_ratio/min": 0.2090747356414795, "sampling/sampling_logp_difference/max": 0.5971935987472534, "sampling/sampling_logp_difference/mean": 0.021818852052092552, "step": 208, "step_time": 16.77495603263378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 94.4765625, "completions/mean_terminated_length": 94.4765625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.5171906352043152, "epoch": 0.1672, "frac_reward_zero_std": 0.75, "grad_norm": 0.622795581817627, "learning_rate": 9.92428103165978e-06, "loss": -0.0012, "num_tokens": 8173454.0, "reward": 3.8497581481933594, "reward_std": 0.6376062035560608, "rewards/evaluation_direction_reward/mean": 0.783203125, "rewards/evaluation_direction_reward/std": 0.334044486284256, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.4921875, "rewards/move_legality_reward/std": 0.5019033551216125, "rewards/pv_length_reward/mean": 0.31264883279800415, "rewards/pv_length_reward/std": 0.1751149743795395, "rewards/pv_quality_reward/mean": 0.26171875, "rewards/pv_quality_reward/std": 0.4112820625305176, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9946844577789307, "sampling/importance_sampling_ratio/mean": 0.9439754486083984, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9635217189788818, "sampling/sampling_logp_difference/mean": 0.021724293008446693, "step": 209, "step_time": 18.012024991214275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 95.453125, "completions/mean_terminated_length": 95.453125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.5244055651128292, "epoch": 0.168, "frac_reward_zero_std": 0.8125, "grad_norm": 0.789767324924469, "learning_rate": 9.923553079922443e-06, "loss": 0.0144, "num_tokens": 8203872.0, "reward": 3.6672122478485107, "reward_std": 0.6640348434448242, "rewards/evaluation_direction_reward/mean": 0.90625, "rewards/evaluation_direction_reward/std": 0.2735016345977783, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3125, "rewards/move_legality_reward/std": 0.4653336703777313, "rewards/pv_length_reward/mean": 0.3547123074531555, "rewards/pv_length_reward/std": 0.2549729347229004, "rewards/pv_quality_reward/mean": 0.09375, "rewards/pv_quality_reward/std": 0.26435181498527527, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.93829083442688, "sampling/importance_sampling_ratio/mean": 0.9790071249008179, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6249966621398926, "sampling/sampling_logp_difference/mean": 0.021024473011493683, "step": 210, "step_time": 16.156775034964085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 96.453125, "completions/mean_terminated_length": 96.453125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.5348669830709696, "epoch": 0.1688, "frac_reward_zero_std": 0.75, "grad_norm": 0.6640205979347229, "learning_rate": 9.922821672646028e-06, "loss": 0.0245, "num_tokens": 8234826.0, "reward": 3.5893726348876953, "reward_std": 0.5344425439834595, "rewards/evaluation_direction_reward/mean": 0.87890625, "rewards/evaluation_direction_reward/std": 0.2326337844133377, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1875, "rewards/move_legality_reward/std": 0.39184603095054626, "rewards/pv_length_reward/mean": 0.44484126567840576, "rewards/pv_length_reward/std": 0.20539617538452148, "rewards/pv_quality_reward/mean": 0.078125, "rewards/pv_quality_reward/std": 0.24653105437755585, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8124938011169434, "sampling/importance_sampling_ratio/mean": 0.9743281602859497, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.853367805480957, "sampling/sampling_logp_difference/mean": 0.023874787613749504, "step": 211, "step_time": 17.325993083417416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 98.1875, "completions/mean_terminated_length": 98.1875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.5394844003021717, "epoch": 0.1696, "frac_reward_zero_std": 0.75, "grad_norm": 0.6403343081474304, "learning_rate": 9.922086810343862e-06, "loss": 0.006, "num_tokens": 8265682.0, "reward": 3.4386720657348633, "reward_std": 0.4741615056991577, "rewards/evaluation_direction_reward/mean": 0.912109375, "rewards/evaluation_direction_reward/std": 0.22751188278198242, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.0703125, "rewards/move_legality_reward/std": 0.2566775679588318, "rewards/pv_length_reward/mean": 0.42500001192092896, "rewards/pv_length_reward/std": 0.2696681320667267, "rewards/pv_quality_reward/mean": 0.03125, "rewards/pv_quality_reward/std": 0.12150629609823227, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7980782985687256, "sampling/importance_sampling_ratio/mean": 1.0253280401229858, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.8578438758850098, "sampling/sampling_logp_difference/mean": 0.02328907512128353, "step": 212, "step_time": 16.717887543141842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 96.65625, "completions/mean_terminated_length": 96.65625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.5275638401508331, "epoch": 0.1704, "frac_reward_zero_std": 0.75, "grad_norm": 0.6172040104866028, "learning_rate": 9.921348493531701e-06, "loss": 0.005, "num_tokens": 8296142.0, "reward": 3.845170021057129, "reward_std": 0.7909978628158569, "rewards/evaluation_direction_reward/mean": 0.888671875, "rewards/evaluation_direction_reward/std": 0.26769760251045227, "rewards/format_reward/mean": 0.995312511920929, "rewards/format_reward/std": 0.0373520702123642, "rewards/move_legality_reward/mean": 0.3671875, "rewards/move_legality_reward/std": 0.4839322865009308, "rewards/pv_length_reward/mean": 0.4768105447292328, "rewards/pv_length_reward/std": 0.2662941515445709, "rewards/pv_quality_reward/mean": 0.1171875, "rewards/pv_quality_reward/std": 0.322907418012619, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.707052707672119, "sampling/importance_sampling_ratio/mean": 0.9242091178894043, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6765792369842529, "sampling/sampling_logp_difference/mean": 0.02197238989174366, "step": 213, "step_time": 17.348352655768394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 99.7890625, "completions/mean_terminated_length": 99.7890625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.5509024113416672, "epoch": 0.1712, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7587373852729797, "learning_rate": 9.920606722727726e-06, "loss": 0.0124, "num_tokens": 8327227.0, "reward": 3.3787946701049805, "reward_std": 0.4805954098701477, "rewards/evaluation_direction_reward/mean": 0.828125, "rewards/evaluation_direction_reward/std": 0.32866647839546204, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.125, "rewards/move_legality_reward/std": 0.3320184051990509, "rewards/pv_length_reward/mean": 0.4178571403026581, "rewards/pv_length_reward/std": 0.21770510077476501, "rewards/pv_quality_reward/mean": 0.0078125, "rewards/pv_quality_reward/std": 0.04366907477378845, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9531168937683105, "sampling/importance_sampling_ratio/mean": 1.017535924911499, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5492435693740845, "sampling/sampling_logp_difference/mean": 0.021590886637568474, "step": 214, "step_time": 17.27706579118967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 100.625, "completions/mean_terminated_length": 100.625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.5348014160990715, "epoch": 0.172, "frac_reward_zero_std": 0.75, "grad_norm": 0.5904310345649719, "learning_rate": 9.919861498452538e-06, "loss": -0.0123, "num_tokens": 8358763.0, "reward": 3.6585936546325684, "reward_std": 0.714840829372406, "rewards/evaluation_direction_reward/mean": 0.865234375, "rewards/evaluation_direction_reward/std": 0.333353191614151, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3125, "rewards/move_legality_reward/std": 0.4653336703777313, "rewards/pv_length_reward/mean": 0.4125000238418579, "rewards/pv_length_reward/std": 0.15735121071338654, "rewards/pv_quality_reward/mean": 0.068359375, "rewards/pv_quality_reward/std": 0.24445593357086182, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.547579288482666, "sampling/importance_sampling_ratio/mean": 0.9653394222259521, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7246589660644531, "sampling/sampling_logp_difference/mean": 0.021658703684806824, "step": 215, "step_time": 17.36807904392481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 101.75, "completions/mean_terminated_length": 101.75, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.5507263503968716, "epoch": 0.1728, "frac_reward_zero_std": 0.75, "grad_norm": 0.4616512954235077, "learning_rate": 9.919112821229165e-06, "loss": -0.0043, "num_tokens": 8390467.0, "reward": 3.693880081176758, "reward_std": 0.5698492527008057, "rewards/evaluation_direction_reward/mean": 0.828125, "rewards/evaluation_direction_reward/std": 0.25823065638542175, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.375, "rewards/move_legality_reward/std": 0.4860251843929291, "rewards/pv_length_reward/mean": 0.39895832538604736, "rewards/pv_length_reward/std": 0.20152254402637482, "rewards/pv_quality_reward/mean": 0.091796875, "rewards/pv_quality_reward/std": 0.2467726618051529, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8398115634918213, "sampling/importance_sampling_ratio/mean": 0.9068424105644226, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6736569404602051, "sampling/sampling_logp_difference/mean": 0.02222319506108761, "step": 216, "step_time": 17.847810193896294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 103.875, "completions/mean_terminated_length": 103.875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.5457164905965328, "epoch": 0.1736, "frac_reward_zero_std": 0.875, "grad_norm": 0.34486714005470276, "learning_rate": 9.918360691583056e-06, "loss": -0.0037, "num_tokens": 8422291.0, "reward": 3.53908109664917, "reward_std": 0.5829433798789978, "rewards/evaluation_direction_reward/mean": 0.900390625, "rewards/evaluation_direction_reward/std": 0.26683446764945984, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.125, "rewards/move_legality_reward/std": 0.3320184051990509, "rewards/pv_length_reward/mean": 0.4511904716491699, "rewards/pv_length_reward/std": 0.19041243195533752, "rewards/pv_quality_reward/mean": 0.0625, "rewards/pv_quality_reward/std": 0.24301259219646454, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7468502521514893, "sampling/importance_sampling_ratio/mean": 0.970695436000824, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6272335052490234, "sampling/sampling_logp_difference/mean": 0.023252813145518303, "step": 217, "step_time": 17.160901993513107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 105.015625, "completions/mean_terminated_length": 105.015625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.5661472268402576, "epoch": 0.1744, "frac_reward_zero_std": 0.8125, "grad_norm": 0.5852113962173462, "learning_rate": 9.917605110042084e-06, "loss": -0.0017, "num_tokens": 8454405.0, "reward": 3.343161106109619, "reward_std": 0.44713273644447327, "rewards/evaluation_direction_reward/mean": 0.787109375, "rewards/evaluation_direction_reward/std": 0.36464017629623413, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.0, "rewards/move_legality_reward/std": 0.0, "rewards/pv_length_reward/mean": 0.5560516119003296, "rewards/pv_length_reward/std": 0.21081416308879852, "rewards/pv_quality_reward/mean": 0.0, "rewards/pv_quality_reward/std": 0.0, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.419389009475708, "sampling/importance_sampling_ratio/mean": 0.9424474239349365, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6272950172424316, "sampling/sampling_logp_difference/mean": 0.02266862615942955, "step": 218, "step_time": 16.840833336114883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 104.609375, "completions/mean_terminated_length": 104.609375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.5419118329882622, "epoch": 0.1752, "frac_reward_zero_std": 0.875, "grad_norm": 0.4296424388885498, "learning_rate": 9.916846077136548e-06, "loss": -0.0312, "num_tokens": 8486651.0, "reward": 3.2838728427886963, "reward_std": 0.5016101002693176, "rewards/evaluation_direction_reward/mean": 0.712890625, "rewards/evaluation_direction_reward/std": 0.36598730087280273, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.125, "rewards/move_legality_reward/std": 0.3320184051990509, "rewards/pv_length_reward/mean": 0.4459821581840515, "rewards/pv_length_reward/std": 0.21256396174430847, "rewards/pv_quality_reward/mean": 0.0, "rewards/pv_quality_reward/std": 0.0, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.955223798751831, "sampling/importance_sampling_ratio/mean": 0.9473031759262085, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6898369789123535, "sampling/sampling_logp_difference/mean": 0.021836213767528534, "step": 219, "step_time": 17.481353372335434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 107.0546875, "completions/mean_terminated_length": 107.0546875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.5428693778812885, "epoch": 0.176, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5752185583114624, "learning_rate": 9.916083593399167e-06, "loss": -0.0182, "num_tokens": 8519210.0, "reward": 3.337462902069092, "reward_std": 0.5402758717536926, "rewards/evaluation_direction_reward/mean": 0.77734375, "rewards/evaluation_direction_reward/std": 0.33236563205718994, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.125, "rewards/move_legality_reward/std": 0.3320184051990509, "rewards/pv_length_reward/mean": 0.40386903285980225, "rewards/pv_length_reward/std": 0.15060095489025116, "rewards/pv_quality_reward/mean": 0.03125, "rewards/pv_quality_reward/std": 0.12150629609823227, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9815945625305176, "sampling/importance_sampling_ratio/mean": 1.0034712553024292, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6985692977905273, "sampling/sampling_logp_difference/mean": 0.021637869998812675, "step": 220, "step_time": 17.28901708871126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 106.015625, "completions/mean_terminated_length": 106.015625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.5540709793567657, "epoch": 0.1768, "frac_reward_zero_std": 0.75, "grad_norm": 0.5444409847259521, "learning_rate": 9.915317659365078e-06, "loss": -0.0014, "num_tokens": 8551116.0, "reward": 3.7242929935455322, "reward_std": 0.6107851266860962, "rewards/evaluation_direction_reward/mean": 0.87109375, "rewards/evaluation_direction_reward/std": 0.23050862550735474, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3125, "rewards/move_legality_reward/std": 0.4653336703777313, "rewards/pv_length_reward/mean": 0.42351192235946655, "rewards/pv_length_reward/std": 0.24769096076488495, "rewards/pv_quality_reward/mean": 0.1171875, "rewards/pv_quality_reward/std": 0.27338916063308716, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5652289390563965, "sampling/importance_sampling_ratio/mean": 0.970901608467102, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6623623371124268, "sampling/sampling_logp_difference/mean": 0.021838149055838585, "step": 221, "step_time": 17.982603691518307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 107.078125, "completions/mean_terminated_length": 107.078125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.5634807497262955, "epoch": 0.1776, "frac_reward_zero_std": 0.75, "grad_norm": 0.45823371410369873, "learning_rate": 9.914548275571845e-06, "loss": -0.0149, "num_tokens": 8583398.0, "reward": 3.3531250953674316, "reward_std": 0.6292823553085327, "rewards/evaluation_direction_reward/mean": 0.74609375, "rewards/evaluation_direction_reward/std": 0.3121306896209717, "rewards/format_reward/mean": 0.999218761920929, "rewards/format_reward/std": 0.008838837035000324, "rewards/move_legality_reward/mean": 0.1875, "rewards/move_legality_reward/std": 0.39184603095054626, "rewards/pv_length_reward/mean": 0.3656249940395355, "rewards/pv_length_reward/std": 0.19854843616485596, "rewards/pv_quality_reward/mean": 0.0625, "rewards/pv_quality_reward/std": 0.24301259219646454, "rewards/verbosity_reward/mean": 0.9921875, "rewards/verbosity_reward/std": 0.0883883461356163, "sampling/importance_sampling_ratio/max": 2.8845183849334717, "sampling/importance_sampling_ratio/mean": 0.9778324365615845, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4928274154663086, "sampling/sampling_logp_difference/mean": 0.022778522223234177, "step": 222, "step_time": 17.625556766986847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 107.0546875, "completions/mean_terminated_length": 107.0546875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.5610363371670246, "epoch": 0.1784, "frac_reward_zero_std": 0.8125, "grad_norm": 0.4998296797275543, "learning_rate": 9.913775442559451e-06, "loss": 0.0026, "num_tokens": 8615397.0, "reward": 3.9262547492980957, "reward_std": 0.7713082432746887, "rewards/evaluation_direction_reward/mean": 0.9921875, "rewards/evaluation_direction_reward/std": 0.05377025529742241, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.375, "rewards/move_legality_reward/std": 0.4860251843929291, "rewards/pv_length_reward/mean": 0.4379734992980957, "rewards/pv_length_reward/std": 0.2215777188539505, "rewards/pv_quality_reward/mean": 0.12109375, "rewards/pv_quality_reward/std": 0.32297882437705994, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0058209896087646, "sampling/importance_sampling_ratio/mean": 0.9095339179039001, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7441568374633789, "sampling/sampling_logp_difference/mean": 0.02172688953578472, "step": 223, "step_time": 17.09670104086399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 106.1484375, "completions/mean_terminated_length": 106.1484375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.544129341840744, "epoch": 0.1792, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5025683045387268, "learning_rate": 9.9129991608703e-06, "loss": -0.0016, "num_tokens": 8647568.0, "reward": 3.701599597930908, "reward_std": 0.7067708373069763, "rewards/evaluation_direction_reward/mean": 0.865234375, "rewards/evaluation_direction_reward/std": 0.22818684577941895, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.375, "rewards/move_legality_reward/std": 0.4860251843929291, "rewards/pv_length_reward/mean": 0.31488096714019775, "rewards/pv_length_reward/std": 0.11445900052785873, "rewards/pv_quality_reward/mean": 0.146484375, "rewards/pv_quality_reward/std": 0.29351142048835754, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8792762756347656, "sampling/importance_sampling_ratio/mean": 0.941304087638855, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7785123586654663, "sampling/sampling_logp_difference/mean": 0.022411376237869263, "step": 224, "step_time": 17.446232602000237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 105.859375, "completions/mean_terminated_length": 105.859375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.5598696358501911, "epoch": 0.18, "frac_reward_zero_std": 0.625, "grad_norm": 0.607205867767334, "learning_rate": 9.912219431049217e-06, "loss": -0.0373, "num_tokens": 8679646.0, "reward": 3.622079849243164, "reward_std": 0.5849727392196655, "rewards/evaluation_direction_reward/mean": 0.86328125, "rewards/evaluation_direction_reward/std": 0.25070643424987793, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3125, "rewards/move_legality_reward/std": 0.4653336703777313, "rewards/pv_length_reward/mean": 0.38184523582458496, "rewards/pv_length_reward/std": 0.17860430479049683, "rewards/pv_quality_reward/mean": 0.064453125, "rewards/pv_quality_reward/std": 0.22458644211292267, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.953274726867676, "sampling/importance_sampling_ratio/mean": 0.9860661029815674, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5733137130737305, "sampling/sampling_logp_difference/mean": 0.022326741367578506, "step": 225, "step_time": 17.872941225767136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 105.6328125, "completions/mean_terminated_length": 105.6328125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.549843791872263, "epoch": 0.1808, "frac_reward_zero_std": 0.625, "grad_norm": 0.8182983994483948, "learning_rate": 9.911436253643445e-06, "loss": -0.049, "num_tokens": 8711639.0, "reward": 3.6700706481933594, "reward_std": 0.7156980037689209, "rewards/evaluation_direction_reward/mean": 0.8125, "rewards/evaluation_direction_reward/std": 0.2943028509616852, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3671875, "rewards/move_legality_reward/std": 0.4839322865009308, "rewards/pv_length_reward/mean": 0.39077380299568176, "rewards/pv_length_reward/std": 0.17619073390960693, "rewards/pv_quality_reward/mean": 0.099609375, "rewards/pv_quality_reward/std": 0.2931968867778778, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8845911026000977, "sampling/importance_sampling_ratio/mean": 0.9718968272209167, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.526186466217041, "sampling/sampling_logp_difference/mean": 0.021993009373545647, "step": 226, "step_time": 17.634965158998966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 105.109375, "completions/mean_terminated_length": 105.109375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.5470160245895386, "epoch": 0.1816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.910649629202648e-06, "loss": 0.0, "num_tokens": 8743661.0, "reward": 3.5677084922790527, "reward_std": 0.421365886926651, "rewards/evaluation_direction_reward/mean": 0.9375, "rewards/evaluation_direction_reward/std": 0.14030338823795319, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.3645833432674408, "rewards/pv_length_reward/std": 0.17041712999343872, "rewards/pv_quality_reward/mean": 0.015625, "rewards/pv_quality_reward/std": 0.060753148049116135, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.867856979370117, "sampling/importance_sampling_ratio/mean": 1.0211164951324463, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7300472259521484, "sampling/sampling_logp_difference/mean": 0.02228063903748989, "step": 227, "step_time": 18.083039298653603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 105.2578125, "completions/mean_terminated_length": 105.2578125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.5500266253948212, "epoch": 0.1824, "frac_reward_zero_std": 0.875, "grad_norm": 0.37441495060920715, "learning_rate": 9.90985955827891e-06, "loss": -0.0092, "num_tokens": 8775750.0, "reward": 3.6420016288757324, "reward_std": 0.5908427238464355, "rewards/evaluation_direction_reward/mean": 0.771484375, "rewards/evaluation_direction_reward/std": 0.33938828110694885, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3125, "rewards/move_legality_reward/std": 0.4653336703777313, "rewards/pv_length_reward/mean": 0.43497025966644287, "rewards/pv_length_reward/std": 0.17611540853977203, "rewards/pv_quality_reward/mean": 0.123046875, "rewards/pv_quality_reward/std": 0.2661997377872467, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8604187965393066, "sampling/importance_sampling_ratio/mean": 0.8823824524879456, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9216549396514893, "sampling/sampling_logp_difference/mean": 0.02282278798520565, "step": 228, "step_time": 17.949894800782204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 103.7421875, "completions/mean_terminated_length": 103.7421875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.5484180822968483, "epoch": 0.1832, "frac_reward_zero_std": 0.75, "grad_norm": 0.5977399945259094, "learning_rate": 9.909066041426733e-06, "loss": 0.0136, "num_tokens": 8807237.0, "reward": 3.7017672061920166, "reward_std": 0.6043641567230225, "rewards/evaluation_direction_reward/mean": 0.90234375, "rewards/evaluation_direction_reward/std": 0.2494765669107437, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.375, "rewards/move_legality_reward/std": 0.4860251843929291, "rewards/pv_length_reward/mean": 0.31309524178504944, "rewards/pv_length_reward/std": 0.18074505031108856, "rewards/pv_quality_reward/mean": 0.111328125, "rewards/pv_quality_reward/std": 0.28027087450027466, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7084922790527344, "sampling/importance_sampling_ratio/mean": 0.930148184299469, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9853119850158691, "sampling/sampling_logp_difference/mean": 0.022082868963479996, "step": 229, "step_time": 17.498957507312298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 102.9453125, "completions/mean_terminated_length": 102.9453125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.5432911086827517, "epoch": 0.184, "frac_reward_zero_std": 0.75, "grad_norm": 0.48023471236228943, "learning_rate": 9.908269079203039e-06, "loss": -0.0228, "num_tokens": 8838630.0, "reward": 3.602827548980713, "reward_std": 0.634362518787384, "rewards/evaluation_direction_reward/mean": 0.875, "rewards/evaluation_direction_reward/std": 0.24901379644870758, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3125, "rewards/move_legality_reward/std": 0.4653336703777313, "rewards/pv_length_reward/mean": 0.36845237016677856, "rewards/pv_length_reward/std": 0.183469757437706, "rewards/pv_quality_reward/mean": 0.046875, "rewards/pv_quality_reward/std": 0.1822594404220581, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5040571689605713, "sampling/importance_sampling_ratio/mean": 0.9936805963516235, "sampling/importance_sampling_ratio/min": 0.2685393989086151, "sampling/sampling_logp_difference/max": 0.49825477600097656, "sampling/sampling_logp_difference/mean": 0.02144652046263218, "step": 230, "step_time": 17.646124571561813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 104.734375, "completions/mean_terminated_length": 104.734375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.5454355180263519, "epoch": 0.1848, "frac_reward_zero_std": 0.8125, "grad_norm": 0.775268018245697, "learning_rate": 9.907468672167165e-06, "loss": 0.0067, "num_tokens": 8870516.0, "reward": 3.692392110824585, "reward_std": 0.6889772415161133, "rewards/evaluation_direction_reward/mean": 0.935546875, "rewards/evaluation_direction_reward/std": 0.1637590378522873, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3125, "rewards/move_legality_reward/std": 0.4653336703777313, "rewards/pv_length_reward/mean": 0.35059526562690735, "rewards/pv_length_reward/std": 0.20770162343978882, "rewards/pv_quality_reward/mean": 0.09375, "rewards/pv_quality_reward/std": 0.26435181498527527, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.4302053451538086, "sampling/importance_sampling_ratio/mean": 0.9272066354751587, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.868140697479248, "sampling/sampling_logp_difference/mean": 0.0236398633569479, "step": 231, "step_time": 17.65247306972742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 102.7890625, "completions/mean_terminated_length": 102.7890625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.5386176072061062, "epoch": 0.1856, "frac_reward_zero_std": 0.75, "grad_norm": 0.5816819071769714, "learning_rate": 9.906664820880869e-06, "loss": -0.0039, "num_tokens": 8902217.0, "reward": 3.717317581176758, "reward_std": 0.6313873529434204, "rewards/evaluation_direction_reward/mean": 0.791015625, "rewards/evaluation_direction_reward/std": 0.2955998480319977, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.375, "rewards/move_legality_reward/std": 0.4860251843929291, "rewards/pv_length_reward/mean": 0.44583335518836975, "rewards/pv_length_reward/std": 0.17302821576595306, "rewards/pv_quality_reward/mean": 0.10546875, "rewards/pv_quality_reward/std": 0.28686702251434326, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7030162811279297, "sampling/importance_sampling_ratio/mean": 0.9872941374778748, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8111996650695801, "sampling/sampling_logp_difference/mean": 0.023395167663693428, "step": 232, "step_time": 18.09365525841713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 100.53125, "completions/mean_terminated_length": 100.53125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.5466305650770664, "epoch": 0.1864, "frac_reward_zero_std": 0.75, "grad_norm": 0.5701033473014832, "learning_rate": 9.905857525908322e-06, "loss": 0.0023, "num_tokens": 8933637.0, "reward": 3.3897323608398438, "reward_std": 0.4459342360496521, "rewards/evaluation_direction_reward/mean": 0.796875, "rewards/evaluation_direction_reward/std": 0.2278580516576767, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.125, "rewards/move_legality_reward/std": 0.3320184051990509, "rewards/pv_length_reward/mean": 0.43660715222358704, "rewards/pv_length_reward/std": 0.17498065531253815, "rewards/pv_quality_reward/mean": 0.03125, "rewards/pv_quality_reward/std": 0.12150629609823227, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.385957956314087, "sampling/importance_sampling_ratio/mean": 0.9988860487937927, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6484885215759277, "sampling/sampling_logp_difference/mean": 0.022515293210744858, "step": 233, "step_time": 16.707654118537903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 99.375, "completions/mean_terminated_length": 99.375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.5163170322775841, "epoch": 0.1872, "frac_reward_zero_std": 0.9375, "grad_norm": 0.29670703411102295, "learning_rate": 9.905046787816118e-06, "loss": -0.0007, "num_tokens": 8965053.0, "reward": 3.4569196701049805, "reward_std": 0.589154064655304, "rewards/evaluation_direction_reward/mean": 0.7265625, "rewards/evaluation_direction_reward/std": 0.38351550698280334, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.125, "rewards/move_legality_reward/std": 0.3320184051990509, "rewards/pv_length_reward/mean": 0.5428571701049805, "rewards/pv_length_reward/std": 0.20041558146476746, "rewards/pv_quality_reward/mean": 0.0625, "rewards/pv_quality_reward/std": 0.24301259219646454, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9123666286468506, "sampling/importance_sampling_ratio/mean": 0.9777878522872925, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.040832757949829, "sampling/sampling_logp_difference/mean": 0.022633792832493782, "step": 234, "step_time": 16.812264159321785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 100.515625, "completions/mean_terminated_length": 100.515625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.5231474712491035, "epoch": 0.188, "frac_reward_zero_std": 0.8125, "grad_norm": 0.45764613151550293, "learning_rate": 9.904232607173262e-06, "loss": 0.0126, "num_tokens": 8996583.0, "reward": 3.5699572563171387, "reward_std": 0.4891619086265564, "rewards/evaluation_direction_reward/mean": 0.82421875, "rewards/evaluation_direction_reward/std": 0.22041338682174683, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1875, "rewards/move_legality_reward/std": 0.39184603095054626, "rewards/pv_length_reward/mean": 0.4488636255264282, "rewards/pv_length_reward/std": 0.17795588076114655, "rewards/pv_quality_reward/mean": 0.109375, "rewards/pv_quality_reward/std": 0.2657443881034851, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.935485363006592, "sampling/importance_sampling_ratio/mean": 0.9827637672424316, "sampling/importance_sampling_ratio/min": 0.1636374294757843, "sampling/sampling_logp_difference/max": 0.9370012283325195, "sampling/sampling_logp_difference/mean": 0.021892964839935303, "step": 235, "step_time": 16.772784367203712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 99.578125, "completions/mean_terminated_length": 99.578125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.518452350050211, "epoch": 0.1888, "frac_reward_zero_std": 0.875, "grad_norm": 0.42984098196029663, "learning_rate": 9.903414984551178e-06, "loss": 0.0047, "num_tokens": 9027913.0, "reward": 3.80078125, "reward_std": 0.46886834502220154, "rewards/evaluation_direction_reward/mean": 0.94140625, "rewards/evaluation_direction_reward/std": 0.1384829729795456, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.4375, "rewards/move_legality_reward/std": 0.49802759289741516, "rewards/pv_length_reward/mean": 0.375, "rewards/pv_length_reward/std": 0.18310055136680603, "rewards/pv_quality_reward/mean": 0.046875, "rewards/pv_quality_reward/std": 0.1822594404220581, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7136456966400146, "sampling/importance_sampling_ratio/mean": 1.0023396015167236, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9012866020202637, "sampling/sampling_logp_difference/mean": 0.020886529237031937, "step": 236, "step_time": 17.180096842348576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 99.96875, "completions/mean_terminated_length": 99.96875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.5331674627959728, "epoch": 0.1896, "frac_reward_zero_std": 0.75, "grad_norm": 0.5375129580497742, "learning_rate": 9.902593920523706e-06, "loss": 0.0004, "num_tokens": 9059541.0, "reward": 4.051723480224609, "reward_std": 0.8397536873817444, "rewards/evaluation_direction_reward/mean": 0.92578125, "rewards/evaluation_direction_reward/std": 0.2457500398159027, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.5625, "rewards/move_legality_reward/std": 0.49802759289741516, "rewards/pv_length_reward/mean": 0.4071924686431885, "rewards/pv_length_reward/std": 0.22580036520957947, "rewards/pv_quality_reward/mean": 0.15625, "rewards/pv_quality_reward/std": 0.3275415599346161, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5171000957489014, "sampling/importance_sampling_ratio/mean": 1.0253654718399048, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7794666290283203, "sampling/sampling_logp_difference/mean": 0.022065196186304092, "step": 237, "step_time": 17.3582820892334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 98.375, "completions/mean_terminated_length": 98.375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.5192316267639399, "epoch": 0.1904, "frac_reward_zero_std": 0.875, "grad_norm": 0.538259744644165, "learning_rate": 9.9017694156671e-06, "loss": 0.0138, "num_tokens": 9090333.0, "reward": 3.6780219078063965, "reward_std": 0.7435531616210938, "rewards/evaluation_direction_reward/mean": 0.9140625, "rewards/evaluation_direction_reward/std": 0.17868036031723022, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.4260687232017517, "rewards/pv_length_reward/std": 0.263492614030838, "rewards/pv_quality_reward/mean": 0.087890625, "rewards/pv_quality_reward/std": 0.25792524218559265, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5736887454986572, "sampling/importance_sampling_ratio/mean": 1.022862195968628, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6712779998779297, "sampling/sampling_logp_difference/mean": 0.022090081125497818, "step": 238, "step_time": 16.764518104493618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 98.90625, "completions/mean_terminated_length": 98.90625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.5262427255511284, "epoch": 0.1912, "frac_reward_zero_std": 0.8125, "grad_norm": 0.46122536063194275, "learning_rate": 9.900941470560025e-06, "loss": 0.0104, "num_tokens": 9121489.0, "reward": 3.489732265472412, "reward_std": 0.49739760160446167, "rewards/evaluation_direction_reward/mean": 0.84375, "rewards/evaluation_direction_reward/std": 0.23685936629772186, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1875, "rewards/move_legality_reward/std": 0.39184603095054626, "rewards/pv_length_reward/mean": 0.44285714626312256, "rewards/pv_length_reward/std": 0.23563633859157562, "rewards/pv_quality_reward/mean": 0.015625, "rewards/pv_quality_reward/std": 0.060753148049116135, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7481682300567627, "sampling/importance_sampling_ratio/mean": 1.0751875638961792, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7343654632568359, "sampling/sampling_logp_difference/mean": 0.022898102179169655, "step": 239, "step_time": 16.783232107758522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 97.765625, "completions/mean_terminated_length": 97.765625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.5019750501960516, "epoch": 0.192, "frac_reward_zero_std": 0.875, "grad_norm": 0.398515909910202, "learning_rate": 9.900110085783573e-06, "loss": -0.0131, "num_tokens": 9152355.0, "reward": 3.644841194152832, "reward_std": 0.720064640045166, "rewards/evaluation_direction_reward/mean": 0.828125, "rewards/evaluation_direction_reward/std": 0.3404345214366913, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.375, "rewards/move_legality_reward/std": 0.4860251843929291, "rewards/pv_length_reward/mean": 0.3167162835597992, "rewards/pv_length_reward/std": 0.17248709499835968, "rewards/pv_quality_reward/mean": 0.125, "rewards/pv_quality_reward/std": 0.3320184051990509, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8477513790130615, "sampling/importance_sampling_ratio/mean": 0.9344274401664734, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8782403469085693, "sampling/sampling_logp_difference/mean": 0.02126310020685196, "step": 240, "step_time": 16.829302176833153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 95.390625, "completions/mean_terminated_length": 95.390625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.513994213193655, "epoch": 0.1928, "frac_reward_zero_std": 0.8125, "grad_norm": 0.5011602640151978, "learning_rate": 9.899275261921236e-06, "loss": -0.0039, "num_tokens": 9182901.0, "reward": 3.66011905670166, "reward_std": 0.7575767040252686, "rewards/evaluation_direction_reward/mean": 0.9375, "rewards/evaluation_direction_reward/std": 0.1471514254808426, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.41011905670166016, "rewards/pv_length_reward/std": 0.2017926275730133, "rewards/pv_quality_reward/mean": 0.0625, "rewards/pv_quality_reward/std": 0.24301259219646454, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.649271249771118, "sampling/importance_sampling_ratio/mean": 0.9553195238113403, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6489603519439697, "sampling/sampling_logp_difference/mean": 0.0216241292655468, "step": 241, "step_time": 16.796678490936756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 96.6171875, "completions/mean_terminated_length": 96.6171875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.49492721259593964, "epoch": 0.1936, "frac_reward_zero_std": 0.875, "grad_norm": 0.2835954427719116, "learning_rate": 9.898436999558924e-06, "loss": -0.0076, "num_tokens": 9214092.0, "reward": 3.4559521675109863, "reward_std": 0.6267017126083374, "rewards/evaluation_direction_reward/mean": 0.783203125, "rewards/evaluation_direction_reward/std": 0.3384353220462799, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.40907740592956543, "rewards/pv_length_reward/std": 0.21301957964897156, "rewards/pv_quality_reward/mean": 0.013671875, "rewards/pv_quality_reward/std": 0.05706566199660301, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5968315601348877, "sampling/importance_sampling_ratio/mean": 0.9929227828979492, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8135902881622314, "sampling/sampling_logp_difference/mean": 0.02145012468099594, "step": 242, "step_time": 16.947001062333584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 97.0234375, "completions/mean_terminated_length": 97.0234375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.5092323329299688, "epoch": 0.1944, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6082643270492554, "learning_rate": 9.897595299284968e-06, "loss": 0.0096, "num_tokens": 9244975.0, "reward": 3.7793402671813965, "reward_std": 0.582976758480072, "rewards/evaluation_direction_reward/mean": 0.927734375, "rewards/evaluation_direction_reward/std": 0.14428871870040894, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3125, "rewards/move_legality_reward/std": 0.4653336703777313, "rewards/pv_length_reward/mean": 0.4121527671813965, "rewards/pv_length_reward/std": 0.17027267813682556, "rewards/pv_quality_reward/mean": 0.126953125, "rewards/pv_quality_reward/std": 0.26247626543045044, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.4935948848724365, "sampling/importance_sampling_ratio/mean": 0.9253159761428833, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6057093143463135, "sampling/sampling_logp_difference/mean": 0.021331213414669037, "step": 243, "step_time": 17.389711931347847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 94.3359375, "completions/mean_terminated_length": 94.3359375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.5012060906738043, "epoch": 0.1952, "frac_reward_zero_std": 0.8125, "grad_norm": 0.6112009286880493, "learning_rate": 9.8967501616901e-06, "loss": 0.0204, "num_tokens": 9275506.0, "reward": 3.9551711082458496, "reward_std": 0.6389508247375488, "rewards/evaluation_direction_reward/mean": 0.953125, "rewards/evaluation_direction_reward/std": 0.12839867174625397, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.5, "rewards/move_legality_reward/std": 0.5019646286964417, "rewards/pv_length_reward/mean": 0.334077388048172, "rewards/pv_length_reward/std": 0.17164449393749237, "rewards/pv_quality_reward/mean": 0.16796875, "rewards/pv_quality_reward/std": 0.32770583033561707, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.504556179046631, "sampling/importance_sampling_ratio/mean": 0.9268198013305664, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6201486587524414, "sampling/sampling_logp_difference/mean": 0.02096528373658657, "step": 244, "step_time": 17.34862156957388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 95.28125, "completions/mean_terminated_length": 95.28125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.5003610085695982, "epoch": 0.196, "frac_reward_zero_std": 0.75, "grad_norm": 0.46289047598838806, "learning_rate": 9.895901587367473e-06, "loss": -0.0132, "num_tokens": 9306358.0, "reward": 3.729445695877075, "reward_std": 0.6033271551132202, "rewards/evaluation_direction_reward/mean": 0.8828125, "rewards/evaluation_direction_reward/std": 0.2528139054775238, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.375, "rewards/move_legality_reward/std": 0.4860251843929291, "rewards/pv_length_reward/mean": 0.3720238208770752, "rewards/pv_length_reward/std": 0.1930525153875351, "rewards/pv_quality_reward/mean": 0.099609375, "rewards/pv_quality_reward/std": 0.22036978602409363, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.771207809448242, "sampling/importance_sampling_ratio/mean": 0.9497836828231812, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7307982444763184, "sampling/sampling_logp_difference/mean": 0.021479716524481773, "step": 245, "step_time": 16.91723706573248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 96.90625, "completions/mean_terminated_length": 96.90625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.5069093126803637, "epoch": 0.1968, "frac_reward_zero_std": 0.75, "grad_norm": 1.1540782451629639, "learning_rate": 9.89504957691265e-06, "loss": 0.0184, "num_tokens": 9337794.0, "reward": 3.6179873943328857, "reward_std": 0.6151419281959534, "rewards/evaluation_direction_reward/mean": 0.88671875, "rewards/evaluation_direction_reward/std": 0.2659468650817871, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3125, "rewards/move_legality_reward/std": 0.4653336703777313, "rewards/pv_length_reward/mean": 0.33869048953056335, "rewards/pv_length_reward/std": 0.19167135655879974, "rewards/pv_quality_reward/mean": 0.080078125, "rewards/pv_quality_reward/std": 0.24489592015743256, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.863454818725586, "sampling/importance_sampling_ratio/mean": 1.0335344076156616, "sampling/importance_sampling_ratio/min": 0.21266669034957886, "sampling/sampling_logp_difference/max": 0.675990641117096, "sampling/sampling_logp_difference/mean": 0.02164638042449951, "step": 246, "step_time": 17.320264413952827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 94.90625, "completions/mean_terminated_length": 94.90625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.4973294176161289, "epoch": 0.1976, "frac_reward_zero_std": 0.875, "grad_norm": 0.29928863048553467, "learning_rate": 9.894194130923602e-06, "loss": -0.0338, "num_tokens": 9368598.0, "reward": 3.5631511211395264, "reward_std": 0.3979416787624359, "rewards/evaluation_direction_reward/mean": 0.958984375, "rewards/evaluation_direction_reward/std": 0.13594718277454376, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1875, "rewards/move_legality_reward/std": 0.39184603095054626, "rewards/pv_length_reward/mean": 0.4166666865348816, "rewards/pv_length_reward/std": 0.22190703451633453, "rewards/pv_quality_reward/mean": 0.0, "rewards/pv_quality_reward/std": 0.0, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5121893882751465, "sampling/importance_sampling_ratio/mean": 0.8862806558609009, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.732081413269043, "sampling/sampling_logp_difference/mean": 0.021203109994530678, "step": 247, "step_time": 16.628403887152672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 95.8828125, "completions/mean_terminated_length": 95.8828125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.5192444026470184, "epoch": 0.1984, "frac_reward_zero_std": 0.75, "grad_norm": 0.6607003808021545, "learning_rate": 9.893335250000715e-06, "loss": -0.0305, "num_tokens": 9399431.0, "reward": 3.479296922683716, "reward_std": 0.49456697702407837, "rewards/evaluation_direction_reward/mean": 0.884765625, "rewards/evaluation_direction_reward/std": 0.21713584661483765, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.125, "rewards/move_legality_reward/std": 0.3320184051990509, "rewards/pv_length_reward/mean": 0.43437498807907104, "rewards/pv_length_reward/std": 0.1497117429971695, "rewards/pv_quality_reward/mean": 0.03515625, "rewards/pv_quality_reward/std": 0.1244451254606247, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.894920825958252, "sampling/importance_sampling_ratio/mean": 0.9784379601478577, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6023240089416504, "sampling/sampling_logp_difference/mean": 0.022289590910077095, "step": 248, "step_time": 16.947919078171253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 94.71875, "completions/mean_terminated_length": 94.71875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.4919206704944372, "epoch": 0.1992, "frac_reward_zero_std": 0.8125, "grad_norm": 0.5109397768974304, "learning_rate": 9.892472934746784e-06, "loss": 0.0173, "num_tokens": 9430067.0, "reward": 3.7493488788604736, "reward_std": 0.5728984475135803, "rewards/evaluation_direction_reward/mean": 0.908203125, "rewards/evaluation_direction_reward/std": 0.1930665671825409, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.4375, "rewards/move_legality_reward/std": 0.49802759289741516, "rewards/pv_length_reward/mean": 0.3333333134651184, "rewards/pv_length_reward/std": 0.17388565838336945, "rewards/pv_quality_reward/mean": 0.0703125, "rewards/pv_quality_reward/std": 0.2449037730693817, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.600476026535034, "sampling/importance_sampling_ratio/mean": 0.9963136911392212, "sampling/importance_sampling_ratio/min": 0.23234066367149353, "sampling/sampling_logp_difference/max": 0.6850242614746094, "sampling/sampling_logp_difference/mean": 0.021773304790258408, "step": 249, "step_time": 17.740871153771877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 96.609375, "completions/mean_terminated_length": 96.609375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.500329939648509, "epoch": 0.2, "frac_reward_zero_std": 0.75, "grad_norm": 0.4895586669445038, "learning_rate": 9.891607185767018e-06, "loss": -0.0016, "num_tokens": 9461105.0, "reward": 3.284449577331543, "reward_std": 0.5138751864433289, "rewards/evaluation_direction_reward/mean": 0.7734375, "rewards/evaluation_direction_reward/std": 0.3399824798107147, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.125, "rewards/move_legality_reward/std": 0.3320184051990509, "rewards/pv_length_reward/mean": 0.37038689851760864, "rewards/pv_length_reward/std": 0.2020251452922821, "rewards/pv_quality_reward/mean": 0.015625, "rewards/pv_quality_reward/std": 0.060753148049116135, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.878535747528076, "sampling/importance_sampling_ratio/mean": 1.0455741882324219, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.833798885345459, "sampling/sampling_logp_difference/mean": 0.022324781864881516, "step": 250, "step_time": 16.68236730992794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 93.9765625, "completions/mean_terminated_length": 93.9765625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.48797330260276794, "epoch": 0.2008, "frac_reward_zero_std": 0.8125, "grad_norm": 0.5822048783302307, "learning_rate": 9.890738003669029e-06, "loss": -0.0256, "num_tokens": 9491550.0, "reward": 3.7183594703674316, "reward_std": 0.7940555810928345, "rewards/evaluation_direction_reward/mean": 0.896484375, "rewards/evaluation_direction_reward/std": 0.23166494071483612, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.4625000059604645, "rewards/pv_length_reward/std": 0.22382189333438873, "rewards/pv_quality_reward/mean": 0.109375, "rewards/pv_quality_reward/std": 0.2657443881034851, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.6040706634521484, "sampling/importance_sampling_ratio/mean": 0.9859834909439087, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5998554229736328, "sampling/sampling_logp_difference/mean": 0.022045809775590897, "step": 251, "step_time": 16.84421817958355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 96.5859375, "completions/mean_terminated_length": 96.5859375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.49278309755027294, "epoch": 0.2016, "frac_reward_zero_std": 0.8125, "grad_norm": 0.4780709445476532, "learning_rate": 9.889865389062845e-06, "loss": 0.012, "num_tokens": 9522713.0, "reward": 3.7925782203674316, "reward_std": 0.585748016834259, "rewards/evaluation_direction_reward/mean": 0.8046875, "rewards/evaluation_direction_reward/std": 0.3259412348270416, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.375, "rewards/move_legality_reward/std": 0.4860251843929291, "rewards/pv_length_reward/mean": 0.40000003576278687, "rewards/pv_length_reward/std": 0.13593266904354095, "rewards/pv_quality_reward/mean": 0.212890625, "rewards/pv_quality_reward/std": 0.388173907995224, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.649976968765259, "sampling/importance_sampling_ratio/mean": 0.9113112688064575, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.604292631149292, "sampling/sampling_logp_difference/mean": 0.021862102672457695, "step": 252, "step_time": 17.38397814333439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 96.484375, "completions/mean_terminated_length": 96.484375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.5105656329542398, "epoch": 0.2024, "frac_reward_zero_std": 0.75, "grad_norm": 0.9248540997505188, "learning_rate": 9.8889893425609e-06, "loss": 0.0336, "num_tokens": 9553583.0, "reward": 3.5591888427734375, "reward_std": 0.8168637752532959, "rewards/evaluation_direction_reward/mean": 0.87890625, "rewards/evaluation_direction_reward/std": 0.27166804671287537, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1953125, "rewards/move_legality_reward/std": 0.3979988098144531, "rewards/pv_length_reward/mean": 0.36778274178504944, "rewards/pv_length_reward/std": 0.2493160218000412, "rewards/pv_quality_reward/mean": 0.1171875, "rewards/pv_quality_reward/std": 0.3040693402290344, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.990858793258667, "sampling/importance_sampling_ratio/mean": 1.0215094089508057, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7464056015014648, "sampling/sampling_logp_difference/mean": 0.022537177428603172, "step": 253, "step_time": 16.783539697527885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 94.5234375, "completions/mean_terminated_length": 94.5234375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.49080866388976574, "epoch": 0.2032, "frac_reward_zero_std": 0.6875, "grad_norm": 1.0064436197280884, "learning_rate": 9.888109864778036e-06, "loss": 0.047, "num_tokens": 9583914.0, "reward": 3.9278831481933594, "reward_std": 0.6212257146835327, "rewards/evaluation_direction_reward/mean": 0.939453125, "rewards/evaluation_direction_reward/std": 0.13216136395931244, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.5, "rewards/move_legality_reward/std": 0.5019646286964417, "rewards/pv_length_reward/mean": 0.35171130299568176, "rewards/pv_length_reward/std": 0.1771663874387741, "rewards/pv_quality_reward/mean": 0.13671875, "rewards/pv_quality_reward/std": 0.2696223855018616, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.876636028289795, "sampling/importance_sampling_ratio/mean": 0.9566516280174255, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.345326542854309, "sampling/sampling_logp_difference/mean": 0.021799851208925247, "step": 254, "step_time": 17.113194718956947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 93.71875, "completions/mean_terminated_length": 93.71875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.47428605891764164, "epoch": 0.204, "frac_reward_zero_std": 0.9375, "grad_norm": 0.1853635013103485, "learning_rate": 9.887226956331506e-06, "loss": -0.0052, "num_tokens": 9614366.0, "reward": 3.724349021911621, "reward_std": 0.8466208577156067, "rewards/evaluation_direction_reward/mean": 0.8515625, "rewards/evaluation_direction_reward/std": 0.289986252784729, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.375, "rewards/move_legality_reward/std": 0.4860251843929291, "rewards/pv_length_reward/mean": 0.4333333373069763, "rewards/pv_length_reward/std": 0.25485658645629883, "rewards/pv_quality_reward/mean": 0.064453125, "rewards/pv_quality_reward/std": 0.243510439991951, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.3513989448547363, "sampling/importance_sampling_ratio/mean": 0.933074951171875, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6633062362670898, "sampling/sampling_logp_difference/mean": 0.020800109952688217, "step": 255, "step_time": 16.803112633526325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 95.9140625, "completions/mean_terminated_length": 95.9140625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.4913067314773798, "epoch": 0.2048, "frac_reward_zero_std": 0.75, "grad_norm": 0.5660045146942139, "learning_rate": 9.886340617840968e-06, "loss": -0.0076, "num_tokens": 9645547.0, "reward": 3.6637649536132812, "reward_std": 0.4807610511779785, "rewards/evaluation_direction_reward/mean": 0.875, "rewards/evaluation_direction_reward/std": 0.2716963291168213, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.4137648940086365, "rewards/pv_length_reward/std": 0.20706667006015778, "rewards/pv_quality_reward/mean": 0.125, "rewards/pv_quality_reward/std": 0.3320184051990509, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.646975040435791, "sampling/importance_sampling_ratio/mean": 0.9994962215423584, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7454042434692383, "sampling/sampling_logp_difference/mean": 0.021700629964470863, "step": 256, "step_time": 16.907234117388725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 94.828125, "completions/mean_terminated_length": 94.828125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.4901136979460716, "epoch": 0.2056, "frac_reward_zero_std": 0.5, "grad_norm": 0.714374303817749, "learning_rate": 9.885450849928489e-06, "loss": 0.0264, "num_tokens": 9676165.0, "reward": 3.3678572177886963, "reward_std": 0.4951363205909729, "rewards/evaluation_direction_reward/mean": 0.7734375, "rewards/evaluation_direction_reward/std": 0.31126725673675537, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1875, "rewards/move_legality_reward/std": 0.39184603095054626, "rewards/pv_length_reward/mean": 0.3991071581840515, "rewards/pv_length_reward/std": 0.21687495708465576, "rewards/pv_quality_reward/mean": 0.0078125, "rewards/pv_quality_reward/std": 0.04366907477378845, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.4939537048339844, "sampling/importance_sampling_ratio/mean": 0.969041645526886, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5893857479095459, "sampling/sampling_logp_difference/mean": 0.020574891939759254, "step": 257, "step_time": 16.87489653378725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 93.625, "completions/mean_terminated_length": 93.625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.4883228335529566, "epoch": 0.2064, "frac_reward_zero_std": 0.9375, "grad_norm": 0.19903874397277832, "learning_rate": 9.884557653218544e-06, "loss": -0.0055, "num_tokens": 9706789.0, "reward": 3.6438803672790527, "reward_std": 0.53044193983078, "rewards/evaluation_direction_reward/mean": 0.873046875, "rewards/evaluation_direction_reward/std": 0.23476427793502808, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.4427083134651184, "rewards/pv_length_reward/std": 0.1687760204076767, "rewards/pv_quality_reward/mean": 0.078125, "rewards/pv_quality_reward/std": 0.24653105437755585, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.2520675659179688, "sampling/importance_sampling_ratio/mean": 0.8768810629844666, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7734333276748657, "sampling/sampling_logp_difference/mean": 0.02198687754571438, "step": 258, "step_time": 16.739206090569496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 92.3125, "completions/mean_terminated_length": 92.3125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.4705841485410929, "epoch": 0.2072, "frac_reward_zero_std": 0.75, "grad_norm": 0.587548553943634, "learning_rate": 9.883661028338009e-06, "loss": 0.0006, "num_tokens": 9737077.0, "reward": 4.071558952331543, "reward_std": 0.8220670819282532, "rewards/evaluation_direction_reward/mean": 0.71875, "rewards/evaluation_direction_reward/std": 0.3436717092990875, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.5625, "rewards/move_legality_reward/std": 0.49802759289741516, "rewards/pv_length_reward/mean": 0.44851192831993103, "rewards/pv_length_reward/std": 0.26870885491371155, "rewards/pv_quality_reward/mean": 0.341796875, "rewards/pv_quality_reward/std": 0.45014333724975586, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5314736366271973, "sampling/importance_sampling_ratio/mean": 0.9490119814872742, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6851232051849365, "sampling/sampling_logp_difference/mean": 0.020847661420702934, "step": 259, "step_time": 17.380682162940502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 93.84375, "completions/mean_terminated_length": 93.84375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.47628372348845005, "epoch": 0.208, "frac_reward_zero_std": 0.8125, "grad_norm": 0.6387223601341248, "learning_rate": 9.882760975916173e-06, "loss": 0.0089, "num_tokens": 9767817.0, "reward": 3.9422061443328857, "reward_std": 0.6610453724861145, "rewards/evaluation_direction_reward/mean": 0.935546875, "rewards/evaluation_direction_reward/std": 0.14116394519805908, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.5, "rewards/move_legality_reward/std": 0.5019646286964417, "rewards/pv_length_reward/mean": 0.33869045972824097, "rewards/pv_length_reward/std": 0.21584278345108032, "rewards/pv_quality_reward/mean": 0.16796875, "rewards/pv_quality_reward/std": 0.28942885994911194, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9399585723876953, "sampling/importance_sampling_ratio/mean": 1.0163465738296509, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.848233699798584, "sampling/sampling_logp_difference/mean": 0.022073090076446533, "step": 260, "step_time": 16.960545480251312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 119.0, "completions/max_terminated_length": 119.0, "completions/mean_length": 93.390625, "completions/mean_terminated_length": 93.390625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.47909967228770256, "epoch": 0.2088, "frac_reward_zero_std": 0.9375, "grad_norm": 0.5060194134712219, "learning_rate": 9.881857496584726e-06, "loss": 0.0092, "num_tokens": 9798267.0, "reward": 3.5867373943328857, "reward_std": 0.5669169425964355, "rewards/evaluation_direction_reward/mean": 0.90625, "rewards/evaluation_direction_reward/std": 0.19592301547527313, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.33869048953056335, "rewards/pv_length_reward/std": 0.15626724064350128, "rewards/pv_quality_reward/mean": 0.091796875, "rewards/pv_quality_reward/std": 0.1930665671825409, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.4842164516448975, "sampling/importance_sampling_ratio/mean": 1.0519921779632568, "sampling/importance_sampling_ratio/min": 0.21658602356910706, "sampling/sampling_logp_difference/max": 0.7027907371520996, "sampling/sampling_logp_difference/mean": 0.021189743652939796, "step": 261, "step_time": 16.4410208389163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 92.7421875, "completions/mean_terminated_length": 92.7421875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.4782496355473995, "epoch": 0.2096, "frac_reward_zero_std": 0.875, "grad_norm": 0.36264777183532715, "learning_rate": 9.880950590977764e-06, "loss": -0.0054, "num_tokens": 9828522.0, "reward": 3.985881805419922, "reward_std": 0.7467353343963623, "rewards/evaluation_direction_reward/mean": 0.8125, "rewards/evaluation_direction_reward/std": 0.3493525981903076, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.5, "rewards/move_legality_reward/std": 0.5019646286964417, "rewards/pv_length_reward/mean": 0.4839285612106323, "rewards/pv_length_reward/std": 0.2254307121038437, "rewards/pv_quality_reward/mean": 0.189453125, "rewards/pv_quality_reward/std": 0.3682076334953308, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8565075397491455, "sampling/importance_sampling_ratio/mean": 0.9538329839706421, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7936763763427734, "sampling/sampling_logp_difference/mean": 0.022299468517303467, "step": 262, "step_time": 18.061093911528587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 93.90625, "completions/mean_terminated_length": 93.90625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.49330000951886177, "epoch": 0.2104, "frac_reward_zero_std": 0.9375, "grad_norm": 0.2863839268684387, "learning_rate": 9.88004025973179e-06, "loss": 0.0064, "num_tokens": 9859374.0, "reward": 3.480022430419922, "reward_std": 0.40458211302757263, "rewards/evaluation_direction_reward/mean": 0.87890625, "rewards/evaluation_direction_reward/std": 0.19335509836673737, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.0625, "rewards/move_legality_reward/std": 0.24301259219646454, "rewards/pv_length_reward/mean": 0.4839285612106323, "rewards/pv_length_reward/std": 0.19621556997299194, "rewards/pv_quality_reward/mean": 0.0546875, "rewards/pv_quality_reward/std": 0.2149379551410675, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.921642541885376, "sampling/importance_sampling_ratio/mean": 0.9982161521911621, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5532766580581665, "sampling/sampling_logp_difference/mean": 0.020827021449804306, "step": 263, "step_time": 16.80552240461111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 93.96875, "completions/mean_terminated_length": 93.96875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.48688468895852566, "epoch": 0.2112, "frac_reward_zero_std": 0.8125, "grad_norm": 0.47269248962402344, "learning_rate": 9.879126503485709e-06, "loss": -0.0064, "num_tokens": 9890058.0, "reward": 3.319828987121582, "reward_std": 0.29835617542266846, "rewards/evaluation_direction_reward/mean": 0.86328125, "rewards/evaluation_direction_reward/std": 0.21703843772411346, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.0, "rewards/move_legality_reward/std": 0.0, "rewards/pv_length_reward/mean": 0.4565476179122925, "rewards/pv_length_reward/std": 0.20721277594566345, "rewards/pv_quality_reward/mean": 0.0, "rewards/pv_quality_reward/std": 0.0, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.750805139541626, "sampling/importance_sampling_ratio/mean": 0.9014521837234497, "sampling/importance_sampling_ratio/min": 0.235592320561409, "sampling/sampling_logp_difference/max": 0.6872525215148926, "sampling/sampling_logp_difference/mean": 0.021146930754184723, "step": 264, "step_time": 16.174600318074226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 91.8671875, "completions/mean_terminated_length": 91.8671875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.47176135517656803, "epoch": 0.212, "frac_reward_zero_std": 0.875, "grad_norm": 0.46355053782463074, "learning_rate": 9.87820932288083e-06, "loss": -0.0015, "num_tokens": 9920129.0, "reward": 3.6913504600524902, "reward_std": 0.8366533517837524, "rewards/evaluation_direction_reward/mean": 0.857421875, "rewards/evaluation_direction_reward/std": 0.2800512909889221, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.41205358505249023, "rewards/pv_length_reward/std": 0.2197059988975525, "rewards/pv_quality_reward/mean": 0.171875, "rewards/pv_quality_reward/std": 0.3628273606300354, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7274668216705322, "sampling/importance_sampling_ratio/mean": 0.9486125707626343, "sampling/importance_sampling_ratio/min": 0.28599607944488525, "sampling/sampling_logp_difference/max": 0.7098314762115479, "sampling/sampling_logp_difference/mean": 0.02049780637025833, "step": 265, "step_time": 16.893886253237724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 119.0, "completions/max_terminated_length": 119.0, "completions/mean_length": 92.796875, "completions/mean_terminated_length": 92.796875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.47614573687314987, "epoch": 0.2128, "frac_reward_zero_std": 0.75, "grad_norm": 0.6514647006988525, "learning_rate": 9.877288718560866e-06, "loss": -0.015, "num_tokens": 9950447.0, "reward": 3.833463430404663, "reward_std": 0.4715459644794464, "rewards/evaluation_direction_reward/mean": 0.953125, "rewards/evaluation_direction_reward/std": 0.14290985465049744, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.4375, "rewards/move_legality_reward/std": 0.49802759289741516, "rewards/pv_length_reward/mean": 0.40572917461395264, "rewards/pv_length_reward/std": 0.2481818050146103, "rewards/pv_quality_reward/mean": 0.037109375, "rewards/pv_quality_reward/std": 0.12584300339221954, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.4359240531921387, "sampling/importance_sampling_ratio/mean": 0.9574443101882935, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.749119758605957, "sampling/sampling_logp_difference/mean": 0.020710602402687073, "step": 266, "step_time": 16.866929963231087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 94.4375, "completions/mean_terminated_length": 94.4375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.4892511051148176, "epoch": 0.2136, "frac_reward_zero_std": 0.875, "grad_norm": 0.46271079778671265, "learning_rate": 9.876364691171933e-06, "loss": -0.004, "num_tokens": 9981223.0, "reward": 3.2660343647003174, "reward_std": 0.4775852560997009, "rewards/evaluation_direction_reward/mean": 0.71484375, "rewards/evaluation_direction_reward/std": 0.36416009068489075, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.0625, "rewards/move_legality_reward/std": 0.24301259219646454, "rewards/pv_length_reward/mean": 0.48869046568870544, "rewards/pv_length_reward/std": 0.1962883472442627, "rewards/pv_quality_reward/mean": 0.0, "rewards/pv_quality_reward/std": 0.0, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.3441455364227295, "sampling/importance_sampling_ratio/mean": 0.8906300067901611, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5815517902374268, "sampling/sampling_logp_difference/mean": 0.021631544455885887, "step": 267, "step_time": 16.75079169869423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 92.859375, "completions/mean_terminated_length": 92.859375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.48605198971927166, "epoch": 0.2144, "frac_reward_zero_std": 0.875, "grad_norm": 0.36478447914123535, "learning_rate": 9.875437241362546e-06, "loss": -0.0117, "num_tokens": 10011717.0, "reward": 3.7022321224212646, "reward_std": 0.40133941173553467, "rewards/evaluation_direction_reward/mean": 0.927734375, "rewards/evaluation_direction_reward/std": 0.25128084421157837, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3125, "rewards/move_legality_reward/std": 0.4653336703777313, "rewards/pv_length_reward/mean": 0.40535715222358704, "rewards/pv_length_reward/std": 0.18619772791862488, "rewards/pv_quality_reward/mean": 0.056640625, "rewards/pv_quality_reward/std": 0.2223152220249176, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8728153705596924, "sampling/importance_sampling_ratio/mean": 0.9848246574401855, "sampling/importance_sampling_ratio/min": 0.11337102204561234, "sampling/sampling_logp_difference/max": 0.747464656829834, "sampling/sampling_logp_difference/mean": 0.020953506231307983, "step": 268, "step_time": 17.566472567617893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 92.8203125, "completions/mean_terminated_length": 92.8203125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.4797453247010708, "epoch": 0.2152, "frac_reward_zero_std": 0.75, "grad_norm": 0.5625317692756653, "learning_rate": 9.874506369783629e-06, "loss": 0.0096, "num_tokens": 10042190.0, "reward": 3.49808406829834, "reward_std": 0.6985728144645691, "rewards/evaluation_direction_reward/mean": 0.8359375, "rewards/evaluation_direction_reward/std": 0.276076078414917, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.33988094329833984, "rewards/pv_length_reward/std": 0.1858445107936859, "rewards/pv_quality_reward/mean": 0.072265625, "rewards/pv_quality_reward/std": 0.24931469559669495, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7225630283355713, "sampling/importance_sampling_ratio/mean": 0.9501902461051941, "sampling/importance_sampling_ratio/min": 0.20485380291938782, "sampling/sampling_logp_difference/max": 0.7180817127227783, "sampling/sampling_logp_difference/mean": 0.021078554913401604, "step": 269, "step_time": 16.72538471221924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 92.65625, "completions/mean_terminated_length": 92.65625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.47093166038393974, "epoch": 0.216, "frac_reward_zero_std": 0.9375, "grad_norm": 0.39299219846725464, "learning_rate": 9.873572077088502e-06, "loss": 0.0059, "num_tokens": 10072410.0, "reward": 3.936768054962158, "reward_std": 0.6950901746749878, "rewards/evaluation_direction_reward/mean": 0.849609375, "rewards/evaluation_direction_reward/std": 0.21585720777511597, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.4375, "rewards/move_legality_reward/std": 0.49802759289741516, "rewards/pv_length_reward/mean": 0.3996586203575134, "rewards/pv_length_reward/std": 0.1959814578294754, "rewards/pv_quality_reward/mean": 0.25, "rewards/pv_quality_reward/std": 0.3436717092990875, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.505139112472534, "sampling/importance_sampling_ratio/mean": 0.9527800679206848, "sampling/importance_sampling_ratio/min": 0.2606716454029083, "sampling/sampling_logp_difference/max": 0.632954478263855, "sampling/sampling_logp_difference/mean": 0.02106475457549095, "step": 270, "step_time": 16.58033510297537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 89.78125, "completions/mean_terminated_length": 89.78125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.4552720636129379, "epoch": 0.2168, "frac_reward_zero_std": 0.9375, "grad_norm": 0.2541173994541168, "learning_rate": 9.872634363932887e-06, "loss": -0.0072, "num_tokens": 10102542.0, "reward": 3.9124317169189453, "reward_std": 0.5574107766151428, "rewards/evaluation_direction_reward/mean": 0.84375, "rewards/evaluation_direction_reward/std": 0.34223672747612, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.5, "rewards/move_legality_reward/std": 0.5019646286964417, "rewards/pv_length_reward/mean": 0.3362599313259125, "rewards/pv_length_reward/std": 0.17705222964286804, "rewards/pv_quality_reward/mean": 0.232421875, "rewards/pv_quality_reward/std": 0.39081934094429016, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5537214279174805, "sampling/importance_sampling_ratio/mean": 0.9742228984832764, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6296958923339844, "sampling/sampling_logp_difference/mean": 0.020373331382870674, "step": 271, "step_time": 17.437726132571697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 93.25, "completions/mean_terminated_length": 93.25, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.4845840707421303, "epoch": 0.2176, "frac_reward_zero_std": 0.8125, "grad_norm": 0.6562314033508301, "learning_rate": 9.871693230974907e-06, "loss": 0.0148, "num_tokens": 10132774.0, "reward": 3.615680694580078, "reward_std": 0.7659569978713989, "rewards/evaluation_direction_reward/mean": 0.91015625, "rewards/evaluation_direction_reward/std": 0.24848829209804535, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1875, "rewards/move_legality_reward/std": 0.39184603095054626, "rewards/pv_length_reward/mean": 0.4535714387893677, "rewards/pv_length_reward/std": 0.2093089520931244, "rewards/pv_quality_reward/mean": 0.064453125, "rewards/pv_quality_reward/std": 0.243510439991951, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.490668773651123, "sampling/importance_sampling_ratio/mean": 0.9968818426132202, "sampling/importance_sampling_ratio/min": 0.1897338479757309, "sampling/sampling_logp_difference/max": 0.9737803936004639, "sampling/sampling_logp_difference/mean": 0.021284209564328194, "step": 272, "step_time": 16.45322073251009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 119.0, "completions/max_terminated_length": 119.0, "completions/mean_length": 93.2421875, "completions/mean_terminated_length": 93.2421875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.47560771740972996, "epoch": 0.2184, "frac_reward_zero_std": 0.5625, "grad_norm": 1.0638282299041748, "learning_rate": 9.870748678875086e-06, "loss": -0.0503, "num_tokens": 10163181.0, "reward": 3.51175594329834, "reward_std": 0.5394186973571777, "rewards/evaluation_direction_reward/mean": 0.802734375, "rewards/evaluation_direction_reward/std": 0.207279771566391, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1875, "rewards/move_legality_reward/std": 0.39184603095054626, "rewards/pv_length_reward/mean": 0.46488097310066223, "rewards/pv_length_reward/std": 0.18611332774162292, "rewards/pv_quality_reward/mean": 0.056640625, "rewards/pv_quality_reward/std": 0.2223152220249176, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.6978158950805664, "sampling/importance_sampling_ratio/mean": 0.935054361820221, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7244789600372314, "sampling/sampling_logp_difference/mean": 0.021124577149748802, "step": 273, "step_time": 16.311005026102066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 91.5859375, "completions/mean_terminated_length": 91.5859375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.4613671060651541, "epoch": 0.2192, "frac_reward_zero_std": 0.9375, "grad_norm": 0.4018169641494751, "learning_rate": 9.869800708296347e-06, "loss": 0.0129, "num_tokens": 10193480.0, "reward": 3.478087902069092, "reward_std": 0.5987433195114136, "rewards/evaluation_direction_reward/mean": 0.779296875, "rewards/evaluation_direction_reward/std": 0.39262527227401733, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.38824406266212463, "rewards/pv_length_reward/std": 0.1749941110610962, "rewards/pv_quality_reward/mean": 0.060546875, "rewards/pv_quality_reward/std": 0.23633123934268951, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8113651275634766, "sampling/importance_sampling_ratio/mean": 0.9667813777923584, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6731845140457153, "sampling/sampling_logp_difference/mean": 0.021313171833753586, "step": 274, "step_time": 16.689988933503628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 92.484375, "completions/mean_terminated_length": 92.484375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.47540988214313984, "epoch": 0.22, "frac_reward_zero_std": 0.75, "grad_norm": 0.5918298959732056, "learning_rate": 9.868849319904012e-06, "loss": -0.0138, "num_tokens": 10223902.0, "reward": 3.7865512371063232, "reward_std": 0.5948868989944458, "rewards/evaluation_direction_reward/mean": 0.931640625, "rewards/evaluation_direction_reward/std": 0.14952300488948822, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.4375, "rewards/move_legality_reward/std": 0.49802759289741516, "rewards/pv_length_reward/mean": 0.3392857313156128, "rewards/pv_length_reward/std": 0.14734362065792084, "rewards/pv_quality_reward/mean": 0.078125, "rewards/pv_quality_reward/std": 0.17111836373806, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7832624912261963, "sampling/importance_sampling_ratio/mean": 0.9399960041046143, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6185944080352783, "sampling/sampling_logp_difference/mean": 0.02081560343503952, "step": 275, "step_time": 17.243385553359985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 93.421875, "completions/mean_terminated_length": 93.421875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.4619004260748625, "epoch": 0.2208, "frac_reward_zero_std": 0.875, "grad_norm": 0.48553138971328735, "learning_rate": 9.867894514365802e-06, "loss": 0.0165, "num_tokens": 10254484.0, "reward": 3.8606772422790527, "reward_std": 0.5410681962966919, "rewards/evaluation_direction_reward/mean": 0.927734375, "rewards/evaluation_direction_reward/std": 0.16939082741737366, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.5, "rewards/move_legality_reward/std": 0.5019646286964417, "rewards/pv_length_reward/mean": 0.3645833432674408, "rewards/pv_length_reward/std": 0.1417149156332016, "rewards/pv_quality_reward/mean": 0.068359375, "rewards/pv_quality_reward/std": 0.24445593357086182, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7247610092163086, "sampling/importance_sampling_ratio/mean": 1.0040212869644165, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7146055698394775, "sampling/sampling_logp_difference/mean": 0.020520281046628952, "step": 276, "step_time": 17.328942485153675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 91.421875, "completions/mean_terminated_length": 91.421875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.4724207278341055, "epoch": 0.2216, "frac_reward_zero_std": 0.4375, "grad_norm": 0.9493516683578491, "learning_rate": 9.866936292351837e-06, "loss": 0.0413, "num_tokens": 10284610.0, "reward": 3.4731009006500244, "reward_std": 0.8073118329048157, "rewards/evaluation_direction_reward/mean": 0.744140625, "rewards/evaluation_direction_reward/std": 0.25383833050727844, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1875, "rewards/move_legality_reward/std": 0.39184603095054626, "rewards/pv_length_reward/mean": 0.4438041150569916, "rewards/pv_length_reward/std": 0.20147614181041718, "rewards/pv_quality_reward/mean": 0.09765625, "rewards/pv_quality_reward/std": 0.27212053537368774, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.775667190551758, "sampling/importance_sampling_ratio/mean": 0.963442325592041, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7144937515258789, "sampling/sampling_logp_difference/mean": 0.021387292072176933, "step": 277, "step_time": 16.6855748295784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/max_terminated_length": 114.0, "completions/mean_length": 91.6953125, "completions/mean_terminated_length": 91.6953125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.4672803785651922, "epoch": 0.2224, "frac_reward_zero_std": 0.75, "grad_norm": 0.6167562007904053, "learning_rate": 9.865974654534634e-06, "loss": 0.0169, "num_tokens": 10314955.0, "reward": 3.659982681274414, "reward_std": 0.6388766765594482, "rewards/evaluation_direction_reward/mean": 0.869140625, "rewards/evaluation_direction_reward/std": 0.2698148190975189, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.4451389014720917, "rewards/pv_length_reward/std": 0.18483242392539978, "rewards/pv_quality_reward/mean": 0.095703125, "rewards/pv_quality_reward/std": 0.25510746240615845, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.2764148712158203, "sampling/importance_sampling_ratio/mean": 0.9408693313598633, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5969638824462891, "sampling/sampling_logp_difference/mean": 0.02106255292892456, "step": 278, "step_time": 16.373732186853886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 92.3984375, "completions/mean_terminated_length": 92.3984375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.47043224051594734, "epoch": 0.2232, "frac_reward_zero_std": 0.8125, "grad_norm": 0.44934388995170593, "learning_rate": 9.865009601589105e-06, "loss": -0.004, "num_tokens": 10345550.0, "reward": 3.641555070877075, "reward_std": 0.826061487197876, "rewards/evaluation_direction_reward/mean": 0.857421875, "rewards/evaluation_direction_reward/std": 0.2503611445426941, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.2421875, "rewards/move_legality_reward/std": 0.4300905168056488, "rewards/pv_length_reward/mean": 0.4188988208770752, "rewards/pv_length_reward/std": 0.2181384563446045, "rewards/pv_quality_reward/mean": 0.123046875, "rewards/pv_quality_reward/std": 0.2752881348133087, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.51180362701416, "sampling/importance_sampling_ratio/mean": 0.882743239402771, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5278523564338684, "sampling/sampling_logp_difference/mean": 0.021533522754907608, "step": 279, "step_time": 16.99904826283455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 91.5390625, "completions/mean_terminated_length": 91.5390625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.4611786212772131, "epoch": 0.224, "frac_reward_zero_std": 0.8125, "grad_norm": 0.8283146619796753, "learning_rate": 9.864041134192563e-06, "loss": 0.0226, "num_tokens": 10375587.0, "reward": 3.9083518981933594, "reward_std": 0.5398520231246948, "rewards/evaluation_direction_reward/mean": 0.947265625, "rewards/evaluation_direction_reward/std": 0.14257317781448364, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.4296875, "rewards/move_legality_reward/std": 0.4969765841960907, "rewards/pv_length_reward/mean": 0.42202383279800415, "rewards/pv_length_reward/std": 0.229537233710289, "rewards/pv_quality_reward/mean": 0.109375, "rewards/pv_quality_reward/std": 0.2657443881034851, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9726340770721436, "sampling/importance_sampling_ratio/mean": 1.0500056743621826, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6569018363952637, "sampling/sampling_logp_difference/mean": 0.020574182271957397, "step": 280, "step_time": 17.14847891777754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 89.2734375, "completions/mean_terminated_length": 89.2734375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.47057723067700863, "epoch": 0.2248, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6409481167793274, "learning_rate": 9.863069253024719e-06, "loss": -0.0245, "num_tokens": 10405590.0, "reward": 3.5124053955078125, "reward_std": 0.5755425095558167, "rewards/evaluation_direction_reward/mean": 0.8125, "rewards/evaluation_direction_reward/std": 0.2875363826751709, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1875, "rewards/move_legality_reward/std": 0.39184603095054626, "rewards/pv_length_reward/mean": 0.4655303359031677, "rewards/pv_length_reward/std": 0.19921213388442993, "rewards/pv_quality_reward/mean": 0.046875, "rewards/pv_quality_reward/std": 0.18493986129760742, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.411996364593506, "sampling/importance_sampling_ratio/mean": 0.9328136444091797, "sampling/importance_sampling_ratio/min": 0.2294369637966156, "sampling/sampling_logp_difference/max": 0.5524171590805054, "sampling/sampling_logp_difference/mean": 0.020359497517347336, "step": 281, "step_time": 16.142147406935692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 91.25, "completions/mean_terminated_length": 91.25, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.45492017082870007, "epoch": 0.2256, "frac_reward_zero_std": 0.875, "grad_norm": 0.5670854449272156, "learning_rate": 9.862093958767671e-06, "loss": 0.0113, "num_tokens": 10435894.0, "reward": 3.8848958015441895, "reward_std": 0.7164313793182373, "rewards/evaluation_direction_reward/mean": 0.8828125, "rewards/evaluation_direction_reward/std": 0.21030887961387634, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.4375, "rewards/move_legality_reward/std": 0.49802759289741516, "rewards/pv_length_reward/mean": 0.40833336114883423, "rewards/pv_length_reward/std": 0.2525114417076111, "rewards/pv_quality_reward/mean": 0.15625, "rewards/pv_quality_reward/std": 0.34223672747612, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.2119393348693848, "sampling/importance_sampling_ratio/mean": 0.9410221576690674, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8688621520996094, "sampling/sampling_logp_difference/mean": 0.020418323576450348, "step": 282, "step_time": 16.37001907080412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 113.0, "completions/max_terminated_length": 113.0, "completions/mean_length": 91.015625, "completions/mean_terminated_length": 91.015625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.45576743222773075, "epoch": 0.2264, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6399089097976685, "learning_rate": 9.861115252105922e-06, "loss": -0.0329, "num_tokens": 10466296.0, "reward": 3.417795181274414, "reward_std": 0.5205116868019104, "rewards/evaluation_direction_reward/mean": 0.87109375, "rewards/evaluation_direction_reward/std": 0.2698504328727722, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1875, "rewards/move_legality_reward/std": 0.39184603095054626, "rewards/pv_length_reward/mean": 0.3513889014720917, "rewards/pv_length_reward/std": 0.1355244368314743, "rewards/pv_quality_reward/mean": 0.0078125, "rewards/pv_quality_reward/std": 0.04366907477378845, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8341317176818848, "sampling/importance_sampling_ratio/mean": 0.9505306482315063, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6528794765472412, "sampling/sampling_logp_difference/mean": 0.021700924262404442, "step": 283, "step_time": 16.19833254814148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 119.0, "completions/max_terminated_length": 119.0, "completions/mean_length": 92.8203125, "completions/mean_terminated_length": 92.8203125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.47980040684342384, "epoch": 0.2272, "frac_reward_zero_std": 0.625, "grad_norm": 0.7170718312263489, "learning_rate": 9.860133133726364e-06, "loss": -0.0161, "num_tokens": 10496833.0, "reward": 3.7816033363342285, "reward_std": 0.5831307768821716, "rewards/evaluation_direction_reward/mean": 0.955078125, "rewards/evaluation_direction_reward/std": 0.11498713493347168, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.375, "rewards/move_legality_reward/std": 0.4860251843929291, "rewards/pv_length_reward/mean": 0.3851190507411957, "rewards/pv_length_reward/std": 0.17788031697273254, "rewards/pv_quality_reward/mean": 0.06640625, "rewards/pv_quality_reward/std": 0.1703527420759201, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8087663650512695, "sampling/importance_sampling_ratio/mean": 1.0080652236938477, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8802461624145508, "sampling/sampling_logp_difference/mean": 0.0217270627617836, "step": 284, "step_time": 17.01715798676014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 119.0, "completions/max_terminated_length": 119.0, "completions/mean_length": 92.0859375, "completions/mean_terminated_length": 92.0859375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.47133895196020603, "epoch": 0.228, "frac_reward_zero_std": 0.625, "grad_norm": 1.0173856019973755, "learning_rate": 9.859147604318286e-06, "loss": 0.01, "num_tokens": 10526940.0, "reward": 3.694531202316284, "reward_std": 0.5611398816108704, "rewards/evaluation_direction_reward/mean": 0.86328125, "rewards/evaluation_direction_reward/std": 0.2603362798690796, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.45625001192092896, "rewards/pv_length_reward/std": 0.18087714910507202, "rewards/pv_quality_reward/mean": 0.125, "rewards/pv_quality_reward/std": 0.3320184051990509, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8310036659240723, "sampling/importance_sampling_ratio/mean": 1.032743215560913, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.814582347869873, "sampling/sampling_logp_difference/mean": 0.02139401063323021, "step": 285, "step_time": 16.67747375369072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 92.9609375, "completions/mean_terminated_length": 92.9609375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.47602468356490135, "epoch": 0.2288, "frac_reward_zero_std": 0.9375, "grad_norm": 0.36301514506340027, "learning_rate": 9.85815866457337e-06, "loss": -0.0049, "num_tokens": 10557063.0, "reward": 3.724869728088379, "reward_std": 0.45798060297966003, "rewards/evaluation_direction_reward/mean": 1.0, "rewards/evaluation_direction_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.4416666626930237, "rewards/pv_length_reward/std": 0.18462324142456055, "rewards/pv_quality_reward/mean": 0.033203125, "rewards/pv_quality_reward/std": 0.1230001151561737, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.946251153945923, "sampling/importance_sampling_ratio/mean": 0.9465701580047607, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.879497766494751, "sampling/sampling_logp_difference/mean": 0.023022154346108437, "step": 286, "step_time": 16.213660798966885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 90.625, "completions/mean_terminated_length": 90.625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.46368678100407124, "epoch": 0.2296, "frac_reward_zero_std": 0.625, "grad_norm": 0.7363535761833191, "learning_rate": 9.857166315185693e-06, "loss": -0.0001, "num_tokens": 10586975.0, "reward": 3.652496337890625, "reward_std": 0.5255768895149231, "rewards/evaluation_direction_reward/mean": 0.8515625, "rewards/evaluation_direction_reward/std": 0.20195281505584717, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.3125, "rewards/move_legality_reward/std": 0.4653336703777313, "rewards/pv_length_reward/mean": 0.4203869104385376, "rewards/pv_length_reward/std": 0.1782141774892807, "rewards/pv_quality_reward/mean": 0.068359375, "rewards/pv_quality_reward/std": 0.18484629690647125, "rewards/verbosity_reward/mean": 0.9996874928474426, "rewards/verbosity_reward/std": 0.0035355358850210905, "sampling/importance_sampling_ratio/max": 2.5468616485595703, "sampling/importance_sampling_ratio/mean": 0.9752179384231567, "sampling/importance_sampling_ratio/min": 0.27615824341773987, "sampling/sampling_logp_difference/max": 0.564537525177002, "sampling/sampling_logp_difference/mean": 0.021646270528435707, "step": 287, "step_time": 16.86013199388981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 92.9140625, "completions/mean_terminated_length": 92.9140625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.4783679377287626, "epoch": 0.2304, "frac_reward_zero_std": 0.75, "grad_norm": 0.496090829372406, "learning_rate": 9.856170556851725e-06, "loss": 0.004, "num_tokens": 10617492.0, "reward": 3.462146520614624, "reward_std": 0.5793706774711609, "rewards/evaluation_direction_reward/mean": 0.806640625, "rewards/evaluation_direction_reward/std": 0.29798343777656555, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1875, "rewards/move_legality_reward/std": 0.39184603095054626, "rewards/pv_length_reward/mean": 0.3898809552192688, "rewards/pv_length_reward/std": 0.14635512232780457, "rewards/pv_quality_reward/mean": 0.078125, "rewards/pv_quality_reward/std": 0.24653105437755585, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.5328357219696045, "sampling/importance_sampling_ratio/mean": 0.8534888029098511, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5616698265075684, "sampling/sampling_logp_difference/mean": 0.022278130054473877, "step": 288, "step_time": 16.941840283572674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 92.1640625, "completions/mean_terminated_length": 92.1640625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.46948322281241417, "epoch": 0.2312, "frac_reward_zero_std": 0.9375, "grad_norm": 0.2832399904727936, "learning_rate": 9.855171390270325e-06, "loss": 0.0018, "num_tokens": 10647449.0, "reward": 3.5660157203674316, "reward_std": 0.7026739716529846, "rewards/evaluation_direction_reward/mean": 0.916015625, "rewards/evaluation_direction_reward/std": 0.21342110633850098, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.125, "rewards/move_legality_reward/std": 0.3320184051990509, "rewards/pv_length_reward/mean": 0.4625000059604645, "rewards/pv_length_reward/std": 0.2629331946372986, "rewards/pv_quality_reward/mean": 0.0625, "rewards/pv_quality_reward/std": 0.24301259219646454, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.8552756309509277, "sampling/importance_sampling_ratio/mean": 0.9896925091743469, "sampling/importance_sampling_ratio/min": 0.3334096074104309, "sampling/sampling_logp_difference/max": 0.5035052299499512, "sampling/sampling_logp_difference/mean": 0.021430136635899544, "step": 289, "step_time": 15.894281961023808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 90.4453125, "completions/mean_terminated_length": 90.4453125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.4602336697280407, "epoch": 0.232, "frac_reward_zero_std": 0.875, "grad_norm": 0.48520272970199585, "learning_rate": 9.854168816142747e-06, "loss": 0.0077, "num_tokens": 10677138.0, "reward": 3.498220443725586, "reward_std": 0.5213614702224731, "rewards/evaluation_direction_reward/mean": 0.943359375, "rewards/evaluation_direction_reward/std": 0.144501730799675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1875, "rewards/move_legality_reward/std": 0.39184603095054626, "rewards/pv_length_reward/mean": 0.3361111283302307, "rewards/pv_length_reward/std": 0.1856314241886139, "rewards/pv_quality_reward/mean": 0.03125, "rewards/pv_quality_reward/std": 0.12150629609823227, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.472017526626587, "sampling/importance_sampling_ratio/mean": 0.9950680732727051, "sampling/importance_sampling_ratio/min": 0.2824673354625702, "sampling/sampling_logp_difference/max": 0.603421688079834, "sampling/sampling_logp_difference/mean": 0.021351726725697517, "step": 290, "step_time": 16.022014901041985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 90.09375, "completions/mean_terminated_length": 90.09375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.4680994153022766, "epoch": 0.2328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 9.853162835172638e-06, "loss": 0.0, "num_tokens": 10706918.0, "reward": 3.512500286102295, "reward_std": 0.36483675241470337, "rewards/evaluation_direction_reward/mean": 1.0, "rewards/evaluation_direction_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.125, "rewards/move_legality_reward/std": 0.3320184051990509, "rewards/pv_length_reward/mean": 0.38749998807907104, "rewards/pv_length_reward/std": 0.19109667837619781, "rewards/pv_quality_reward/mean": 0.0, "rewards/pv_quality_reward/std": 0.0, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.9554576873779297, "sampling/importance_sampling_ratio/mean": 1.0185034275054932, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5725469589233398, "sampling/sampling_logp_difference/mean": 0.02163226343691349, "step": 291, "step_time": 16.54939780384302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/max_terminated_length": 116.0, "completions/mean_length": 93.0625, "completions/mean_terminated_length": 93.0625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.46601790376007557, "epoch": 0.2336, "frac_reward_zero_std": 0.875, "grad_norm": 0.3398497998714447, "learning_rate": 9.852153448066031e-06, "loss": 0.0098, "num_tokens": 10737646.0, "reward": 3.376283645629883, "reward_std": 0.36790499091148376, "rewards/evaluation_direction_reward/mean": 0.931640625, "rewards/evaluation_direction_reward/std": 0.1794426143169403, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.125, "rewards/move_legality_reward/std": 0.3320184051990509, "rewards/pv_length_reward/mean": 0.3196428418159485, "rewards/pv_length_reward/std": 0.1652243584394455, "rewards/pv_quality_reward/mean": 0.0, "rewards/pv_quality_reward/std": 0.0, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.532820224761963, "sampling/importance_sampling_ratio/mean": 0.9457254409790039, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6733040809631348, "sampling/sampling_logp_difference/mean": 0.02210535854101181, "step": 292, "step_time": 16.66788686066866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 92.28125, "completions/mean_terminated_length": 92.28125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.4646486509591341, "epoch": 0.2344, "frac_reward_zero_std": 0.8125, "grad_norm": 0.4346781075000763, "learning_rate": 9.851140655531357e-06, "loss": -0.0226, "num_tokens": 10768042.0, "reward": 3.5130085945129395, "reward_std": 0.39515069127082825, "rewards/evaluation_direction_reward/mean": 0.908203125, "rewards/evaluation_direction_reward/std": 0.25072944164276123, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.125, "rewards/move_legality_reward/std": 0.3320184051990509, "rewards/pv_length_reward/mean": 0.4075396955013275, "rewards/pv_length_reward/std": 0.19473432004451752, "rewards/pv_quality_reward/mean": 0.072265625, "rewards/pv_quality_reward/std": 0.24533510208129883, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.578496217727661, "sampling/importance_sampling_ratio/mean": 0.9613955616950989, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5323832035064697, "sampling/sampling_logp_difference/mean": 0.021809345111250877, "step": 293, "step_time": 16.36730906367302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 92.7109375, "completions/mean_terminated_length": 92.7109375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.462736239656806, "epoch": 0.2352, "frac_reward_zero_std": 0.6875, "grad_norm": 0.859946072101593, "learning_rate": 9.850124458279429e-06, "loss": 0.0205, "num_tokens": 10798661.0, "reward": 3.6545495986938477, "reward_std": 0.6664267778396606, "rewards/evaluation_direction_reward/mean": 0.93359375, "rewards/evaluation_direction_reward/std": 0.1583762764930725, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.3479090631008148, "rewards/pv_length_reward/std": 0.2359127551317215, "rewards/pv_quality_reward/mean": 0.123046875, "rewards/pv_quality_reward/std": 0.22403796017169952, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.486553430557251, "sampling/importance_sampling_ratio/mean": 1.0724773406982422, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9996781349182129, "sampling/sampling_logp_difference/mean": 0.022328447550535202, "step": 294, "step_time": 16.714472576975822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/max_terminated_length": 111.0, "completions/mean_length": 90.75, "completions/mean_terminated_length": 90.75, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.44526692293584347, "epoch": 0.236, "frac_reward_zero_std": 0.875, "grad_norm": 0.27940237522125244, "learning_rate": 9.849104857023455e-06, "loss": 0.0062, "num_tokens": 10828973.0, "reward": 3.7433595657348633, "reward_std": 0.6942815780639648, "rewards/evaluation_direction_reward/mean": 0.70703125, "rewards/evaluation_direction_reward/std": 0.44046032428741455, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.375, "rewards/move_legality_reward/std": 0.4860251843929291, "rewards/pv_length_reward/mean": 0.48750001192092896, "rewards/pv_length_reward/std": 0.2674202024936676, "rewards/pv_quality_reward/mean": 0.173828125, "rewards/pv_quality_reward/std": 0.3416014313697815, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.780669689178467, "sampling/importance_sampling_ratio/mean": 0.9845236539840698, "sampling/importance_sampling_ratio/min": 0.19872799515724182, "sampling/sampling_logp_difference/max": 0.7492289543151855, "sampling/sampling_logp_difference/mean": 0.021477248519659042, "step": 295, "step_time": 16.222284726798534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 107.0, "completions/max_terminated_length": 107.0, "completions/mean_length": 87.1953125, "completions/mean_terminated_length": 87.1953125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.4443143103271723, "epoch": 0.2368, "frac_reward_zero_std": 0.875, "grad_norm": 0.620238184928894, "learning_rate": 9.84808185247903e-06, "loss": -0.0003, "num_tokens": 10858526.0, "reward": 3.624119520187378, "reward_std": 0.6349241733551025, "rewards/evaluation_direction_reward/mean": 0.91796875, "rewards/evaluation_direction_reward/std": 0.1508159339427948, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.3936508297920227, "rewards/pv_length_reward/std": 0.21088062226772308, "rewards/pv_quality_reward/mean": 0.0625, "rewards/pv_quality_reward/std": 0.24301259219646454, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.7678534984588623, "sampling/importance_sampling_ratio/mean": 1.0140568017959595, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5997982025146484, "sampling/sampling_logp_difference/mean": 0.020497802644968033, "step": 296, "step_time": 15.66672097146511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/max_terminated_length": 114.0, "completions/mean_length": 90.7421875, "completions/mean_terminated_length": 90.7421875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.46256712824106216, "epoch": 0.2376, "frac_reward_zero_std": 0.625, "grad_norm": 0.9867584109306335, "learning_rate": 9.84705544536414e-06, "loss": -0.0558, "num_tokens": 10888957.0, "reward": 3.454784870147705, "reward_std": 0.5506653785705566, "rewards/evaluation_direction_reward/mean": 0.830078125, "rewards/evaluation_direction_reward/std": 0.23463322222232819, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.1875, "rewards/move_legality_reward/std": 0.39184603095054626, "rewards/pv_length_reward/mean": 0.405956894159317, "rewards/pv_length_reward/std": 0.18692633509635925, "rewards/pv_quality_reward/mean": 0.03125, "rewards/pv_quality_reward/std": 0.12150629609823227, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.953326940536499, "sampling/importance_sampling_ratio/mean": 0.9826947450637817, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8819694519042969, "sampling/sampling_logp_difference/mean": 0.021383950486779213, "step": 297, "step_time": 16.592852652072906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 91.1015625, "completions/mean_terminated_length": 91.1015625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.4567346479743719, "epoch": 0.2384, "frac_reward_zero_std": 0.8125, "grad_norm": 0.47794055938720703, "learning_rate": 9.846025636399152e-06, "loss": -0.0032, "num_tokens": 10919146.0, "reward": 3.3851747512817383, "reward_std": 0.350882887840271, "rewards/evaluation_direction_reward/mean": 0.912109375, "rewards/evaluation_direction_reward/std": 0.2231438010931015, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.0625, "rewards/move_legality_reward/std": 0.24301259219646454, "rewards/pv_length_reward/mean": 0.39494049549102783, "rewards/pv_length_reward/std": 0.1534409075975418, "rewards/pv_quality_reward/mean": 0.015625, "rewards/pv_quality_reward/std": 0.060753148049116135, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.2540078163146973, "sampling/importance_sampling_ratio/mean": 0.9385329484939575, "sampling/importance_sampling_ratio/min": 0.22314825654029846, "sampling/sampling_logp_difference/max": 0.6416785717010498, "sampling/sampling_logp_difference/mean": 0.021224725991487503, "step": 298, "step_time": 16.703831426799297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 86.7734375, "completions/mean_terminated_length": 86.7734375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.4486342966556549, "epoch": 0.2392, "frac_reward_zero_std": 0.8125, "grad_norm": 0.5375232696533203, "learning_rate": 9.844992426306832e-06, "loss": -0.0191, "num_tokens": 10948493.0, "reward": 3.5879526138305664, "reward_std": 0.4960968494415283, "rewards/evaluation_direction_reward/mean": 0.927734375, "rewards/evaluation_direction_reward/std": 0.17227159440517426, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.25, "rewards/move_legality_reward/std": 0.434714138507843, "rewards/pv_length_reward/mean": 0.3789682388305664, "rewards/pv_length_reward/std": 0.21630746126174927, "rewards/pv_quality_reward/mean": 0.03125, "rewards/pv_quality_reward/std": 0.10867853462696075, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.871483564376831, "sampling/importance_sampling_ratio/mean": 1.0117902755737305, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6267547607421875, "sampling/sampling_logp_difference/mean": 0.020845094695687294, "step": 299, "step_time": 16.540189526975155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 87.3515625, "completions/mean_terminated_length": 87.3515625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.4397370535880327, "epoch": 0.24, "frac_reward_zero_std": 0.8125, "grad_norm": 0.5375248789787292, "learning_rate": 9.843955815812322e-06, "loss": -0.0089, "num_tokens": 10978258.0, "reward": 3.672656297683716, "reward_std": 0.6919003129005432, "rewards/evaluation_direction_reward/mean": 0.830078125, "rewards/evaluation_direction_reward/std": 0.2508520781993866, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/move_legality_reward/mean": 0.375, "rewards/move_legality_reward/std": 0.4860251843929291, "rewards/pv_length_reward/mean": 0.35625001788139343, "rewards/pv_length_reward/std": 0.1581692099571228, "rewards/pv_quality_reward/mean": 0.111328125, "rewards/pv_quality_reward/std": 0.2564302980899811, "rewards/verbosity_reward/mean": 1.0, "rewards/verbosity_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.3057777881622314, "sampling/importance_sampling_ratio/mean": 0.9821693897247314, "sampling/importance_sampling_ratio/min": 0.26325365900993347, "sampling/sampling_logp_difference/max": 0.691962718963623, "sampling/sampling_logp_difference/mean": 0.021255753934383392, "step": 300, "step_time": 17.01894522458315 } ], "logging_steps": 1.0, "max_steps": 3750, "num_input_tokens_seen": 10978258, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }