{ "best_global_step": 750, "best_metric": 1.6413334274291993, "best_model_checkpoint": "/scratch/checkpoints/exp8_grpo_fast/checkpoint-750", "epoch": 0.273224043715847, "eval_steps": 150, "global_step": 750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13333333432674407, "completions/max_length": 1388.2, "completions/max_terminated_length": 1149.4, "completions/mean_length": 961.8666870117188, "completions/mean_terminated_length": 813.0800170898438, "completions/min_length": 593.6, "completions/min_terminated_length": 593.6, "entropy": 0.15543196325500805, "epoch": 0.0018214936247723133, "frac_reward_zero_std": 0.4, "grad_norm": 0.04320038482546806, "kl": 0.0001638038011151366, "learning_rate": 1.45985401459854e-08, "loss": -0.002662058547139168, "num_tokens": 30920.0, "reward": 2.02333345413208, "reward_std": 0.5894708633422852, "rewards/boxed_rate/mean": 0.8666666626930237, "rewards/boxed_rate/std": 0.19119417667388916, "rewards/correctness/mean": 0.46666666567325593, "rewards/correctness/std": 0.2665788769721985, "rewards/multi_component_reward/mean": 0.5233333643525839, "rewards/multi_component_reward/std": 0.4098228693008423, "rewards/no_answer_rate/mean": 0.13333333432674407, "rewards/no_answer_rate/std": 0.19119417667388916, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.12532416619360448, "sampling/importance_sampling_ratio/mean": 0.04354772977530956, "sampling/importance_sampling_ratio/min": 0.006865633289925732, "sampling/sampling_logp_difference/max": 0.305552613735199, "sampling/sampling_logp_difference/mean": 0.010744557529687882, "step": 5, "step_time": 32.99731477890164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1657.0, "completions/max_terminated_length": 1145.8, "completions/mean_length": 919.3000366210938, "completions/mean_terminated_length": 807.0200317382812, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "entropy": 0.21022272085150082, "epoch": 0.0036429872495446266, "frac_reward_zero_std": 0.4, "grad_norm": 0.014283710159361362, "kl": 0.00032655518104244646, "learning_rate": 3.2846715328467156e-08, "loss": -0.0015700791031122209, "num_tokens": 61163.0, "reward": 1.180000114440918, "reward_std": 0.4240097999572754, "rewards/boxed_rate/mean": 0.899999988079071, "rewards/boxed_rate/std": 0.24494898319244385, "rewards/correctness/mean": 0.06666666865348816, "rewards/correctness/std": 0.1632993221282959, "rewards/multi_component_reward/mean": 0.11333334147930145, "rewards/multi_component_reward/std": 0.27013830840587616, "rewards/no_answer_rate/mean": 0.10000000298023223, "rewards/no_answer_rate/std": 0.24494898319244385, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.06362589932978154, "sampling/importance_sampling_ratio/mean": 0.01664090696722269, "sampling/importance_sampling_ratio/min": 0.0005761486502072345, "sampling/sampling_logp_difference/max": 0.49286861419677735, "sampling/sampling_logp_difference/mean": 0.01417195163667202, "step": 10, "step_time": 37.58825172036886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20000000298023224, "completions/max_length": 1657.4, "completions/max_terminated_length": 899.0, "completions/mean_length": 897.2000244140625, "completions/mean_terminated_length": 615.4466735839844, "completions/min_length": 296.4, "completions/min_terminated_length": 296.4, "entropy": 0.20501985649267832, "epoch": 0.00546448087431694, "frac_reward_zero_std": 0.2, "grad_norm": 0.0192733071744442, "kl": 0.0003123952769480335, "learning_rate": 5.10948905109489e-08, "loss": -0.004246947914361953, "num_tokens": 93923.0, "reward": 1.8200001120567322, "reward_std": 0.5815625011920929, "rewards/boxed_rate/mean": 0.8, "rewards/boxed_rate/std": 0.29447373151779177, "rewards/correctness/mean": 0.4000000059604645, "rewards/correctness/std": 0.20655910968780516, "rewards/multi_component_reward/mean": 0.42000001221895217, "rewards/multi_component_reward/std": 0.3785122692584991, "rewards/no_answer_rate/mean": 0.20000000298023224, "rewards/no_answer_rate/std": 0.29447373151779177, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.18622441291809083, "sampling/importance_sampling_ratio/mean": 0.04659921806305647, "sampling/importance_sampling_ratio/min": 0.0010122977004923522, "sampling/sampling_logp_difference/max": 0.39253207445144656, "sampling/sampling_logp_difference/mean": 0.01371441949158907, "step": 15, "step_time": 38.31680890209973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1666666716337204, "completions/max_length": 2023.8, "completions/max_terminated_length": 1348.6, "completions/mean_length": 1195.0000610351562, "completions/mean_terminated_length": 1013.5533630371094, "completions/min_length": 736.4, "completions/min_terminated_length": 736.4, "entropy": 0.15656836417814096, "epoch": 0.007285974499089253, "frac_reward_zero_std": 0.0, "grad_norm": 0.02143455669283867, "kl": 0.00023284607256452244, "learning_rate": 6.934306569343065e-08, "loss": -0.006335017085075378, "num_tokens": 131981.0, "reward": 1.3033334732055664, "reward_std": 0.7852452993392944, "rewards/boxed_rate/mean": 0.8333333253860473, "rewards/boxed_rate/std": 0.34822853803634646, "rewards/correctness/mean": 0.13333333730697633, "rewards/correctness/std": 0.3265986442565918, "rewards/multi_component_reward/mean": 0.1366666793823242, "rewards/multi_component_reward/std": 0.5012870132923126, "rewards/no_answer_rate/mean": 0.1666666716337204, "rewards/no_answer_rate/std": 0.34822853803634646, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.05830998048186302, "sampling/importance_sampling_ratio/mean": 0.016274437960237264, "sampling/importance_sampling_ratio/min": 0.00017535050738298644, "sampling/sampling_logp_difference/max": 0.3216116189956665, "sampling/sampling_logp_difference/mean": 0.011110224667936564, "step": 20, "step_time": 45.75825558342039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13333333730697633, "completions/max_length": 1375.4, "completions/max_terminated_length": 1177.4, "completions/mean_length": 1042.1333740234375, "completions/mean_terminated_length": 911.9000244140625, "completions/min_length": 680.6, "completions/min_terminated_length": 680.6, "entropy": 0.1741524614393711, "epoch": 0.009107468123861567, "frac_reward_zero_std": 0.2, "grad_norm": 0.01335141435265541, "kl": 0.0002629145164974034, "learning_rate": 8.759124087591241e-08, "loss": 0.0026412704959511758, "num_tokens": 166221.0, "reward": 1.6033333897590638, "reward_std": 0.5469928443431854, "rewards/boxed_rate/mean": 0.8666666746139526, "rewards/boxed_rate/std": 0.20655910968780516, "rewards/correctness/mean": 0.26666667461395266, "rewards/correctness/std": 0.20655910968780516, "rewards/multi_component_reward/mean": 0.3033333241939545, "rewards/multi_component_reward/std": 0.36893237233161924, "rewards/no_answer_rate/mean": 0.13333333730697633, "rewards/no_answer_rate/std": 0.20655910968780516, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.04747706279158592, "sampling/importance_sampling_ratio/mean": 0.016832177247852086, "sampling/importance_sampling_ratio/min": 0.00119685190729814, "sampling/sampling_logp_difference/max": 0.3412928104400635, "sampling/sampling_logp_difference/mean": 0.011194901075214148, "step": 25, "step_time": 33.345185896754266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2666666716337204, "completions/max_length": 1851.2, "completions/max_terminated_length": 1397.6, "completions/mean_length": 1248.2000366210937, "completions/mean_terminated_length": 1000.1933471679688, "completions/min_length": 643.4, "completions/min_terminated_length": 643.4, "entropy": 0.1906235640247663, "epoch": 0.01092896174863388, "frac_reward_zero_std": 0.2, "grad_norm": 0.02501230686903, "kl": 0.0002681747629443028, "learning_rate": 1.0583941605839415e-07, "loss": -0.0029103375971317293, "num_tokens": 207351.0, "reward": 1.3366667747497558, "reward_std": 0.7248244881629944, "rewards/boxed_rate/mean": 0.7333333313465118, "rewards/boxed_rate/std": 0.29447373151779177, "rewards/correctness/mean": 0.1666666716337204, "rewards/correctness/std": 0.2882087707519531, "rewards/multi_component_reward/mean": 0.10333333685994148, "rewards/multi_component_reward/std": 0.48569379448890687, "rewards/no_answer_rate/mean": 0.2666666716337204, "rewards/no_answer_rate/std": 0.29447373151779177, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.10327955484390258, "sampling/importance_sampling_ratio/max": 0.05718064345419407, "sampling/importance_sampling_ratio/mean": 0.01394880290608853, "sampling/importance_sampling_ratio/min": 3.71510857371279e-06, "sampling/sampling_logp_difference/max": 0.39807581901550293, "sampling/sampling_logp_difference/mean": 0.011931476183235645, "step": 30, "step_time": 42.76963838078082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06666666865348816, "completions/max_length": 1265.8, "completions/max_terminated_length": 1010.8, "completions/mean_length": 801.2000183105469, "completions/mean_terminated_length": 710.2333435058594, "completions/min_length": 515.6, "completions/min_terminated_length": 515.6, "entropy": 0.18569878712296486, "epoch": 0.012750455373406194, "frac_reward_zero_std": 0.4, "grad_norm": 0.013473724946379662, "kl": 0.00025912571412239536, "learning_rate": 1.240875912408759e-07, "loss": -0.007362464070320129, "num_tokens": 234243.0, "reward": 1.5500000715255737, "reward_std": 0.6336740732192994, "rewards/boxed_rate/mean": 0.9333333373069763, "rewards/boxed_rate/std": 0.10327955484390258, "rewards/correctness/mean": 0.23333334028720856, "rewards/correctness/std": 0.2882087707519531, "rewards/multi_component_reward/mean": 0.3166666716337204, "rewards/multi_component_reward/std": 0.35141916275024415, "rewards/no_answer_rate/mean": 0.06666666865348816, "rewards/no_answer_rate/std": 0.10327955484390258, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.10159578723832965, "sampling/importance_sampling_ratio/mean": 0.03656362611800432, "sampling/importance_sampling_ratio/min": 0.002998881617435645, "sampling/sampling_logp_difference/max": 0.3329684495925903, "sampling/sampling_logp_difference/mean": 0.011683504190295934, "step": 35, "step_time": 29.749523213878273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23333334028720856, "completions/max_length": 1788.4, "completions/max_terminated_length": 1253.4, "completions/mean_length": 1151.96669921875, "completions/mean_terminated_length": 852.2566772460938, "completions/min_length": 632.2, "completions/min_terminated_length": 632.2, "entropy": 0.19733075598875682, "epoch": 0.014571948998178506, "frac_reward_zero_std": 0.0, "grad_norm": 0.017843080684542656, "kl": 0.00030036380388385927, "learning_rate": 1.4233576642335764e-07, "loss": 0.0052158843725919725, "num_tokens": 271334.0, "reward": 1.3800000905990601, "reward_std": 0.8547543048858642, "rewards/boxed_rate/mean": 0.7666666686534882, "rewards/boxed_rate/std": 0.2128240704536438, "rewards/correctness/mean": 0.20000000298023224, "rewards/correctness/std": 0.35449349880218506, "rewards/multi_component_reward/mean": 0.1800000086426735, "rewards/multi_component_reward/std": 0.5035018444061279, "rewards/no_answer_rate/mean": 0.2333333373069763, "rewards/no_answer_rate/std": 0.2128240704536438, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.05587816089391708, "sampling/importance_sampling_ratio/mean": 0.020892591564916074, "sampling/importance_sampling_ratio/min": 2.3939082747448788e-05, "sampling/sampling_logp_difference/max": 0.42613508701324465, "sampling/sampling_logp_difference/mean": 0.012935483455657959, "step": 40, "step_time": 40.51654889807105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06666666865348816, "completions/max_length": 1250.0, "completions/max_terminated_length": 700.2, "completions/mean_length": 596.7666748046875, "completions/mean_terminated_length": 492.7933410644531, "completions/min_length": 341.8, "completions/min_terminated_length": 341.8, "entropy": 0.22809130648771922, "epoch": 0.01639344262295082, "frac_reward_zero_std": 0.4, "grad_norm": 0.06362088024616241, "kl": 0.00031622202028908455, "learning_rate": 1.6058394160583942e-07, "loss": -0.0004649309907108545, "num_tokens": 293263.0, "reward": 1.6900001049041748, "reward_std": 0.6551559090614318, "rewards/boxed_rate/mean": 0.9333333253860474, "rewards/boxed_rate/std": 0.1632993221282959, "rewards/correctness/mean": 0.3000000089406967, "rewards/correctness/std": 0.2882087707519531, "rewards/multi_component_reward/mean": 0.3900000035762787, "rewards/multi_component_reward/std": 0.37366979122161864, "rewards/no_answer_rate/mean": 0.06666666865348816, "rewards/no_answer_rate/std": 0.1632993221282959, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.06921428106725216, "sampling/importance_sampling_ratio/mean": 0.029317377135157585, "sampling/importance_sampling_ratio/min": 0.00602934339824146, "sampling/sampling_logp_difference/max": 0.35652687549591067, "sampling/sampling_logp_difference/mean": 0.015356982313096523, "step": 45, "step_time": 28.76009795963764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1886.2, "completions/max_terminated_length": 1599.0, "completions/mean_length": 1163.9333862304688, "completions/mean_terminated_length": 1057.6466918945312, "completions/min_length": 663.8, "completions/min_terminated_length": 663.8, "entropy": 0.17236283471186956, "epoch": 0.018214936247723135, "frac_reward_zero_std": 0.0, "grad_norm": 0.01997266709804535, "kl": 0.0002550489671799975, "learning_rate": 1.7883211678832117e-07, "loss": -0.005824955180287361, "num_tokens": 331739.0, "reward": 1.8566666960716247, "reward_std": 0.7818804442882538, "rewards/boxed_rate/mean": 0.9, "rewards/boxed_rate/std": 0.18492921590805053, "rewards/correctness/mean": 0.36666667461395264, "rewards/correctness/std": 0.3161036252975464, "rewards/multi_component_reward/mean": 0.4233333319425583, "rewards/multi_component_reward/std": 0.4823453426361084, "rewards/no_answer_rate/mean": 0.10000000298023223, "rewards/no_answer_rate/std": 0.18492921590805053, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.10327955484390258, "sampling/importance_sampling_ratio/max": 0.054672015644609925, "sampling/importance_sampling_ratio/mean": 0.01720548253506422, "sampling/importance_sampling_ratio/min": 0.00035658250299805546, "sampling/sampling_logp_difference/max": 0.3509830951690674, "sampling/sampling_logp_difference/mean": 0.011117835808545352, "step": 50, "step_time": 43.21503445059061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3333333432674408, "completions/max_length": 1744.8, "completions/max_terminated_length": 1236.2, "completions/mean_length": 1297.1666931152345, "completions/mean_terminated_length": 1065.0066833496094, "completions/min_length": 936.6, "completions/min_terminated_length": 936.6, "entropy": 0.12224519066512585, "epoch": 0.020036429872495445, "frac_reward_zero_std": 0.2, "grad_norm": 0.021082526072859764, "kl": 0.00019305126891898302, "learning_rate": 1.9708029197080292e-07, "loss": 0.0025430308654904366, "num_tokens": 373144.0, "reward": 1.4600000858306885, "reward_std": 0.7388215482234954, "rewards/boxed_rate/mean": 0.6666666597127915, "rewards/boxed_rate/std": 0.35449349880218506, "rewards/correctness/mean": 0.23333334028720856, "rewards/correctness/std": 0.2882087707519531, "rewards/multi_component_reward/mean": 0.12666668593883515, "rewards/multi_component_reward/std": 0.506528252363205, "rewards/no_answer_rate/mean": 0.33333333134651183, "rewards/no_answer_rate/std": 0.35449349880218506, "rewards/repetition_rate/mean": 0.10000000298023223, "rewards/repetition_rate/std": 0.18492921590805053, "sampling/importance_sampling_ratio/max": 0.10442680462729186, "sampling/importance_sampling_ratio/mean": 0.04177726461493876, "sampling/importance_sampling_ratio/min": 0.011532440476071316, "sampling/sampling_logp_difference/max": 0.32466731071472166, "sampling/sampling_logp_difference/mean": 0.007843528036028146, "step": 55, "step_time": 40.733187234401704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13333333432674407, "completions/max_length": 1700.0, "completions/max_terminated_length": 1413.4, "completions/mean_length": 923.9000366210937, "completions/mean_terminated_length": 779.9666870117187, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "entropy": 0.18838655066986879, "epoch": 0.02185792349726776, "frac_reward_zero_std": 0.4, "grad_norm": 0.016334548592567444, "kl": 0.0002971158295016115, "learning_rate": 2.1532846715328465e-07, "loss": 0.0031619951128959655, "num_tokens": 403573.0, "reward": 1.393333411216736, "reward_std": 0.45263335704803465, "rewards/boxed_rate/mean": 0.8666666626930237, "rewards/boxed_rate/std": 0.19119417667388916, "rewards/correctness/mean": 0.1666666716337204, "rewards/correctness/std": 0.18492921590805053, "rewards/multi_component_reward/mean": 0.19333333075046538, "rewards/multi_component_reward/std": 0.30624626874923705, "rewards/no_answer_rate/mean": 0.13333333432674407, "rewards/no_answer_rate/std": 0.19119417667388916, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.14395143389701842, "sampling/importance_sampling_ratio/mean": 0.04990396222565323, "sampling/importance_sampling_ratio/min": 0.00032090346864226403, "sampling/sampling_logp_difference/max": 0.40355303287506106, "sampling/sampling_logp_difference/mean": 0.012135114334523679, "step": 60, "step_time": 38.38553868718445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666865348817, "completions/max_length": 1554.4, "completions/max_terminated_length": 1147.6, "completions/mean_length": 972.8000244140625, "completions/mean_terminated_length": 804.5466735839843, "completions/min_length": 498.2, "completions/min_terminated_length": 498.2, "entropy": 0.1604183206955592, "epoch": 0.023679417122040074, "frac_reward_zero_std": 0.2, "grad_norm": 0.00623254245147109, "kl": 0.00027490937888311845, "learning_rate": 2.335766423357664e-07, "loss": -0.005589158460497856, "num_tokens": 436225.0, "reward": 1.490000057220459, "reward_std": 0.6202400684356689, "rewards/boxed_rate/mean": 0.8333333253860473, "rewards/boxed_rate/std": 0.2728438377380371, "rewards/correctness/mean": 0.20000000596046447, "rewards/correctness/std": 0.2665788769721985, "rewards/multi_component_reward/mean": 0.19000000208616258, "rewards/multi_component_reward/std": 0.4335045665502548, "rewards/no_answer_rate/mean": 0.16666666865348817, "rewards/no_answer_rate/std": 0.2728438377380371, "rewards/repetition_rate/mean": 0.1, "rewards/repetition_rate/std": 0.10954451560974121, "sampling/importance_sampling_ratio/max": 0.0990872667171061, "sampling/importance_sampling_ratio/mean": 0.04436926119960845, "sampling/importance_sampling_ratio/min": 0.004196830893285148, "sampling/sampling_logp_difference/max": 0.297141432762146, "sampling/sampling_logp_difference/mean": 0.010508796572685242, "step": 65, "step_time": 36.07466428950429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1634.0, "completions/max_terminated_length": 994.8, "completions/mean_length": 744.2666870117188, "completions/mean_terminated_length": 601.7000122070312, "completions/min_length": 374.2, "completions/min_terminated_length": 374.2, "entropy": 0.2318819284439087, "epoch": 0.025500910746812388, "frac_reward_zero_std": 0.0, "grad_norm": 0.05441533029079437, "kl": 0.0003520799073157832, "learning_rate": 2.518248175182482e-07, "loss": -0.003310435265302658, "num_tokens": 460845.0, "reward": 1.6933334827423097, "reward_std": 0.7755845502018929, "rewards/boxed_rate/mean": 0.899999988079071, "rewards/boxed_rate/std": 0.24494898319244385, "rewards/correctness/mean": 0.3, "rewards/correctness/std": 0.34822853803634646, "rewards/multi_component_reward/mean": 0.36000003293156624, "rewards/multi_component_reward/std": 0.5008985161781311, "rewards/no_answer_rate/mean": 0.10000000298023223, "rewards/no_answer_rate/std": 0.24494898319244385, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.056702688336372375, "sampling/importance_sampling_ratio/mean": 0.019475685060024263, "sampling/importance_sampling_ratio/min": 0.001143362923176032, "sampling/sampling_logp_difference/max": 0.3391279220581055, "sampling/sampling_logp_difference/mean": 0.014528140239417554, "step": 70, "step_time": 36.275424292869864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20000000298023224, "completions/max_length": 1515.2, "completions/max_terminated_length": 845.0, "completions/mean_length": 917.0000244140625, "completions/mean_terminated_length": 678.5633483886719, "completions/min_length": 475.4, "completions/min_terminated_length": 475.4, "entropy": 0.16173700019717216, "epoch": 0.0273224043715847, "frac_reward_zero_std": 0.0, "grad_norm": 0.018244028091430664, "kl": 0.00024622106284368785, "learning_rate": 2.700729927007299e-07, "loss": -0.014353486895561218, "num_tokens": 491205.0, "reward": 1.633333432674408, "reward_std": 0.8448223888874054, "rewards/boxed_rate/mean": 0.8, "rewards/boxed_rate/std": 0.29447373151779177, "rewards/correctness/mean": 0.3, "rewards/correctness/std": 0.34822853803634646, "rewards/multi_component_reward/mean": 0.3000000298023224, "rewards/multi_component_reward/std": 0.5207933127880097, "rewards/no_answer_rate/mean": 0.20000000298023224, "rewards/no_answer_rate/std": 0.29447373151779177, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.16285581663250923, "sampling/importance_sampling_ratio/mean": 0.05478177797049284, "sampling/importance_sampling_ratio/min": 0.0013068500558716862, "sampling/sampling_logp_difference/max": 0.3721138000488281, "sampling/sampling_logp_difference/mean": 0.011166188679635525, "step": 75, "step_time": 35.12463851571083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03333333432674408, "completions/max_length": 1582.6, "completions/max_terminated_length": 1313.0, "completions/mean_length": 985.9666870117187, "completions/mean_terminated_length": 935.2533569335938, "completions/min_length": 670.8, "completions/min_terminated_length": 670.8, "entropy": 0.17678433768451213, "epoch": 0.029143897996357013, "frac_reward_zero_std": 0.0, "grad_norm": 0.009146427735686302, "kl": 0.0002533862963900901, "learning_rate": 2.8832116788321166e-07, "loss": 0.002100883610546589, "num_tokens": 524180.0, "reward": 1.6633334636688233, "reward_std": 0.8858610212802887, "rewards/boxed_rate/mean": 0.9666666626930237, "rewards/boxed_rate/std": 0.08164966106414795, "rewards/correctness/mean": 0.26666666865348815, "rewards/correctness/std": 0.3823883533477783, "rewards/multi_component_reward/mean": 0.36333334222435953, "rewards/multi_component_reward/std": 0.473619781434536, "rewards/no_answer_rate/mean": 0.03333333432674408, "rewards/no_answer_rate/std": 0.08164966106414795, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.044011454284191134, "sampling/importance_sampling_ratio/mean": 0.013894069381058216, "sampling/importance_sampling_ratio/min": 0.001124254113574473, "sampling/sampling_logp_difference/max": 0.32026147842407227, "sampling/sampling_logp_difference/mean": 0.01106615299358964, "step": 80, "step_time": 36.22448882814497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1693.6, "completions/max_terminated_length": 1402.8, "completions/mean_length": 1091.4666870117187, "completions/mean_terminated_length": 998.7600219726562, "completions/min_length": 584.2, "completions/min_terminated_length": 584.2, "entropy": 0.1462914695342382, "epoch": 0.030965391621129327, "frac_reward_zero_std": 0.0, "grad_norm": 0.018090059980750084, "kl": 0.00022486963668294873, "learning_rate": 3.065693430656934e-07, "loss": -0.00974438264966011, "num_tokens": 560254.0, "reward": 1.8100001335144043, "reward_std": 1.0823543906211852, "rewards/boxed_rate/mean": 0.899999988079071, "rewards/boxed_rate/std": 0.24494898319244385, "rewards/correctness/mean": 0.36666667461395264, "rewards/correctness/std": 0.4794029474258423, "rewards/multi_component_reward/mean": 0.44333334267139435, "rewards/multi_component_reward/std": 0.6124810755252839, "rewards/no_answer_rate/mean": 0.10000000298023223, "rewards/no_answer_rate/std": 0.24494898319244385, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.1276155561208725, "sampling/importance_sampling_ratio/mean": 0.03584536015987396, "sampling/importance_sampling_ratio/min": 0.002319600686418858, "sampling/sampling_logp_difference/max": 0.41160542964935304, "sampling/sampling_logp_difference/mean": 0.010097185987979174, "step": 85, "step_time": 39.475563449971375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3333333432674408, "completions/max_length": 1841.2, "completions/max_terminated_length": 875.0, "completions/mean_length": 1059.2000122070312, "completions/mean_terminated_length": 630.0533386230469, "completions/min_length": 458.8, "completions/min_terminated_length": 458.8, "entropy": 0.2378736046453317, "epoch": 0.03278688524590164, "frac_reward_zero_std": 0.0, "grad_norm": 0.00910874642431736, "kl": 0.0003503820747331095, "learning_rate": 3.2481751824817516e-07, "loss": 0.0034383241087198257, "num_tokens": 593668.0, "reward": 1.4166667222976685, "reward_std": 0.8338384032249451, "rewards/boxed_rate/mean": 0.6333333253860474, "rewards/boxed_rate/std": 0.451508092880249, "rewards/correctness/mean": 0.23333334028720856, "rewards/correctness/std": 0.2882087707519531, "rewards/multi_component_reward/mean": 0.11666668206453323, "rewards/multi_component_reward/std": 0.566602936387062, "rewards/no_answer_rate/mean": 0.36666667759418486, "rewards/no_answer_rate/std": 0.451508092880249, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.10327955484390258, "sampling/importance_sampling_ratio/max": 0.05201358497142792, "sampling/importance_sampling_ratio/mean": 0.017090958543121815, "sampling/importance_sampling_ratio/min": 0.0002041866974779629, "sampling/sampling_logp_difference/max": 0.3399634838104248, "sampling/sampling_logp_difference/mean": 0.015104132890701293, "step": 90, "step_time": 41.00809515919536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06666666865348816, "completions/max_length": 1441.6, "completions/max_terminated_length": 960.8, "completions/mean_length": 802.6000366210938, "completions/mean_terminated_length": 711.3800231933594, "completions/min_length": 525.6, "completions/min_terminated_length": 525.6, "entropy": 0.2225553606947263, "epoch": 0.03460837887067395, "frac_reward_zero_std": 0.4, "grad_norm": 0.01798272132873535, "kl": 0.00032260040970868433, "learning_rate": 3.4306569343065697e-07, "loss": 0.007113450020551681, "num_tokens": 621316.0, "reward": 1.223333477973938, "reward_std": 0.36713925525546076, "rewards/boxed_rate/mean": 0.9333333253860474, "rewards/boxed_rate/std": 0.1632993221282959, "rewards/correctness/mean": 0.06666666865348816, "rewards/correctness/std": 0.1632993221282959, "rewards/multi_component_reward/mean": 0.12333333715796471, "rewards/multi_component_reward/std": 0.2738735854625702, "rewards/no_answer_rate/mean": 0.06666666865348816, "rewards/no_answer_rate/std": 0.1632993221282959, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.15052475407719612, "sampling/importance_sampling_ratio/mean": 0.046577111538499597, "sampling/importance_sampling_ratio/min": 0.012816431716431823, "sampling/sampling_logp_difference/max": 0.34408046007156373, "sampling/sampling_logp_difference/mean": 0.014901423826813698, "step": 95, "step_time": 32.91087112892419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1537.8, "completions/max_terminated_length": 1380.2, "completions/mean_length": 972.9000366210937, "completions/mean_terminated_length": 902.8400146484375, "completions/min_length": 618.0, "completions/min_terminated_length": 618.0, "entropy": 0.2104227361579736, "epoch": 0.03642987249544627, "frac_reward_zero_std": 0.0, "grad_norm": 0.006778502371162176, "kl": 0.00031352789907638606, "learning_rate": 3.6131386861313867e-07, "loss": -0.0013602237217128278, "num_tokens": 653443.0, "reward": 1.880000078678131, "reward_std": 0.9815195143222809, "rewards/boxed_rate/mean": 0.9, "rewards/boxed_rate/std": 0.18492921590805053, "rewards/correctness/mean": 0.4000000059604645, "rewards/correctness/std": 0.4256481409072876, "rewards/multi_component_reward/mean": 0.48000000715255736, "rewards/multi_component_reward/std": 0.5586783826351166, "rewards/no_answer_rate/mean": 0.10000000298023223, "rewards/no_answer_rate/std": 0.18492921590805053, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.0448111507575959, "sampling/importance_sampling_ratio/mean": 0.01701541549991816, "sampling/importance_sampling_ratio/min": 0.0003750977984182488, "sampling/sampling_logp_difference/max": 0.3470792770385742, "sampling/sampling_logp_difference/mean": 0.013565080799162388, "step": 100, "step_time": 35.31437961217016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1666666716337204, "completions/max_length": 1421.4, "completions/max_terminated_length": 1072.0, "completions/mean_length": 947.7000366210938, "completions/mean_terminated_length": 787.706689453125, "completions/min_length": 551.6, "completions/min_terminated_length": 551.6, "entropy": 0.2239283885806799, "epoch": 0.03825136612021858, "frac_reward_zero_std": 0.0, "grad_norm": 0.011714774183928967, "kl": 0.0003353157648234628, "learning_rate": 3.795620437956204e-07, "loss": 0.009437119215726852, "num_tokens": 683674.0, "reward": 1.4433334589004516, "reward_std": 0.8665581226348877, "rewards/boxed_rate/mean": 0.8333333313465119, "rewards/boxed_rate/std": 0.18492921590805053, "rewards/correctness/mean": 0.20000000596046447, "rewards/correctness/std": 0.36985843181610106, "rewards/multi_component_reward/mean": 0.21000000834465027, "rewards/multi_component_reward/std": 0.5039317846298218, "rewards/no_answer_rate/mean": 0.1666666716337204, "rewards/no_answer_rate/std": 0.18492921590805053, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.1078285550349392, "sampling/importance_sampling_ratio/mean": 0.03690805228543468, "sampling/importance_sampling_ratio/min": 0.0026856323238462275, "sampling/sampling_logp_difference/max": 0.41998746395111086, "sampling/sampling_logp_difference/mean": 0.014628523960709572, "step": 105, "step_time": 32.93989271316677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1471.4, "completions/max_terminated_length": 979.4, "completions/mean_length": 852.2666748046875, "completions/mean_terminated_length": 721.6166748046875, "completions/min_length": 488.4, "completions/min_terminated_length": 488.4, "entropy": 0.14421232057114441, "epoch": 0.04007285974499089, "frac_reward_zero_std": 0.4, "grad_norm": 0.07266412675380707, "kl": 0.00022867745647090488, "learning_rate": 3.978102189781022e-07, "loss": 0.0018386229872703551, "num_tokens": 713286.0, "reward": 1.6733334064483643, "reward_std": 0.7425484180450439, "rewards/boxed_rate/mean": 0.8666666626930237, "rewards/boxed_rate/std": 0.2665788769721985, "rewards/correctness/mean": 0.3000000059604645, "rewards/correctness/std": 0.3161036252975464, "rewards/multi_component_reward/mean": 0.3400000125169754, "rewards/multi_component_reward/std": 0.47312636375427247, "rewards/no_answer_rate/mean": 0.13333333730697633, "rewards/no_answer_rate/std": 0.2665788769721985, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.13587955459952356, "sampling/importance_sampling_ratio/mean": 0.050121305510401726, "sampling/importance_sampling_ratio/min": 0.0025831204319047172, "sampling/sampling_logp_difference/max": 0.345587694644928, "sampling/sampling_logp_difference/mean": 0.00959718506783247, "step": 110, "step_time": 33.89908826816827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2333333373069763, "completions/max_length": 1749.4, "completions/max_terminated_length": 884.2, "completions/mean_length": 869.533349609375, "completions/mean_terminated_length": 539.5833374023438, "completions/min_length": 307.8, "completions/min_terminated_length": 307.8, "entropy": 0.2536199669043223, "epoch": 0.04189435336976321, "frac_reward_zero_std": 0.0, "grad_norm": 0.011132585816085339, "kl": 0.0003656693840942656, "learning_rate": 4.160583941605839e-07, "loss": 0.0021231153979897497, "num_tokens": 741952.0, "reward": 1.7500001192092896, "reward_std": 1.106396782398224, "rewards/boxed_rate/mean": 0.799999988079071, "rewards/boxed_rate/std": 0.35449349880218506, "rewards/correctness/mean": 0.36666667461395264, "rewards/correctness/std": 0.4794029474258423, "rewards/multi_component_reward/mean": 0.383333346247673, "rewards/multi_component_reward/std": 0.6433047533035279, "rewards/no_answer_rate/mean": 0.20000000298023224, "rewards/no_answer_rate/std": 0.35449349880218506, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.25683521628379824, "sampling/importance_sampling_ratio/mean": 0.065315605327487, "sampling/importance_sampling_ratio/min": 0.00023330498968689221, "sampling/sampling_logp_difference/max": 0.35823286771774293, "sampling/sampling_logp_difference/mean": 0.01769500244408846, "step": 115, "step_time": 39.532602636888626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06666666865348816, "completions/max_length": 1573.0, "completions/max_terminated_length": 1324.8, "completions/mean_length": 1093.2000366210937, "completions/mean_terminated_length": 1037.0133666992188, "completions/min_length": 766.8, "completions/min_terminated_length": 766.8, "entropy": 0.15541148359576862, "epoch": 0.04371584699453552, "frac_reward_zero_std": 0.4, "grad_norm": 0.01879267208278179, "kl": 0.00025834220529456314, "learning_rate": 4.343065693430657e-07, "loss": 0.0006856221705675125, "num_tokens": 777766.0, "reward": 1.433333396911621, "reward_std": 0.4934782743453979, "rewards/boxed_rate/mean": 0.9333333253860474, "rewards/boxed_rate/std": 0.1632993221282959, "rewards/correctness/mean": 0.16666666865348817, "rewards/correctness/std": 0.2128240704536438, "rewards/multi_component_reward/mean": 0.23333334028720856, "rewards/multi_component_reward/std": 0.32306827008724215, "rewards/no_answer_rate/mean": 0.06666666865348816, "rewards/no_answer_rate/std": 0.1632993221282959, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.027404608484357596, "sampling/importance_sampling_ratio/mean": 0.012392069399356841, "sampling/importance_sampling_ratio/min": 0.0004944111146869545, "sampling/sampling_logp_difference/max": 0.37873687744140627, "sampling/sampling_logp_difference/mean": 0.010291843488812447, "step": 120, "step_time": 36.481429217197004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1672.2, "completions/max_terminated_length": 1231.2, "completions/mean_length": 1009.0000122070312, "completions/mean_terminated_length": 875.0133422851562, "completions/min_length": 573.0, "completions/min_terminated_length": 573.0, "entropy": 0.16983376170198122, "epoch": 0.04553734061930783, "frac_reward_zero_std": 0.2, "grad_norm": 0.026680519804358482, "kl": 0.00028378382169951997, "learning_rate": 4.5255474452554743e-07, "loss": 0.002333539165556431, "num_tokens": 811096.0, "reward": 1.3900000691413879, "reward_std": 0.4993088662624359, "rewards/boxed_rate/mean": 0.9, "rewards/boxed_rate/std": 0.18492921590805053, "rewards/correctness/mean": 0.1666666716337204, "rewards/correctness/std": 0.18492921590805053, "rewards/multi_component_reward/mean": 0.22333332896232605, "rewards/multi_component_reward/std": 0.31437968015670775, "rewards/no_answer_rate/mean": 0.10000000298023223, "rewards/no_answer_rate/std": 0.18492921590805053, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.1588123269379139, "sampling/importance_sampling_ratio/mean": 0.04118972420692444, "sampling/importance_sampling_ratio/min": 0.002068098068754896, "sampling/sampling_logp_difference/max": 0.42397408485412597, "sampling/sampling_logp_difference/mean": 0.01178776090964675, "step": 125, "step_time": 38.221346308663485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20000000298023224, "completions/max_length": 1774.8, "completions/max_terminated_length": 1017.6, "completions/mean_length": 970.7666870117188, "completions/mean_terminated_length": 722.2733520507812, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "entropy": 0.20842353130380312, "epoch": 0.04735883424408015, "frac_reward_zero_std": 0.0, "grad_norm": 0.017246264964342117, "kl": 0.00032587918249191714, "learning_rate": 4.7080291970802913e-07, "loss": -0.004489587619900703, "num_tokens": 843681.0, "reward": 1.3233334302902222, "reward_std": 0.6886623933911323, "rewards/boxed_rate/mean": 0.8333333253860473, "rewards/boxed_rate/std": 0.2728438377380371, "rewards/correctness/mean": 0.13333333730697633, "rewards/correctness/std": 0.2665788769721985, "rewards/multi_component_reward/mean": 0.12333333268761634, "rewards/multi_component_reward/std": 0.4415378957986832, "rewards/no_answer_rate/mean": 0.16666666865348817, "rewards/no_answer_rate/std": 0.2728438377380371, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.1632993221282959, "sampling/importance_sampling_ratio/max": 0.057003194093704225, "sampling/importance_sampling_ratio/mean": 0.015649724379181863, "sampling/importance_sampling_ratio/min": 0.0008366983823620787, "sampling/sampling_logp_difference/max": 0.42825262546539306, "sampling/sampling_logp_difference/mean": 0.013275405205786229, "step": 130, "step_time": 40.03070425353944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13333333730697633, "completions/max_length": 1638.0, "completions/max_terminated_length": 1082.6, "completions/mean_length": 965.3000427246094, "completions/mean_terminated_length": 805.8833435058593, "completions/min_length": 475.4, "completions/min_terminated_length": 475.4, "entropy": 0.20910785247882208, "epoch": 0.04918032786885246, "frac_reward_zero_std": 0.2, "grad_norm": 0.014268649742007256, "kl": 0.00030581061485766744, "learning_rate": 4.89051094890511e-07, "loss": -0.0031251441687345505, "num_tokens": 875316.0, "reward": 1.3700001120567322, "reward_std": 0.6670311987400055, "rewards/boxed_rate/mean": 0.8666666626930237, "rewards/boxed_rate/std": 0.2665788769721985, "rewards/correctness/mean": 0.16666666865348817, "rewards/correctness/std": 0.2728438377380371, "rewards/multi_component_reward/mean": 0.20333334803581238, "rewards/multi_component_reward/std": 0.4036152184009552, "rewards/no_answer_rate/mean": 0.13333333730697633, "rewards/no_answer_rate/std": 0.2665788769721985, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.051390893710777166, "sampling/importance_sampling_ratio/mean": 0.015738325094571338, "sampling/importance_sampling_ratio/min": 0.0017150626291213885, "sampling/sampling_logp_difference/max": 0.3242307841777802, "sampling/sampling_logp_difference/mean": 0.01380961835384369, "step": 135, "step_time": 37.537090591713785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23333334028720856, "completions/max_length": 1763.0, "completions/max_terminated_length": 775.6, "completions/mean_length": 960.7000122070312, "completions/mean_terminated_length": 604.2866760253906, "completions/min_length": 455.4, "completions/min_terminated_length": 455.4, "entropy": 0.23928763965765634, "epoch": 0.051001821493624776, "frac_reward_zero_std": 0.0, "grad_norm": 0.037585072219371796, "kl": 0.0004156571065929408, "learning_rate": 4.996165644171779e-07, "loss": -0.003277498111128807, "num_tokens": 907599.0, "reward": 1.8700001239776611, "reward_std": 1.213301706314087, "rewards/boxed_rate/mean": 0.7666666567325592, "rewards/boxed_rate/std": 0.34822853803634646, "rewards/correctness/mean": 0.4333333343267441, "rewards/correctness/std": 0.5198277235031128, "rewards/multi_component_reward/mean": 0.4366666838526726, "rewards/multi_component_reward/std": 0.7051358222961426, "rewards/no_answer_rate/mean": 0.23333334028720856, "rewards/no_answer_rate/std": 0.34822853803634646, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.03320965538732708, "sampling/importance_sampling_ratio/mean": 0.015124964842107147, "sampling/importance_sampling_ratio/min": 0.0001439221122021143, "sampling/sampling_logp_difference/max": 0.3350650787353516, "sampling/sampling_logp_difference/mean": 0.015135842747986317, "step": 140, "step_time": 39.69141132887453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13333333432674407, "completions/max_length": 1378.6, "completions/max_terminated_length": 1035.8, "completions/mean_length": 898.6333557128906, "completions/mean_terminated_length": 752.8266784667969, "completions/min_length": 510.2, "completions/min_terminated_length": 510.2, "entropy": 0.15787406551341215, "epoch": 0.052823315118397086, "frac_reward_zero_std": 0.0, "grad_norm": 0.19890737533569336, "kl": 0.0003336386442242656, "learning_rate": 4.986579754601227e-07, "loss": 0.002312604896724224, "num_tokens": 936976.0, "reward": 1.6733333706855773, "reward_std": 0.7778201699256897, "rewards/boxed_rate/mean": 0.8666666626930237, "rewards/boxed_rate/std": 0.19119417667388916, "rewards/correctness/mean": 0.3000000059604645, "rewards/correctness/std": 0.3161036252975464, "rewards/multi_component_reward/mean": 0.34000000953674314, "rewards/multi_component_reward/std": 0.4764533966779709, "rewards/no_answer_rate/mean": 0.13333333432674407, "rewards/no_answer_rate/std": 0.19119417667388916, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.17277967557311058, "sampling/importance_sampling_ratio/mean": 0.0497883933596313, "sampling/importance_sampling_ratio/min": 0.0009090198537052175, "sampling/sampling_logp_difference/max": 0.5492818593978882, "sampling/sampling_logp_difference/mean": 0.010555424820631742, "step": 145, "step_time": 32.12684296686202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13333333730697633, "completions/max_length": 1774.2, "completions/max_terminated_length": 1617.8, "completions/mean_length": 1212.6333618164062, "completions/mean_terminated_length": 1111.5633544921875, "completions/min_length": 845.2, "completions/min_terminated_length": 845.2, "entropy": 0.17395392432808876, "epoch": 0.0546448087431694, "frac_reward_zero_std": 0.4, "grad_norm": 0.021323509514331818, "kl": 0.00030080197708836444, "learning_rate": 4.976993865030675e-07, "loss": -0.005566118285059929, "num_tokens": 976577.0, "reward": 1.4133334517478944, "reward_std": 0.5511920034885407, "rewards/boxed_rate/mean": 0.8666666626930237, "rewards/boxed_rate/std": 0.2665788769721985, "rewards/correctness/mean": 0.16666666865348817, "rewards/correctness/std": 0.2128240704536438, "rewards/multi_component_reward/mean": 0.17999999672174455, "rewards/multi_component_reward/std": 0.3554683744907379, "rewards/no_answer_rate/mean": 0.13333333730697633, "rewards/no_answer_rate/std": 0.2665788769721985, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.10327955484390258, "sampling/importance_sampling_ratio/max": 0.05498660355806351, "sampling/importance_sampling_ratio/mean": 0.01640337195713073, "sampling/importance_sampling_ratio/min": 1.4973960100639972e-05, "sampling/sampling_logp_difference/max": 0.47727389335632325, "sampling/sampling_logp_difference/mean": 0.011420094780623913, "step": 150, "step_time": 41.61420878618956 }, { "epoch": 0.0546448087431694, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.21666667133569717, "eval_completions/max_length": 1789.4, "eval_completions/max_terminated_length": 1216.88, "eval_completions/mean_length": 1121.4300323486327, "eval_completions/mean_terminated_length": 886.213013305664, "eval_completions/min_length": 641.78, "eval_completions/min_terminated_length": 641.78, "eval_entropy": 0.17698929071426392, "eval_frac_reward_zero_std": 0.08, "eval_kl": 0.00028998344379942864, "eval_loss": -0.0026245773769915104, "eval_num_tokens": 976577.0, "eval_reward": 1.5393334233760834, "eval_reward_std": 0.7298788775503635, "eval_rewards/boxed_rate/mean": 0.7833333307504654, "eval_rewards/boxed_rate/std": 0.32846659898757935, "eval_rewards/correctness/mean": 0.24666666924953462, "eval_rewards/correctness/std": 0.27019834876060483, "eval_rewards/multi_component_reward/mean": 0.21933334972709417, "eval_rewards/multi_component_reward/std": 0.48081229120492935, "eval_rewards/no_answer_rate/mean": 0.21666667133569717, "eval_rewards/no_answer_rate/std": 0.32846659898757935, "eval_rewards/repetition_rate/mean": 0.07333333462476731, "eval_rewards/repetition_rate/std": 0.13300593733787536, "eval_runtime": 1782.1981, "eval_samples_per_second": 0.028, "eval_sampling/importance_sampling_ratio/max": 0.07466922109248117, "eval_sampling/importance_sampling_ratio/mean": 0.026714042400126346, "eval_sampling/importance_sampling_ratio/min": 0.0011027993811156295, "eval_sampling/sampling_logp_difference/max": 0.4103265881538391, "eval_sampling/sampling_logp_difference/mean": 0.011447538509964944, "eval_steps_per_second": 0.005, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1207.0, "completions/max_terminated_length": 1207.0, "completions/mean_length": 770.1333679199219, "completions/mean_terminated_length": 770.1333679199219, "completions/min_length": 500.8, "completions/min_terminated_length": 500.8, "entropy": 0.19859646161397299, "epoch": 0.056466302367941715, "frac_reward_zero_std": 0.4, "grad_norm": 0.05886993184685707, "kl": 0.0003771214178414084, "learning_rate": 4.967407975460122e-07, "loss": -0.004334203898906708, "num_tokens": 1003269.0, "reward": 1.4733334302902221, "reward_std": 0.504085260629654, "rewards/boxed_rate/mean": 1.0, "rewards/boxed_rate/std": 0.0, "rewards/correctness/mean": 0.16666666865348817, "rewards/correctness/std": 0.2128240704536438, "rewards/multi_component_reward/mean": 0.27333334758877753, "rewards/multi_component_reward/std": 0.2586013779044151, "rewards/no_answer_rate/mean": 0.0, "rewards/no_answer_rate/std": 0.0, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.10831067697145044, "sampling/importance_sampling_ratio/mean": 0.040180114656686784, "sampling/importance_sampling_ratio/min": 0.0008547768986318261, "sampling/sampling_logp_difference/max": 0.3474395990371704, "sampling/sampling_logp_difference/mean": 0.013024460710585117, "step": 155, "step_time": 28.411310048028827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1666666716337204, "completions/max_length": 1885.8, "completions/max_terminated_length": 1037.6, "completions/mean_length": 888.3666748046875, "completions/mean_terminated_length": 646.7700012207031, "completions/min_length": 398.6, "completions/min_terminated_length": 398.6, "entropy": 0.21153039361039797, "epoch": 0.058287795992714025, "frac_reward_zero_std": 0.0, "grad_norm": 0.030352359637618065, "kl": 0.00036930939143834014, "learning_rate": 4.957822085889571e-07, "loss": -0.015004897117614746, "num_tokens": 1032692.0, "reward": 1.6533334136009217, "reward_std": 0.7879160098731518, "rewards/boxed_rate/mean": 0.8333333253860473, "rewards/boxed_rate/std": 0.34822853803634646, "rewards/correctness/mean": 0.3000000059604645, "rewards/correctness/std": 0.3161036252975464, "rewards/multi_component_reward/mean": 0.3200000114738941, "rewards/multi_component_reward/std": 0.5426497280597686, "rewards/no_answer_rate/mean": 0.1666666716337204, "rewards/no_answer_rate/std": 0.34822853803634646, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.15256396904587746, "sampling/importance_sampling_ratio/mean": 0.05275641949847341, "sampling/importance_sampling_ratio/min": 0.0003334702798868202, "sampling/sampling_logp_difference/max": 0.3136852741241455, "sampling/sampling_logp_difference/mean": 0.014769792556762695, "step": 160, "step_time": 41.816410617157814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2666666716337204, "completions/max_length": 1347.0, "completions/max_terminated_length": 859.2, "completions/mean_length": 907.3000244140625, "completions/mean_terminated_length": 642.22001953125, "completions/min_length": 436.6, "completions/min_terminated_length": 436.6, "entropy": 0.2138912024597327, "epoch": 0.060109289617486336, "frac_reward_zero_std": 0.2, "grad_norm": 0.0335945226252079, "kl": 0.00036641353217419236, "learning_rate": 4.948236196319019e-07, "loss": -0.001109707448631525, "num_tokens": 1062455.0, "reward": 1.733333444595337, "reward_std": 0.6965994656085968, "rewards/boxed_rate/mean": 0.7333333313465118, "rewards/boxed_rate/std": 0.29447373151779177, "rewards/correctness/mean": 0.3666666686534882, "rewards/correctness/std": 0.2728438377380371, "rewards/multi_component_reward/mean": 0.33333335630595684, "rewards/multi_component_reward/std": 0.4374739408493042, "rewards/no_answer_rate/mean": 0.2666666716337204, "rewards/no_answer_rate/std": 0.29447373151779177, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.17867551841773094, "sampling/importance_sampling_ratio/mean": 0.07929584998637437, "sampling/importance_sampling_ratio/min": 0.007277285696347713, "sampling/sampling_logp_difference/max": 0.3139556646347046, "sampling/sampling_logp_difference/mean": 0.014788956940174102, "step": 165, "step_time": 31.849167810566723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666865348817, "completions/max_length": 1693.4, "completions/max_terminated_length": 1086.8, "completions/mean_length": 985.8000244140625, "completions/mean_terminated_length": 765.3133666992187, "completions/min_length": 539.0, "completions/min_terminated_length": 539.0, "entropy": 0.22184887006878853, "epoch": 0.061930783242258654, "frac_reward_zero_std": 0.2, "grad_norm": 0.010766901075839996, "kl": 0.0003837254077855808, "learning_rate": 4.938650306748465e-07, "loss": -0.005718140304088593, "num_tokens": 1094465.0, "reward": 1.5600001335144043, "reward_std": 0.6616819977760315, "rewards/boxed_rate/mean": 0.8333333253860473, "rewards/boxed_rate/std": 0.2728438377380371, "rewards/correctness/mean": 0.2666666656732559, "rewards/correctness/std": 0.2665788769721985, "rewards/multi_component_reward/mean": 0.2933333650231361, "rewards/multi_component_reward/std": 0.4033259034156799, "rewards/no_answer_rate/mean": 0.16666666865348817, "rewards/no_answer_rate/std": 0.2728438377380371, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.04150749985128641, "sampling/importance_sampling_ratio/mean": 0.017618318973109125, "sampling/importance_sampling_ratio/min": 0.0001386483045517349, "sampling/sampling_logp_difference/max": 0.41498665809631347, "sampling/sampling_logp_difference/mean": 0.015049760602414608, "step": 170, "step_time": 38.327568624168634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.33333334028720857, "completions/max_length": 2048.0, "completions/max_terminated_length": 1280.8, "completions/mean_length": 1264.76669921875, "completions/mean_terminated_length": 950.460009765625, "completions/min_length": 712.6, "completions/min_terminated_length": 712.6, "entropy": 0.20655053692559402, "epoch": 0.06375227686703097, "frac_reward_zero_std": 0.0, "grad_norm": 0.1300213485956192, "kl": 0.00040407002428158496, "learning_rate": 4.929064417177914e-07, "loss": -0.002616160549223423, "num_tokens": 1134790.0, "reward": 1.2733334183692933, "reward_std": 0.7226614370942116, "rewards/boxed_rate/mean": 0.6666666567325592, "rewards/boxed_rate/std": 0.45777305364608767, "rewards/correctness/mean": 0.16666666865348817, "rewards/correctness/std": 0.2728438377380371, "rewards/multi_component_reward/mean": 0.0733333457261324, "rewards/multi_component_reward/std": 0.5279393553733825, "rewards/no_answer_rate/mean": 0.33333334028720857, "rewards/no_answer_rate/std": 0.45777305364608767, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.05303769689053297, "sampling/importance_sampling_ratio/mean": 0.015054715669248254, "sampling/importance_sampling_ratio/min": 8.732383148210488e-05, "sampling/sampling_logp_difference/max": 0.29345667362213135, "sampling/sampling_logp_difference/mean": 0.013776615634560585, "step": 175, "step_time": 46.28360453415662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2333333373069763, "completions/max_length": 1848.8, "completions/max_terminated_length": 1419.8, "completions/mean_length": 1218.3667114257812, "completions/mean_terminated_length": 983.3333618164063, "completions/min_length": 721.0, "completions/min_terminated_length": 721.0, "entropy": 0.14731517334779104, "epoch": 0.06557377049180328, "frac_reward_zero_std": 0.0, "grad_norm": 0.03868393227458, "kl": 0.00034544099568544574, "learning_rate": 4.919478527607362e-07, "loss": -0.007346557080745697, "num_tokens": 1175133.0, "reward": 1.7500000834465026, "reward_std": 0.9921784222126007, "rewards/boxed_rate/mean": 0.8, "rewards/boxed_rate/std": 0.29447373151779177, "rewards/correctness/mean": 0.3666666716337204, "rewards/correctness/std": 0.404018247127533, "rewards/multi_component_reward/mean": 0.383333346247673, "rewards/multi_component_reward/std": 0.5928741514682769, "rewards/no_answer_rate/mean": 0.20000000298023224, "rewards/no_answer_rate/std": 0.29447373151779177, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.08876338973641396, "sampling/importance_sampling_ratio/mean": 0.02539586406201124, "sampling/importance_sampling_ratio/min": 0.0011980376816850713, "sampling/sampling_logp_difference/max": 0.3423057317733765, "sampling/sampling_logp_difference/mean": 0.010648915357887746, "step": 180, "step_time": 42.86893743276596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20000000596046447, "completions/max_length": 1706.6, "completions/max_terminated_length": 1151.0, "completions/mean_length": 1097.7666870117187, "completions/mean_terminated_length": 885.9666931152344, "completions/min_length": 678.4, "completions/min_terminated_length": 678.4, "entropy": 0.13618133092919985, "epoch": 0.06739526411657559, "frac_reward_zero_std": 0.4, "grad_norm": 0.04725215584039688, "kl": 0.0003020782351086382, "learning_rate": 4.909892638036809e-07, "loss": -0.0009320625104010105, "num_tokens": 1210850.0, "reward": 1.5866667985916139, "reward_std": 0.3331963121891022, "rewards/boxed_rate/mean": 0.7999999940395355, "rewards/boxed_rate/std": 0.2665788769721985, "rewards/correctness/mean": 0.26666666865348815, "rewards/correctness/std": 0.10327955484390258, "rewards/multi_component_reward/mean": 0.2533333495259285, "rewards/multi_component_reward/std": 0.27605399787425994, "rewards/no_answer_rate/mean": 0.20000000596046447, "rewards/no_answer_rate/std": 0.2665788769721985, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.1632993221282959, "sampling/importance_sampling_ratio/max": 0.0795806871727109, "sampling/importance_sampling_ratio/mean": 0.02610416170209646, "sampling/importance_sampling_ratio/min": 0.002741426149717241, "sampling/sampling_logp_difference/max": 0.3459432005882263, "sampling/sampling_logp_difference/mean": 0.008953130897134542, "step": 185, "step_time": 39.274089214019476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1666666716337204, "completions/max_length": 1897.2, "completions/max_terminated_length": 1270.2, "completions/mean_length": 1048.533349609375, "completions/mean_terminated_length": 838.5800048828125, "completions/min_length": 490.8, "completions/min_terminated_length": 490.8, "entropy": 0.18140921418865522, "epoch": 0.0692167577413479, "frac_reward_zero_std": 0.0, "grad_norm": 0.05130758509039879, "kl": 0.00038463542005047203, "learning_rate": 4.900306748466257e-07, "loss": -0.00950215756893158, "num_tokens": 1245174.0, "reward": 1.2133333802223205, "reward_std": 0.6049932837486267, "rewards/boxed_rate/mean": 0.8, "rewards/boxed_rate/std": 0.36985843181610106, "rewards/correctness/mean": 0.10000000298023223, "rewards/correctness/std": 0.18492921590805053, "rewards/multi_component_reward/mean": 0.08000001087784767, "rewards/multi_component_reward/std": 0.39686688780784607, "rewards/no_answer_rate/mean": 0.20000000596046447, "rewards/no_answer_rate/std": 0.36985843181610106, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.0888688538223505, "sampling/importance_sampling_ratio/mean": 0.03125911168754101, "sampling/importance_sampling_ratio/min": 0.00010974614788835727, "sampling/sampling_logp_difference/max": 0.3610305070877075, "sampling/sampling_logp_difference/mean": 0.01223631165921688, "step": 190, "step_time": 42.93450187277049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2333333373069763, "completions/max_length": 1931.4, "completions/max_terminated_length": 1245.0, "completions/mean_length": 1162.5333740234375, "completions/mean_terminated_length": 899.483349609375, "completions/min_length": 618.2, "completions/min_terminated_length": 618.2, "entropy": 0.1432020000492533, "epoch": 0.07103825136612021, "frac_reward_zero_std": 0.0, "grad_norm": 0.06832960247993469, "kl": 0.00033801408014066204, "learning_rate": 4.890720858895706e-07, "loss": -0.00368819460272789, "num_tokens": 1282900.0, "reward": 1.6600000619888307, "reward_std": 1.03403559923172, "rewards/boxed_rate/mean": 0.7666666626930236, "rewards/boxed_rate/std": 0.3761233925819397, "rewards/correctness/mean": 0.33333333432674406, "rewards/correctness/std": 0.4298781991004944, "rewards/multi_component_reward/mean": 0.3266666799783707, "rewards/multi_component_reward/std": 0.6201063513755798, "rewards/no_answer_rate/mean": 0.2333333373069763, "rewards/no_answer_rate/std": 0.3761233925819397, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.24176665293052793, "sampling/importance_sampling_ratio/mean": 0.05982898166403174, "sampling/importance_sampling_ratio/min": 0.010627524812724437, "sampling/sampling_logp_difference/max": 0.3079464197158813, "sampling/sampling_logp_difference/mean": 0.009482560632750391, "step": 195, "step_time": 43.80696959905326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13333333730697633, "completions/max_length": 1874.4, "completions/max_terminated_length": 1207.6, "completions/mean_length": 1102.2666748046875, "completions/mean_terminated_length": 956.0200134277344, "completions/min_length": 668.8, "completions/min_terminated_length": 668.8, "entropy": 0.13664823652555544, "epoch": 0.07285974499089254, "frac_reward_zero_std": 0.0, "grad_norm": 0.01865367591381073, "kl": 0.00029515944139954324, "learning_rate": 4.881134969325153e-07, "loss": 0.006076012551784515, "num_tokens": 1318920.0, "reward": 1.9533334493637085, "reward_std": 1.1985347509384154, "rewards/boxed_rate/mean": 0.8666666507720947, "rewards/boxed_rate/std": 0.3265986442565918, "rewards/correctness/mean": 0.4333333373069763, "rewards/correctness/std": 0.5351926565170289, "rewards/multi_component_reward/mean": 0.4866666853427887, "rewards/multi_component_reward/std": 0.7141769766807556, "rewards/no_answer_rate/mean": 0.13333333730697633, "rewards/no_answer_rate/std": 0.3265986442565918, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.08207609243690968, "sampling/importance_sampling_ratio/mean": 0.03297081319615245, "sampling/importance_sampling_ratio/min": 0.003100475773135258, "sampling/sampling_logp_difference/max": 0.40416555404663085, "sampling/sampling_logp_difference/mean": 0.008233289048075677, "step": 200, "step_time": 42.36675942577422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13333333730697633, "completions/max_length": 1600.0, "completions/max_terminated_length": 1066.6, "completions/mean_length": 959.2000122070312, "completions/mean_terminated_length": 798.0866943359375, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "entropy": 0.14408907170097032, "epoch": 0.07468123861566485, "frac_reward_zero_std": 0.0, "grad_norm": 0.032731793820858, "kl": 0.0003691373499653613, "learning_rate": 4.871549079754601e-07, "loss": -0.001960751600563526, "num_tokens": 1352016.0, "reward": 1.673333430290222, "reward_std": 0.9306211471557617, "rewards/boxed_rate/mean": 0.8666666626930237, "rewards/boxed_rate/std": 0.2665788769721985, "rewards/correctness/mean": 0.3000000029802322, "rewards/correctness/std": 0.404018247127533, "rewards/multi_component_reward/mean": 0.3400000214576721, "rewards/multi_component_reward/std": 0.5803858906030654, "rewards/no_answer_rate/mean": 0.13333333730697633, "rewards/no_answer_rate/std": 0.2665788769721985, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.10455413907766342, "sampling/importance_sampling_ratio/mean": 0.03612115085124969, "sampling/importance_sampling_ratio/min": 0.0026871272621811215, "sampling/sampling_logp_difference/max": 0.3213069200515747, "sampling/sampling_logp_difference/mean": 0.00922326734289527, "step": 205, "step_time": 37.272668573819104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1572.2, "completions/max_terminated_length": 1133.2, "completions/mean_length": 879.3333618164063, "completions/mean_terminated_length": 743.8000244140625, "completions/min_length": 459.6, "completions/min_terminated_length": 459.6, "entropy": 0.19108737086256344, "epoch": 0.07650273224043716, "frac_reward_zero_std": 0.0, "grad_norm": 0.018986158072948456, "kl": 0.0004163069315836765, "learning_rate": 4.86196319018405e-07, "loss": 0.0035621844232082366, "num_tokens": 1380880.0, "reward": 1.740000104904175, "reward_std": 0.947225558757782, "rewards/boxed_rate/mean": 0.9, "rewards/boxed_rate/std": 0.18492921590805053, "rewards/correctness/mean": 0.33333333432674406, "rewards/correctness/std": 0.4298781991004944, "rewards/multi_component_reward/mean": 0.40666669309139253, "rewards/multi_component_reward/std": 0.5280151665210724, "rewards/no_answer_rate/mean": 0.10000000298023223, "rewards/no_answer_rate/std": 0.18492921590805053, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.07124827802181244, "sampling/importance_sampling_ratio/mean": 0.02541533587500453, "sampling/importance_sampling_ratio/min": 0.002891992630936181, "sampling/sampling_logp_difference/max": 0.3407265663146973, "sampling/sampling_logp_difference/mean": 0.012478712201118469, "step": 210, "step_time": 35.72363304812461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1666666716337204, "completions/max_length": 1773.4, "completions/max_terminated_length": 1327.6, "completions/mean_length": 1154.46669921875, "completions/mean_terminated_length": 950.5800048828125, "completions/min_length": 652.4, "completions/min_terminated_length": 652.4, "entropy": 0.18063361259798208, "epoch": 0.07832422586520947, "frac_reward_zero_std": 0.0, "grad_norm": 0.02516000345349312, "kl": 0.0003309295129535409, "learning_rate": 4.852377300613496e-07, "loss": -0.003862759843468666, "num_tokens": 1418892.0, "reward": 1.766666793823242, "reward_std": 0.897074180841446, "rewards/boxed_rate/mean": 0.8333333373069763, "rewards/boxed_rate/std": 0.2882087707519531, "rewards/correctness/mean": 0.33333333432674406, "rewards/correctness/std": 0.36985843181610106, "rewards/multi_component_reward/mean": 0.3333333648741245, "rewards/multi_component_reward/std": 0.5403176128864289, "rewards/no_answer_rate/mean": 0.1666666716337204, "rewards/no_answer_rate/std": 0.2882087707519531, "rewards/repetition_rate/mean": 0.10000000298023223, "rewards/repetition_rate/std": 0.24494898319244385, "sampling/importance_sampling_ratio/max": 0.04162683514878154, "sampling/importance_sampling_ratio/mean": 0.011416873242706061, "sampling/importance_sampling_ratio/min": 0.001456704799248534, "sampling/sampling_logp_difference/max": 0.3141038179397583, "sampling/sampling_logp_difference/mean": 0.011231625638902187, "step": 215, "step_time": 40.99782377649099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03333333432674408, "completions/max_length": 1087.4, "completions/max_terminated_length": 1005.0, "completions/mean_length": 687.36669921875, "completions/mean_terminated_length": 652.033349609375, "completions/min_length": 441.2, "completions/min_terminated_length": 441.2, "entropy": 0.20975149522225062, "epoch": 0.08014571948998178, "frac_reward_zero_std": 0.0, "grad_norm": 0.026713233441114426, "kl": 0.0004683414850054154, "learning_rate": 4.842791411042944e-07, "loss": -0.0019101161509752274, "num_tokens": 1442585.0, "reward": 1.9400001287460327, "reward_std": 1.0250610709190369, "rewards/boxed_rate/mean": 0.9666666626930237, "rewards/boxed_rate/std": 0.08164966106414795, "rewards/correctness/mean": 0.4, "rewards/correctness/std": 0.45777305364608767, "rewards/multi_component_reward/mean": 0.5066666960716247, "rewards/multi_component_reward/std": 0.49611697196960447, "rewards/no_answer_rate/mean": 0.03333333432674408, "rewards/no_answer_rate/std": 0.08164966106414795, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.11259435554966331, "sampling/importance_sampling_ratio/mean": 0.04658988020382822, "sampling/importance_sampling_ratio/min": 0.0064018826728026255, "sampling/sampling_logp_difference/max": 0.35799312591552734, "sampling/sampling_logp_difference/mean": 0.013789117708802224, "step": 220, "step_time": 25.938872035220264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1644.6, "completions/max_terminated_length": 1458.8, "completions/mean_length": 1090.4333618164062, "completions/mean_terminated_length": 1024.5933349609375, "completions/min_length": 586.4, "completions/min_terminated_length": 586.4, "entropy": 0.15132917563120524, "epoch": 0.08196721311475409, "frac_reward_zero_std": 0.2, "grad_norm": 0.015617283061146736, "kl": 0.00032217044063145297, "learning_rate": 4.833205521472393e-07, "loss": 0.005397457629442215, "num_tokens": 1478730.0, "reward": 1.2733334183692933, "reward_std": 0.49739299416542054, "rewards/boxed_rate/mean": 0.9, "rewards/boxed_rate/std": 0.18492921590805053, "rewards/correctness/mean": 0.10000000298023223, "rewards/correctness/std": 0.18492921590805053, "rewards/multi_component_reward/mean": 0.14000000953674316, "rewards/multi_component_reward/std": 0.3222720980644226, "rewards/no_answer_rate/mean": 0.10000000298023223, "rewards/no_answer_rate/std": 0.18492921590805053, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.09620626708492637, "sampling/importance_sampling_ratio/mean": 0.030177546793129294, "sampling/importance_sampling_ratio/min": 0.0006849504668480222, "sampling/sampling_logp_difference/max": 0.35564944744110105, "sampling/sampling_logp_difference/mean": 0.009922309406101703, "step": 225, "step_time": 38.19373752269894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.26666667461395266, "completions/max_length": 1435.0, "completions/max_terminated_length": 1110.6, "completions/mean_length": 1050.7667114257813, "completions/mean_terminated_length": 875.8666809082031, "completions/min_length": 714.8, "completions/min_terminated_length": 714.8, "entropy": 0.20316853250066438, "epoch": 0.08378870673952642, "frac_reward_zero_std": 0.0, "grad_norm": 0.021429063752293587, "kl": 0.00040220142885421715, "learning_rate": 4.82361963190184e-07, "loss": -0.00029108244925737383, "num_tokens": 1513091.0, "reward": 1.3133334159851073, "reward_std": 0.7383940577507019, "rewards/boxed_rate/mean": 0.7333333343267441, "rewards/boxed_rate/std": 0.19119417667388916, "rewards/correctness/mean": 0.1666666716337204, "rewards/correctness/std": 0.2882087707519531, "rewards/multi_component_reward/mean": 0.11333334147930145, "rewards/multi_component_reward/std": 0.4417478919029236, "rewards/no_answer_rate/mean": 0.2666666626930237, "rewards/no_answer_rate/std": 0.19119417667388916, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.14741267394274474, "sampling/importance_sampling_ratio/mean": 0.06669870759360493, "sampling/importance_sampling_ratio/min": 0.0190697070662623, "sampling/sampling_logp_difference/max": 0.34659633636474607, "sampling/sampling_logp_difference/mean": 0.013104908354580402, "step": 230, "step_time": 33.9486582595855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03333333432674408, "completions/max_length": 1370.2, "completions/max_terminated_length": 1313.4, "completions/mean_length": 1031.9666870117187, "completions/mean_terminated_length": 1006.8400268554688, "completions/min_length": 716.6, "completions/min_terminated_length": 716.6, "entropy": 0.14171662541727226, "epoch": 0.08561020036429873, "frac_reward_zero_std": 0.0, "grad_norm": 0.031329505145549774, "kl": 0.0002987771406575727, "learning_rate": 4.814033742331288e-07, "loss": -0.0023104714229702948, "num_tokens": 1546912.0, "reward": 1.7100001096725463, "reward_std": 0.7802696824073792, "rewards/boxed_rate/mean": 0.9666666626930237, "rewards/boxed_rate/std": 0.08164966106414795, "rewards/correctness/mean": 0.3, "rewards/correctness/std": 0.34822853803634646, "rewards/multi_component_reward/mean": 0.41000002026557925, "rewards/multi_component_reward/std": 0.43204120099544524, "rewards/no_answer_rate/mean": 0.03333333432674408, "rewards/no_answer_rate/std": 0.08164966106414795, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.06215879218652844, "sampling/importance_sampling_ratio/mean": 0.030630824994295834, "sampling/importance_sampling_ratio/min": 0.007751126746109232, "sampling/sampling_logp_difference/max": 0.3309502720832825, "sampling/sampling_logp_difference/mean": 0.009595037158578634, "step": 235, "step_time": 33.03153369966894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3000000029802322, "completions/max_length": 1976.2, "completions/max_terminated_length": 1429.4, "completions/mean_length": 1217.8667114257812, "completions/mean_terminated_length": 870.3533569335938, "completions/min_length": 525.4, "completions/min_terminated_length": 525.4, "entropy": 0.1393390517681837, "epoch": 0.08743169398907104, "frac_reward_zero_std": 0.0, "grad_norm": 0.05459148809313774, "kl": 0.0003685045870952308, "learning_rate": 4.804447852760736e-07, "loss": -0.0035786613821983337, "num_tokens": 1587708.0, "reward": 1.52666677236557, "reward_std": 0.9796114563941956, "rewards/boxed_rate/mean": 0.7, "rewards/boxed_rate/std": 0.404018247127533, "rewards/correctness/mean": 0.26666666865348815, "rewards/correctness/std": 0.3823883533477783, "rewards/multi_component_reward/mean": 0.19333335161209106, "rewards/multi_component_reward/std": 0.6527607858180999, "rewards/no_answer_rate/mean": 0.3000000029802322, "rewards/no_answer_rate/std": 0.404018247127533, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.1632993221282959, "sampling/importance_sampling_ratio/max": 0.08990185633301735, "sampling/importance_sampling_ratio/mean": 0.030038297548890112, "sampling/importance_sampling_ratio/min": 0.00043211893236647645, "sampling/sampling_logp_difference/max": 0.38284850120544434, "sampling/sampling_logp_difference/mean": 0.008784445654600858, "step": 240, "step_time": 45.60475273691118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13333333730697633, "completions/max_length": 1608.4, "completions/max_terminated_length": 1074.2, "completions/mean_length": 920.5666870117187, "completions/mean_terminated_length": 754.0433471679687, "completions/min_length": 548.6, "completions/min_terminated_length": 548.6, "entropy": 0.13041699826717376, "epoch": 0.08925318761384335, "frac_reward_zero_std": 0.2, "grad_norm": 0.011656666174530983, "kl": 0.00033540410368004814, "learning_rate": 4.794861963190184e-07, "loss": -0.004278642311692238, "num_tokens": 1618097.0, "reward": 1.3933334350585938, "reward_std": 0.31584063470363616, "rewards/boxed_rate/mean": 0.8666666626930237, "rewards/boxed_rate/std": 0.2665788769721985, "rewards/correctness/mean": 0.13333333730697633, "rewards/correctness/std": 0.10327955484390258, "rewards/multi_component_reward/mean": 0.1266666680574417, "rewards/multi_component_reward/std": 0.3364715576171875, "rewards/no_answer_rate/mean": 0.13333333730697633, "rewards/no_answer_rate/std": 0.2665788769721985, "rewards/repetition_rate/mean": 0.13333333730697633, "rewards/repetition_rate/std": 0.2665788769721985, "sampling/importance_sampling_ratio/max": 0.13740080818533898, "sampling/importance_sampling_ratio/mean": 0.05796990524977445, "sampling/importance_sampling_ratio/min": 0.006626390311389762, "sampling/sampling_logp_difference/max": 0.3156670331954956, "sampling/sampling_logp_difference/mean": 0.008615392539650202, "step": 245, "step_time": 36.637498759664595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03333333432674408, "completions/max_length": 1533.4, "completions/max_terminated_length": 1395.2, "completions/mean_length": 973.1333923339844, "completions/mean_terminated_length": 939.9467224121094, "completions/min_length": 659.4, "completions/min_terminated_length": 659.4, "entropy": 0.14471415306131044, "epoch": 0.09107468123861566, "frac_reward_zero_std": 0.2, "grad_norm": 0.012578152120113373, "kl": 0.00044469132699305193, "learning_rate": 4.785276073619632e-07, "loss": 0.00015797601081430913, "num_tokens": 1651371.0, "reward": 2.2233334302902223, "reward_std": 0.6588261000812053, "rewards/boxed_rate/mean": 0.9666666626930237, "rewards/boxed_rate/std": 0.08164966106414795, "rewards/correctness/mean": 0.5333333432674408, "rewards/correctness/std": 0.3098386645317078, "rewards/multi_component_reward/mean": 0.656666674464941, "rewards/multi_component_reward/std": 0.41430723667144775, "rewards/no_answer_rate/mean": 0.03333333432674408, "rewards/no_answer_rate/std": 0.08164966106414795, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.18325970163568855, "sampling/importance_sampling_ratio/mean": 0.10900020101107658, "sampling/importance_sampling_ratio/min": 0.058780076946641203, "sampling/sampling_logp_difference/max": 0.28016397953033445, "sampling/sampling_logp_difference/mean": 0.009201886225491762, "step": 250, "step_time": 36.18357318136841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1565.8, "completions/max_terminated_length": 1137.4, "completions/mean_length": 961.133349609375, "completions/mean_terminated_length": 823.0366821289062, "completions/min_length": 616.4, "completions/min_terminated_length": 616.4, "entropy": 0.14495401444534461, "epoch": 0.09289617486338798, "frac_reward_zero_std": 0.2, "grad_norm": 0.010682695545256138, "kl": 0.00035484658534793806, "learning_rate": 4.775690184049079e-07, "loss": 0.0014964478090405465, "num_tokens": 1682425.0, "reward": 1.8333334445953369, "reward_std": 0.6724297642707825, "rewards/boxed_rate/mean": 0.9, "rewards/boxed_rate/std": 0.18492921590805053, "rewards/correctness/mean": 0.3666666716337204, "rewards/correctness/std": 0.2882087707519531, "rewards/multi_component_reward/mean": 0.4333333522081375, "rewards/multi_component_reward/std": 0.4199430406093597, "rewards/no_answer_rate/mean": 0.10000000298023223, "rewards/no_answer_rate/std": 0.18492921590805053, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.10162204280495643, "sampling/importance_sampling_ratio/mean": 0.033536690101027486, "sampling/importance_sampling_ratio/min": 0.00426993980855085, "sampling/sampling_logp_difference/max": 0.33833720684051516, "sampling/sampling_logp_difference/mean": 0.009549971111118794, "step": 255, "step_time": 35.81306515969336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4000000089406967, "completions/max_length": 2048.0, "completions/max_terminated_length": 1168.6, "completions/mean_length": 1366.4000366210937, "completions/mean_terminated_length": 998.9466796875, "completions/min_length": 838.8, "completions/min_terminated_length": 838.8, "entropy": 0.14179731508096058, "epoch": 0.0947176684881603, "frac_reward_zero_std": 0.0, "grad_norm": 0.05714869126677513, "kl": 0.00032895591721171513, "learning_rate": 4.7661042944785273e-07, "loss": -0.004640129953622818, "num_tokens": 1726081.0, "reward": 1.4900000929832458, "reward_std": 0.996655923128128, "rewards/boxed_rate/mean": 0.6000000059604644, "rewards/boxed_rate/std": 0.5010328412055969, "rewards/correctness/mean": 0.2666666716337204, "rewards/correctness/std": 0.3977532863616943, "rewards/multi_component_reward/mean": 0.12333334572613239, "rewards/multi_component_reward/std": 0.6989697873592376, "rewards/no_answer_rate/mean": 0.4000000089406967, "rewards/no_answer_rate/std": 0.5010328412055969, "rewards/repetition_rate/mean": 0.1, "rewards/repetition_rate/std": 0.10954451560974121, "sampling/importance_sampling_ratio/max": 0.043305744789540766, "sampling/importance_sampling_ratio/mean": 0.01504859896376729, "sampling/importance_sampling_ratio/min": 4.04306270474383e-05, "sampling/sampling_logp_difference/max": 0.3569210052490234, "sampling/sampling_logp_difference/mean": 0.009369691275060177, "step": 260, "step_time": 46.90199479162693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2666666716337204, "completions/max_length": 1583.0, "completions/max_terminated_length": 934.6, "completions/mean_length": 1023.133349609375, "completions/mean_terminated_length": 685.6266723632813, "completions/min_length": 537.4, "completions/min_terminated_length": 537.4, "entropy": 0.17780156135559083, "epoch": 0.0965391621129326, "frac_reward_zero_std": 0.0, "grad_norm": 0.018404532223939896, "kl": 0.0003682690895705794, "learning_rate": 4.756518404907975e-07, "loss": 0.012411002814769746, "num_tokens": 1759811.0, "reward": 1.3833333849906921, "reward_std": 0.7943651616573334, "rewards/boxed_rate/mean": 0.7333333313465118, "rewards/boxed_rate/std": 0.29447373151779177, "rewards/correctness/mean": 0.20000000298023224, "rewards/correctness/std": 0.29447373151779177, "rewards/multi_component_reward/mean": 0.15000001043081285, "rewards/multi_component_reward/std": 0.5146282583475112, "rewards/no_answer_rate/mean": 0.2666666716337204, "rewards/no_answer_rate/std": 0.29447373151779177, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.1577057947986759, "sampling/importance_sampling_ratio/mean": 0.09503802299732342, "sampling/importance_sampling_ratio/min": 0.0009112325180752826, "sampling/sampling_logp_difference/max": 0.3926606416702271, "sampling/sampling_logp_difference/mean": 0.012097407877445222, "step": 265, "step_time": 36.53071039505303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13333333730697633, "completions/max_length": 1533.2, "completions/max_terminated_length": 932.6, "completions/mean_length": 900.7333618164063, "completions/mean_terminated_length": 726.6700073242188, "completions/min_length": 482.4, "completions/min_terminated_length": 482.4, "entropy": 0.18941813930869103, "epoch": 0.09836065573770492, "frac_reward_zero_std": 0.0, "grad_norm": 0.1255546510219574, "kl": 0.0004568617567808057, "learning_rate": 4.746932515337423e-07, "loss": -0.006510256975889206, "num_tokens": 1790271.0, "reward": 1.7666667699813843, "reward_std": 1.1080477476119994, "rewards/boxed_rate/mean": 0.8333333253860473, "rewards/boxed_rate/std": 0.34822853803634646, "rewards/correctness/mean": 0.33333333730697634, "rewards/correctness/std": 0.48566790819168093, "rewards/multi_component_reward/mean": 0.33333334922790525, "rewards/multi_component_reward/std": 0.6733725190162658, "rewards/no_answer_rate/mean": 0.1666666716337204, "rewards/no_answer_rate/std": 0.34822853803634646, "rewards/repetition_rate/mean": 0.10000000298023223, "rewards/repetition_rate/std": 0.18492921590805053, "sampling/importance_sampling_ratio/max": 0.0878407845273614, "sampling/importance_sampling_ratio/mean": 0.026912105677183717, "sampling/importance_sampling_ratio/min": 0.0006732676178655298, "sampling/sampling_logp_difference/max": 0.33626770973205566, "sampling/sampling_logp_difference/mean": 0.012423686124384404, "step": 270, "step_time": 35.3790395591408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1538.0, "completions/max_terminated_length": 1314.2, "completions/mean_length": 987.2333618164063, "completions/mean_terminated_length": 886.1933471679688, "completions/min_length": 572.4, "completions/min_terminated_length": 572.4, "entropy": 0.17079306667049726, "epoch": 0.10018214936247723, "frac_reward_zero_std": 0.0, "grad_norm": 0.0078973937779665, "kl": 0.00039174315752461554, "learning_rate": 4.737346625766871e-07, "loss": -0.0023402150720357893, "num_tokens": 1822306.0, "reward": 1.833333396911621, "reward_std": 0.9216741323471069, "rewards/boxed_rate/mean": 0.899999988079071, "rewards/boxed_rate/std": 0.24494898319244385, "rewards/correctness/mean": 0.3666666716337204, "rewards/correctness/std": 0.404018247127533, "rewards/multi_component_reward/mean": 0.43333334028720855, "rewards/multi_component_reward/std": 0.5590049713850022, "rewards/no_answer_rate/mean": 0.10000000298023223, "rewards/no_answer_rate/std": 0.24494898319244385, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.05113908690400422, "sampling/importance_sampling_ratio/mean": 0.021760703669860958, "sampling/importance_sampling_ratio/min": 0.008651768978314297, "sampling/sampling_logp_difference/max": 0.4047112584114075, "sampling/sampling_logp_difference/mean": 0.01078464426100254, "step": 275, "step_time": 35.81637797001749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20000000298023224, "completions/max_length": 1824.0, "completions/max_terminated_length": 1109.4, "completions/mean_length": 981.833349609375, "completions/mean_terminated_length": 751.3400024414062, "completions/min_length": 517.2, "completions/min_terminated_length": 517.2, "entropy": 0.15676350990931193, "epoch": 0.10200364298724955, "frac_reward_zero_std": 0.0, "grad_norm": 0.01452692411839962, "kl": 0.0003485525502280022, "learning_rate": 4.727760736196319e-07, "loss": -0.004090853407979012, "num_tokens": 1855067.0, "reward": 1.4933334350585938, "reward_std": 0.7522169463336468, "rewards/boxed_rate/mean": 0.799999988079071, "rewards/boxed_rate/std": 0.35449349880218506, "rewards/correctness/mean": 0.23333333432674408, "rewards/correctness/std": 0.30073869228363037, "rewards/multi_component_reward/mean": 0.2266666792333126, "rewards/multi_component_reward/std": 0.5248861283063888, "rewards/no_answer_rate/mean": 0.20000000298023224, "rewards/no_answer_rate/std": 0.35449349880218506, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.12218071203678846, "sampling/importance_sampling_ratio/mean": 0.04405408292077482, "sampling/importance_sampling_ratio/min": 0.00501076535471764, "sampling/sampling_logp_difference/max": 0.37753498554229736, "sampling/sampling_logp_difference/mean": 0.009699719026684761, "step": 280, "step_time": 40.97467570956796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1666666716337204, "completions/max_length": 1744.4, "completions/max_terminated_length": 1173.6, "completions/mean_length": 970.86669921875, "completions/mean_terminated_length": 758.7533447265625, "completions/min_length": 479.4, "completions/min_terminated_length": 479.4, "entropy": 0.15489849572380385, "epoch": 0.10382513661202186, "frac_reward_zero_std": 0.2, "grad_norm": 0.09483744204044342, "kl": 0.000336567038417949, "learning_rate": 4.718174846625766e-07, "loss": -0.004700837284326553, "num_tokens": 1886869.0, "reward": 1.7000001311302184, "reward_std": 0.8241950333118438, "rewards/boxed_rate/mean": 0.8333333253860473, "rewards/boxed_rate/std": 0.34822853803634646, "rewards/correctness/mean": 0.33333333730697634, "rewards/correctness/std": 0.32236858606338503, "rewards/multi_component_reward/mean": 0.36666667461395264, "rewards/multi_component_reward/std": 0.5094490587711334, "rewards/no_answer_rate/mean": 0.1666666716337204, "rewards/no_answer_rate/std": 0.34822853803634646, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.13168737590312957, "sampling/importance_sampling_ratio/mean": 0.06171476859599352, "sampling/importance_sampling_ratio/min": 0.010470659041346208, "sampling/sampling_logp_difference/max": 0.344563889503479, "sampling/sampling_logp_difference/mean": 0.010558286216109991, "step": 285, "step_time": 39.89412596244365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23333333432674408, "completions/max_length": 1809.2, "completions/max_terminated_length": 1132.6, "completions/mean_length": 1119.7000366210937, "completions/mean_terminated_length": 832.9400268554688, "completions/min_length": 590.4, "completions/min_terminated_length": 590.4, "entropy": 0.15704507902264594, "epoch": 0.10564663023679417, "frac_reward_zero_std": 0.2, "grad_norm": 0.023737642914056778, "kl": 0.0003448050100511561, "learning_rate": 4.7085889570552147e-07, "loss": 0.0019955366849899294, "num_tokens": 1923040.0, "reward": 1.380000114440918, "reward_std": 0.5739886239171028, "rewards/boxed_rate/mean": 0.7666666626930236, "rewards/boxed_rate/std": 0.30073869228363037, "rewards/correctness/mean": 0.16666666865348817, "rewards/correctness/std": 0.2128240704536438, "rewards/multi_component_reward/mean": 0.11333334371447563, "rewards/multi_component_reward/std": 0.46365060210227965, "rewards/no_answer_rate/mean": 0.23333333432674408, "rewards/no_answer_rate/std": 0.30073869228363037, "rewards/repetition_rate/mean": 0.10000000298023223, "rewards/repetition_rate/std": 0.18492921590805053, "sampling/importance_sampling_ratio/max": 0.04411560408771038, "sampling/importance_sampling_ratio/mean": 0.01753278383985162, "sampling/importance_sampling_ratio/min": 0.0009861058487468544, "sampling/sampling_logp_difference/max": 0.3644953012466431, "sampling/sampling_logp_difference/mean": 0.010005151480436325, "step": 290, "step_time": 41.05979204457253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13333333730697633, "completions/max_length": 1592.8, "completions/max_terminated_length": 1143.4, "completions/mean_length": 987.1000366210938, "completions/mean_terminated_length": 825.9633544921875, "completions/min_length": 591.8, "completions/min_terminated_length": 591.8, "entropy": 0.16288827657699584, "epoch": 0.10746812386156648, "frac_reward_zero_std": 0.4, "grad_norm": 0.026686564087867737, "kl": 0.0003618090704549104, "learning_rate": 4.6990030674846625e-07, "loss": -0.0018155150115489959, "num_tokens": 1955647.0, "reward": 1.7666668176651001, "reward_std": 0.4796482801437378, "rewards/boxed_rate/mean": 0.8666666626930237, "rewards/boxed_rate/std": 0.2665788769721985, "rewards/correctness/mean": 0.33333333432674406, "rewards/correctness/std": 0.19119417667388916, "rewards/multi_component_reward/mean": 0.36666668206453323, "rewards/multi_component_reward/std": 0.34831122159957884, "rewards/no_answer_rate/mean": 0.13333333730697633, "rewards/no_answer_rate/std": 0.2665788769721985, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.1632993221282959, "sampling/importance_sampling_ratio/max": 0.17200291641056537, "sampling/importance_sampling_ratio/mean": 0.09221775899641216, "sampling/importance_sampling_ratio/min": 0.053504807552870126, "sampling/sampling_logp_difference/max": 0.31802849769592284, "sampling/sampling_logp_difference/mean": 0.010488973837345839, "step": 295, "step_time": 36.921972643770275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03333333432674408, "completions/max_length": 1181.2, "completions/max_terminated_length": 1068.4, "completions/mean_length": 714.5666870117187, "completions/mean_terminated_length": 675.57333984375, "completions/min_length": 433.8, "completions/min_terminated_length": 433.8, "entropy": 0.20909111325939497, "epoch": 0.1092896174863388, "frac_reward_zero_std": 0.0, "grad_norm": 0.9235589504241943, "kl": 0.0007714893268712331, "learning_rate": 4.68941717791411e-07, "loss": 0.0028855174779891966, "num_tokens": 1980240.0, "reward": 1.8966667413711549, "reward_std": 1.0613920927047729, "rewards/boxed_rate/mean": 0.9666666626930237, "rewards/boxed_rate/std": 0.08164966106414795, "rewards/correctness/mean": 0.3666666716337204, "rewards/correctness/std": 0.5072978019714356, "rewards/multi_component_reward/mean": 0.4633333578705788, "rewards/multi_component_reward/std": 0.5980847001075744, "rewards/no_answer_rate/mean": 0.03333333432674408, "rewards/no_answer_rate/std": 0.08164966106414795, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.10327955484390258, "sampling/importance_sampling_ratio/max": 0.11060594655573368, "sampling/importance_sampling_ratio/mean": 0.05734073922503739, "sampling/importance_sampling_ratio/min": 0.024227803308775008, "sampling/sampling_logp_difference/max": 0.31456995010375977, "sampling/sampling_logp_difference/mean": 0.013103757984936237, "step": 300, "step_time": 27.952531585469842 }, { "epoch": 0.1092896174863388, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.22666667222976686, "eval_completions/max_length": 1758.54, "eval_completions/max_terminated_length": 1125.4, "eval_completions/mean_length": 1107.770029296875, "eval_completions/mean_terminated_length": 846.1800146484375, "eval_completions/min_length": 606.8, "eval_completions/min_terminated_length": 606.8, "eval_entropy": 0.17873862124979495, "eval_frac_reward_zero_std": 0.08, "eval_kl": 0.0003903971218096558, "eval_loss": -0.0051070391200482845, "eval_num_tokens": 1980240.0, "eval_reward": 1.5980000698566437, "eval_reward_std": 0.782314371317625, "eval_rewards/boxed_rate/mean": 0.7766666629910469, "eval_rewards/boxed_rate/std": 0.33167909026145936, "eval_rewards/correctness/mean": 0.28666666984558103, "eval_rewards/correctness/std": 0.30271870851516725, "eval_rewards/multi_component_reward/mean": 0.26800001293420794, "eval_rewards/multi_component_reward/std": 0.5066750717163085, "eval_rewards/no_answer_rate/mean": 0.22333333760499954, "eval_rewards/no_answer_rate/std": 0.33167909026145936, "eval_rewards/repetition_rate/mean": 0.04333333432674408, "eval_rewards/repetition_rate/std": 0.08060015916824341, "eval_runtime": 1753.4069, "eval_samples_per_second": 0.029, "eval_sampling/importance_sampling_ratio/max": 0.0897229278186569, "eval_sampling/importance_sampling_ratio/mean": 0.035299012192626836, "eval_sampling/importance_sampling_ratio/min": 0.0025020323396462286, "eval_sampling/sampling_logp_difference/max": 0.5019185048341751, "eval_sampling/sampling_logp_difference/mean": 0.012026464603841304, "eval_steps_per_second": 0.005, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1, "completions/max_length": 1605.6, "completions/max_terminated_length": 1384.6, "completions/mean_length": 1010.7000366210938, "completions/mean_terminated_length": 876.26669921875, "completions/min_length": 573.8, "completions/min_terminated_length": 573.8, "entropy": 0.1340769293407599, "epoch": 0.1111111111111111, "frac_reward_zero_std": 0.2, "grad_norm": 0.034374091774225235, "kl": 0.000321884588629473, "learning_rate": 4.6798312883435583e-07, "loss": -0.014604152739048004, "num_tokens": 2014467.0, "reward": 2.043333411216736, "reward_std": 0.8047043204307556, "rewards/boxed_rate/mean": 0.9, "rewards/boxed_rate/std": 0.10954451560974121, "rewards/correctness/mean": 0.4666666716337204, "rewards/correctness/std": 0.36985843181610106, "rewards/multi_component_reward/mean": 0.5433333471417428, "rewards/multi_component_reward/std": 0.4605102062225342, "rewards/no_answer_rate/mean": 0.1, "rewards/no_answer_rate/std": 0.10954451560974121, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.12428842782974243, "sampling/importance_sampling_ratio/mean": 0.046624291501939294, "sampling/importance_sampling_ratio/min": 0.0006754371514054114, "sampling/sampling_logp_difference/max": 0.32104220390319826, "sampling/sampling_logp_difference/mean": 0.008986939676105976, "step": 305, "step_time": 36.810268136300145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666865348817, "completions/max_length": 1557.0, "completions/max_terminated_length": 1291.8, "completions/mean_length": 1068.033349609375, "completions/mean_terminated_length": 916.2, "completions/min_length": 631.6, "completions/min_terminated_length": 631.6, "entropy": 0.15544118496278922, "epoch": 0.11293260473588343, "frac_reward_zero_std": 0.0, "grad_norm": 0.01212399173527956, "kl": 0.00047503276291536165, "learning_rate": 4.670245398773006e-07, "loss": -8.383383974432945e-05, "num_tokens": 2049616.0, "reward": 1.6533334255218506, "reward_std": 1.070458745956421, "rewards/boxed_rate/mean": 0.8333333373069763, "rewards/boxed_rate/std": 0.2128240704536438, "rewards/correctness/mean": 0.3000000059604645, "rewards/correctness/std": 0.4794029474258423, "rewards/multi_component_reward/mean": 0.3200000122189522, "rewards/multi_component_reward/std": 0.6258516371250152, "rewards/no_answer_rate/mean": 0.16666666865348817, "rewards/no_answer_rate/std": 0.2128240704536438, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.08939264337532223, "sampling/importance_sampling_ratio/mean": 0.030286337621510027, "sampling/importance_sampling_ratio/min": 0.0005470938450967466, "sampling/sampling_logp_difference/max": 0.34738160371780397, "sampling/sampling_logp_difference/mean": 0.010354180075228214, "step": 310, "step_time": 36.12143337074667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3333333432674408, "completions/max_length": 1634.2, "completions/max_terminated_length": 1091.4, "completions/mean_length": 1218.6333618164062, "completions/mean_terminated_length": 858.4666870117187, "completions/min_length": 677.6, "completions/min_terminated_length": 677.6, "entropy": 0.17260866140325865, "epoch": 0.11475409836065574, "frac_reward_zero_std": 0.0, "grad_norm": 0.05682944133877754, "kl": 0.0005160604792763479, "learning_rate": 4.6606595092024536e-07, "loss": -0.006619427353143692, "num_tokens": 2088659.0, "reward": 2.090000092983246, "reward_std": 1.1096696853637695, "rewards/boxed_rate/mean": 0.6666666716337204, "rewards/boxed_rate/std": 0.29447373151779177, "rewards/correctness/mean": 0.566666665673256, "rewards/correctness/std": 0.451508092880249, "rewards/multi_component_reward/mean": 0.523333391547203, "rewards/multi_component_reward/std": 0.6607022762298584, "rewards/no_answer_rate/mean": 0.33333333134651183, "rewards/no_answer_rate/std": 0.29447373151779177, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.05110827875323594, "sampling/importance_sampling_ratio/mean": 0.018865692079998553, "sampling/importance_sampling_ratio/min": 0.0009169460441439363, "sampling/sampling_logp_difference/max": 0.3731674671173096, "sampling/sampling_logp_difference/mean": 0.01166527420282364, "step": 315, "step_time": 38.15295873656869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1484.2, "completions/max_terminated_length": 1039.2, "completions/mean_length": 931.7000244140625, "completions/mean_terminated_length": 790.5233520507812, "completions/min_length": 563.4, "completions/min_terminated_length": 563.4, "entropy": 0.143128818521897, "epoch": 0.11657559198542805, "frac_reward_zero_std": 0.6, "grad_norm": 0.005422326270490885, "kl": 0.0004371852917150439, "learning_rate": 4.6510736196319015e-07, "loss": 0.004143273830413819, "num_tokens": 2119250.0, "reward": 1.5300001502037048, "reward_std": 0.24947773814201354, "rewards/boxed_rate/mean": 0.9, "rewards/boxed_rate/std": 0.18492921590805053, "rewards/correctness/mean": 0.23333333432674408, "rewards/correctness/std": 0.08164966106414795, "rewards/multi_component_reward/mean": 0.29666668176651, "rewards/multi_component_reward/std": 0.17254199385643004, "rewards/no_answer_rate/mean": 0.10000000298023223, "rewards/no_answer_rate/std": 0.18492921590805053, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.12071083066985011, "sampling/importance_sampling_ratio/mean": 0.05540300235152244, "sampling/importance_sampling_ratio/min": 0.013269457093102949, "sampling/sampling_logp_difference/max": 0.36359539031982424, "sampling/sampling_logp_difference/mean": 0.009230617992579937, "step": 320, "step_time": 34.49489349294454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666865348817, "completions/max_length": 1798.0, "completions/max_terminated_length": 1537.6, "completions/mean_length": 1144.8333618164063, "completions/mean_terminated_length": 965.6733520507812, "completions/min_length": 634.0, "completions/min_terminated_length": 634.0, "entropy": 0.18388984675208728, "epoch": 0.11839708561020036, "frac_reward_zero_std": 0.0, "grad_norm": 0.14283621311187744, "kl": 0.0006492346903542057, "learning_rate": 4.64148773006135e-07, "loss": -0.0025676295161247255, "num_tokens": 2157267.0, "reward": 1.630000066757202, "reward_std": 0.89271479845047, "rewards/boxed_rate/mean": 0.8333333253860473, "rewards/boxed_rate/std": 0.2728438377380371, "rewards/correctness/mean": 0.3000000059604645, "rewards/correctness/std": 0.3761233925819397, "rewards/multi_component_reward/mean": 0.3300000101327896, "rewards/multi_component_reward/std": 0.5265865564346314, "rewards/no_answer_rate/mean": 0.16666666865348817, "rewards/no_answer_rate/std": 0.2728438377380371, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.04229360092431307, "sampling/importance_sampling_ratio/mean": 0.019075372768566013, "sampling/importance_sampling_ratio/min": 8.805350876674933e-05, "sampling/sampling_logp_difference/max": 0.3247460603713989, "sampling/sampling_logp_difference/mean": 0.01177409002557397, "step": 325, "step_time": 41.04225417692214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 823.4, "completions/max_terminated_length": 823.4, "completions/mean_length": 544.1333435058593, "completions/mean_terminated_length": 544.1333435058593, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "entropy": 0.22606497158606847, "epoch": 0.12021857923497267, "frac_reward_zero_std": 0.2, "grad_norm": 0.9119893908500671, "kl": 0.0007021349136872838, "learning_rate": 4.631901840490797e-07, "loss": -0.006102682277560234, "num_tokens": 2175259.0, "reward": 1.8700001001358033, "reward_std": 0.8484382510185242, "rewards/boxed_rate/mean": 1.0, "rewards/boxed_rate/std": 0.0, "rewards/correctness/mean": 0.3666666716337204, "rewards/correctness/std": 0.404018247127533, "rewards/multi_component_reward/mean": 0.503333343565464, "rewards/multi_component_reward/std": 0.44442008137702943, "rewards/no_answer_rate/mean": 0.0, "rewards/no_answer_rate/std": 0.0, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.13480582600459456, "sampling/importance_sampling_ratio/mean": 0.07863845487590879, "sampling/importance_sampling_ratio/min": 0.01574344330747408, "sampling/sampling_logp_difference/max": 0.3204054594039917, "sampling/sampling_logp_difference/mean": 0.014066471531987191, "step": 330, "step_time": 20.059878361225127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06666666865348816, "completions/max_length": 1395.2, "completions/max_terminated_length": 910.6, "completions/mean_length": 742.1333679199219, "completions/mean_terminated_length": 640.480029296875, "completions/min_length": 509.6, "completions/min_terminated_length": 509.6, "entropy": 0.2093804210424423, "epoch": 0.122040072859745, "frac_reward_zero_std": 0.0, "grad_norm": 0.028569765388965607, "kl": 0.0006724427085525046, "learning_rate": 4.622315950920245e-07, "loss": -0.0013746816664934158, "num_tokens": 2200067.0, "reward": 1.6200001239776611, "reward_std": 0.8776960492134094, "rewards/boxed_rate/mean": 0.9333333253860474, "rewards/boxed_rate/std": 0.1632993221282959, "rewards/correctness/mean": 0.26666666865348815, "rewards/correctness/std": 0.3823883533477783, "rewards/multi_component_reward/mean": 0.3533333420753479, "rewards/multi_component_reward/std": 0.49811467826366423, "rewards/no_answer_rate/mean": 0.06666666865348816, "rewards/no_answer_rate/std": 0.1632993221282959, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.07016745954751968, "sampling/importance_sampling_ratio/mean": 0.027400266751646994, "sampling/importance_sampling_ratio/min": 0.0014501588469148616, "sampling/sampling_logp_difference/max": 0.36046857833862306, "sampling/sampling_logp_difference/mean": 0.014280413649976254, "step": 335, "step_time": 31.59101020414382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20000000596046447, "completions/max_length": 1939.2, "completions/max_terminated_length": 988.8, "completions/mean_length": 976.86669921875, "completions/mean_terminated_length": 681.2466857910156, "completions/min_length": 446.6, "completions/min_terminated_length": 446.6, "entropy": 0.2083836982647578, "epoch": 0.12386156648451731, "frac_reward_zero_std": 0.0, "grad_norm": 0.06303755939006805, "kl": 0.0005712434819239812, "learning_rate": 4.612730061349693e-07, "loss": -0.0044234395027160645, "num_tokens": 2232145.0, "reward": 1.74333336353302, "reward_std": 0.9833520889282227, "rewards/boxed_rate/mean": 0.8333333253860473, "rewards/boxed_rate/std": 0.34822853803634646, "rewards/correctness/mean": 0.33333334028720857, "rewards/correctness/std": 0.3977532863616943, "rewards/multi_component_reward/mean": 0.3433333486318588, "rewards/multi_component_reward/std": 0.6024900764226914, "rewards/no_answer_rate/mean": 0.1666666716337204, "rewards/no_answer_rate/std": 0.34822853803634646, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.1632993221282959, "sampling/importance_sampling_ratio/max": 0.1221102561801672, "sampling/importance_sampling_ratio/mean": 0.029469438176602126, "sampling/importance_sampling_ratio/min": 5.1707089343915565e-05, "sampling/sampling_logp_difference/max": 0.3329290866851807, "sampling/sampling_logp_difference/mean": 0.012941631115972995, "step": 340, "step_time": 43.22656154055149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1, "completions/max_length": 1163.2, "completions/max_terminated_length": 1144.4, "completions/mean_length": 778.86669921875, "completions/mean_terminated_length": 688.9000244140625, "completions/min_length": 431.8, "completions/min_terminated_length": 431.8, "entropy": 0.1657411351799965, "epoch": 0.12568306010928962, "frac_reward_zero_std": 0.0, "grad_norm": 0.1018831878900528, "kl": 0.0005899470415897667, "learning_rate": 4.603144171779141e-07, "loss": -0.006323814392089844, "num_tokens": 2259027.0, "reward": 1.5300001144409179, "reward_std": 0.9171831011772156, "rewards/boxed_rate/mean": 0.9, "rewards/boxed_rate/std": 0.10954451560974121, "rewards/correctness/mean": 0.20000000596046447, "rewards/correctness/std": 0.4298781991004944, "rewards/multi_component_reward/mean": 0.23000000715255736, "rewards/multi_component_reward/std": 0.5341363847255707, "rewards/no_answer_rate/mean": 0.1, "rewards/no_answer_rate/std": 0.10954451560974121, "rewards/repetition_rate/mean": 0.1, "rewards/repetition_rate/std": 0.10954451560974121, "sampling/importance_sampling_ratio/max": 0.12837026529014112, "sampling/importance_sampling_ratio/mean": 0.05263339746743441, "sampling/importance_sampling_ratio/min": 0.0009968833521444183, "sampling/sampling_logp_difference/max": 0.31432443857192993, "sampling/sampling_logp_difference/mean": 0.011136513762176036, "step": 345, "step_time": 27.575114846229553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1666666716337204, "completions/max_length": 1772.4, "completions/max_terminated_length": 1112.4, "completions/mean_length": 983.0333618164062, "completions/mean_terminated_length": 776.57333984375, "completions/min_length": 516.6, "completions/min_terminated_length": 516.6, "entropy": 0.18533113847176233, "epoch": 0.12750455373406194, "frac_reward_zero_std": 0.2, "grad_norm": 0.1743193119764328, "kl": 0.0007398304010469777, "learning_rate": 4.593558282208589e-07, "loss": 0.0035149652510881426, "num_tokens": 2292724.0, "reward": 1.5366667270660401, "reward_std": 0.7906980156898499, "rewards/boxed_rate/mean": 0.8333333253860473, "rewards/boxed_rate/std": 0.34822853803634646, "rewards/correctness/mean": 0.23333334028720856, "rewards/correctness/std": 0.34822853803634646, "rewards/multi_component_reward/mean": 0.2366666778922081, "rewards/multi_component_reward/std": 0.52751624584198, "rewards/no_answer_rate/mean": 0.1666666716337204, "rewards/no_answer_rate/std": 0.34822853803634646, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.1632993221282959, "sampling/importance_sampling_ratio/max": 0.16946832239627838, "sampling/importance_sampling_ratio/mean": 0.04226875379681587, "sampling/importance_sampling_ratio/min": 0.00029637051690457807, "sampling/sampling_logp_difference/max": 0.34900493621826173, "sampling/sampling_logp_difference/mean": 0.011652881279587746, "step": 350, "step_time": 40.440953285992144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03333333432674408, "completions/max_length": 1092.8, "completions/max_terminated_length": 992.2, "completions/mean_length": 720.4666931152344, "completions/mean_terminated_length": 686.5133483886718, "completions/min_length": 467.2, "completions/min_terminated_length": 467.2, "entropy": 0.1794678675631682, "epoch": 0.12932604735883424, "frac_reward_zero_std": 0.2, "grad_norm": 0.020413219928741455, "kl": 0.0006120980954923046, "learning_rate": 4.583972392638036e-07, "loss": -0.0014667985960841179, "num_tokens": 2316852.0, "reward": 2.060000109672546, "reward_std": 0.7259637594223023, "rewards/boxed_rate/mean": 0.9666666626930237, "rewards/boxed_rate/std": 0.08164966106414795, "rewards/correctness/mean": 0.46666666865348816, "rewards/correctness/std": 0.32236858606338503, "rewards/multi_component_reward/mean": 0.59333336353302, "rewards/multi_component_reward/std": 0.4035952419042587, "rewards/no_answer_rate/mean": 0.03333333432674408, "rewards/no_answer_rate/std": 0.08164966106414795, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.14755254588089883, "sampling/importance_sampling_ratio/mean": 0.07640535607933999, "sampling/importance_sampling_ratio/min": 0.03205070712427443, "sampling/sampling_logp_difference/max": 0.3078671216964722, "sampling/sampling_logp_difference/mean": 0.011428364552557468, "step": 355, "step_time": 26.221613752283154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13333333730697633, "completions/max_length": 1573.4, "completions/max_terminated_length": 1007.8, "completions/mean_length": 839.6000122070312, "completions/mean_terminated_length": 670.27333984375, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "entropy": 0.2592687545965115, "epoch": 0.13114754098360656, "frac_reward_zero_std": 0.0, "grad_norm": 0.21311414241790771, "kl": 0.0006711701275586772, "learning_rate": 4.5743865030674845e-07, "loss": -0.0009625215083360672, "num_tokens": 2345424.0, "reward": 1.6000000953674316, "reward_std": 0.7632826209068299, "rewards/boxed_rate/mean": 0.9, "rewards/boxed_rate/std": 0.18492921590805053, "rewards/correctness/mean": 0.2666666656732559, "rewards/correctness/std": 0.3265986442565918, "rewards/multi_component_reward/mean": 0.33333335518836976, "rewards/multi_component_reward/std": 0.44263782203197477, "rewards/no_answer_rate/mean": 0.10000000298023223, "rewards/no_answer_rate/std": 0.18492921590805053, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.06810822673141956, "sampling/importance_sampling_ratio/mean": 0.026497452543117105, "sampling/importance_sampling_ratio/min": 0.0005043454068991798, "sampling/sampling_logp_difference/max": 0.46866347789764407, "sampling/sampling_logp_difference/mean": 0.01661387998610735, "step": 360, "step_time": 35.86930139716715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23333334028720856, "completions/max_length": 1906.6, "completions/max_terminated_length": 1405.8, "completions/mean_length": 1317.56669921875, "completions/mean_terminated_length": 1097.8466796875, "completions/min_length": 801.0, "completions/min_terminated_length": 801.0, "entropy": 0.11761416780451933, "epoch": 0.13296903460837886, "frac_reward_zero_std": 0.0, "grad_norm": 0.015462023206055164, "kl": 0.0003710240945414019, "learning_rate": 4.5648006134969324e-07, "loss": -0.007586926966905594, "num_tokens": 2388119.0, "reward": 1.7300000667572022, "reward_std": 1.0704036235809327, "rewards/boxed_rate/mean": 0.7666666686534882, "rewards/boxed_rate/std": 0.2882087707519531, "rewards/correctness/mean": 0.33333333730697634, "rewards/correctness/std": 0.48566790819168093, "rewards/multi_component_reward/mean": 0.2966666899621487, "rewards/multi_component_reward/std": 0.6780295848846436, "rewards/no_answer_rate/mean": 0.23333334028720856, "rewards/no_answer_rate/std": 0.2882087707519531, "rewards/repetition_rate/mean": 0.10000000298023223, "rewards/repetition_rate/std": 0.24494898319244385, "sampling/importance_sampling_ratio/max": 0.04545931427273899, "sampling/importance_sampling_ratio/mean": 0.02320328297209926, "sampling/importance_sampling_ratio/min": 0.001607318179527617, "sampling/sampling_logp_difference/max": 0.35903769731521606, "sampling/sampling_logp_difference/mean": 0.007444971334189176, "step": 365, "step_time": 43.916714335232975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13333333730697633, "completions/max_length": 1418.0, "completions/max_terminated_length": 1135.4, "completions/mean_length": 861.5666931152343, "completions/mean_terminated_length": 710.4833435058594, "completions/min_length": 346.6, "completions/min_terminated_length": 346.6, "entropy": 0.22315610001484554, "epoch": 0.13479052823315119, "frac_reward_zero_std": 0.0, "grad_norm": 1.7706270217895508, "kl": 0.0008742218895349651, "learning_rate": 4.55521472392638e-07, "loss": 0.003686445206403732, "num_tokens": 2416660.0, "reward": 1.5800000548362731, "reward_std": 0.8671060860157013, "rewards/boxed_rate/mean": 0.8666666746139526, "rewards/boxed_rate/std": 0.20655910968780516, "rewards/correctness/mean": 0.26666667461395266, "rewards/correctness/std": 0.36985843181610106, "rewards/multi_component_reward/mean": 0.31333333253860474, "rewards/multi_component_reward/std": 0.5032015323638916, "rewards/no_answer_rate/mean": 0.13333333730697633, "rewards/no_answer_rate/std": 0.20655910968780516, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.20161812007427216, "sampling/importance_sampling_ratio/mean": 0.09829787239432335, "sampling/importance_sampling_ratio/min": 0.056043115374653285, "sampling/sampling_logp_difference/max": 0.35296247005462644, "sampling/sampling_logp_difference/mean": 0.014671148918569087, "step": 370, "step_time": 33.350730242021385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13333333432674407, "completions/max_length": 1741.8, "completions/max_terminated_length": 1278.2, "completions/mean_length": 958.4000366210937, "completions/mean_terminated_length": 774.8600219726562, "completions/min_length": 536.0, "completions/min_terminated_length": 536.0, "entropy": 0.19409313425421715, "epoch": 0.1366120218579235, "frac_reward_zero_std": 0.0, "grad_norm": 0.10258854180574417, "kl": 0.0005339390016160905, "learning_rate": 4.545628834355828e-07, "loss": 0.0006846859585493803, "num_tokens": 2448526.0, "reward": 1.6966667890548706, "reward_std": 1.038489294052124, "rewards/boxed_rate/mean": 0.8666666626930237, "rewards/boxed_rate/std": 0.19119417667388916, "rewards/correctness/mean": 0.3000000059604645, "rewards/correctness/std": 0.4794029474258423, "rewards/multi_component_reward/mean": 0.3300000175833702, "rewards/multi_component_reward/std": 0.6378163933753968, "rewards/no_answer_rate/mean": 0.13333333432674407, "rewards/no_answer_rate/std": 0.19119417667388916, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.1632993221282959, "sampling/importance_sampling_ratio/max": 0.023186815530061723, "sampling/importance_sampling_ratio/mean": 0.013102744333446026, "sampling/importance_sampling_ratio/min": 0.00010366742491344506, "sampling/sampling_logp_difference/max": 0.3774606704711914, "sampling/sampling_logp_difference/mean": 0.011916628293693065, "step": 375, "step_time": 30.433830817975103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23333334028720856, "completions/max_length": 1764.6, "completions/max_terminated_length": 1185.0, "completions/mean_length": 1108.233349609375, "completions/mean_terminated_length": 801.3400024414062, "completions/min_length": 541.8, "completions/min_terminated_length": 541.8, "entropy": 0.25079266702135405, "epoch": 0.1384335154826958, "frac_reward_zero_std": 0.0, "grad_norm": 0.015607405453920364, "kl": 0.000638273500953801, "learning_rate": 4.536042944785276e-07, "loss": -0.004855351522564888, "num_tokens": 2484623.0, "reward": 1.6300000548362732, "reward_std": 0.6760274231433868, "rewards/boxed_rate/mean": 0.799999988079071, "rewards/boxed_rate/std": 0.35449349880218506, "rewards/correctness/mean": 0.2666666656732559, "rewards/correctness/std": 0.2665788769721985, "rewards/multi_component_reward/mean": 0.23000002317130566, "rewards/multi_component_reward/std": 0.4806109189987183, "rewards/no_answer_rate/mean": 0.20000000298023224, "rewards/no_answer_rate/std": 0.35449349880218506, "rewards/repetition_rate/mean": 0.13333333432674407, "rewards/repetition_rate/std": 0.19119417667388916, "sampling/importance_sampling_ratio/max": 0.030251298751682042, "sampling/importance_sampling_ratio/mean": 0.01690196809358895, "sampling/importance_sampling_ratio/min": 0.0027151843596084375, "sampling/sampling_logp_difference/max": 0.3498866558074951, "sampling/sampling_logp_difference/mean": 0.016159016452729703, "step": 380, "step_time": 15.216538714803756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20000000596046447, "completions/max_length": 1980.6, "completions/max_terminated_length": 1102.2, "completions/mean_length": 1102.2000122070312, "completions/mean_terminated_length": 820.3700073242187, "completions/min_length": 554.4, "completions/min_terminated_length": 554.4, "entropy": 0.21501451556881268, "epoch": 0.14025500910746813, "frac_reward_zero_std": 0.2, "grad_norm": 0.039810437709093094, "kl": 0.0005428103109200795, "learning_rate": 4.5264570552147234e-07, "loss": -0.001014799065887928, "num_tokens": 2521715.0, "reward": 1.33000009059906, "reward_std": 0.713744330406189, "rewards/boxed_rate/mean": 0.8, "rewards/boxed_rate/std": 0.36985843181610106, "rewards/correctness/mean": 0.16666666865348817, "rewards/correctness/std": 0.2728438377380371, "rewards/multi_component_reward/mean": 0.16333335041999816, "rewards/multi_component_reward/std": 0.4538270115852356, "rewards/no_answer_rate/mean": 0.20000000596046447, "rewards/no_answer_rate/std": 0.36985843181610106, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.050251276697963476, "sampling/importance_sampling_ratio/mean": 0.014141232566908002, "sampling/importance_sampling_ratio/min": 1.2361302467010191e-05, "sampling/sampling_logp_difference/max": 0.30763993263244627, "sampling/sampling_logp_difference/mean": 0.014410833641886712, "step": 385, "step_time": 16.672282354161144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.33333333730697634, "completions/max_length": 1734.4, "completions/max_terminated_length": 1527.6, "completions/mean_length": 1290.500048828125, "completions/mean_terminated_length": 1057.36669921875, "completions/min_length": 721.2, "completions/min_terminated_length": 721.2, "entropy": 0.17866969083746273, "epoch": 0.14207650273224043, "frac_reward_zero_std": 0.0, "grad_norm": 0.009094689041376114, "kl": 0.0004922606613642226, "learning_rate": 4.5168711656441713e-07, "loss": -0.0016124360263347626, "num_tokens": 2563196.0, "reward": 1.5066667079925538, "reward_std": 0.9714008927345276, "rewards/boxed_rate/mean": 0.6666666686534881, "rewards/boxed_rate/std": 0.32236858606338503, "rewards/correctness/mean": 0.26666667461395266, "rewards/correctness/std": 0.36985843181610106, "rewards/multi_component_reward/mean": 0.17333333306014537, "rewards/multi_component_reward/std": 0.564999234676361, "rewards/no_answer_rate/mean": 0.33333333730697634, "rewards/no_answer_rate/std": 0.32236858606338503, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.10327955484390258, "sampling/importance_sampling_ratio/max": 0.09146881643682718, "sampling/importance_sampling_ratio/mean": 0.027744953380897643, "sampling/importance_sampling_ratio/min": 0.0008299978789871564, "sampling/sampling_logp_difference/max": 0.44667046070098876, "sampling/sampling_logp_difference/mean": 0.011561266146600246, "step": 390, "step_time": 15.62863542791456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666865348817, "completions/max_length": 1713.0, "completions/max_terminated_length": 1077.4, "completions/mean_length": 995.9333801269531, "completions/mean_terminated_length": 786.106689453125, "completions/min_length": 546.4, "completions/min_terminated_length": 546.4, "entropy": 0.1970411961277326, "epoch": 0.14389799635701275, "frac_reward_zero_std": 0.0, "grad_norm": 0.018740270286798477, "kl": 0.0007653967642302935, "learning_rate": 4.50728527607362e-07, "loss": 0.005758083239197731, "num_tokens": 2595858.0, "reward": 1.4666666984558105, "reward_std": 0.6528747193515301, "rewards/boxed_rate/mean": 0.8333333253860473, "rewards/boxed_rate/std": 0.2728438377380371, "rewards/correctness/mean": 0.20000000596046447, "rewards/correctness/std": 0.2665788769721985, "rewards/multi_component_reward/mean": 0.20000000223517417, "rewards/multi_component_reward/std": 0.46670507490634916, "rewards/no_answer_rate/mean": 0.16666666865348817, "rewards/no_answer_rate/std": 0.2728438377380371, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.1632993221282959, "sampling/importance_sampling_ratio/max": 0.09698612228967249, "sampling/importance_sampling_ratio/mean": 0.030241910181939603, "sampling/importance_sampling_ratio/min": 0.0021489314406773754, "sampling/sampling_logp_difference/max": 0.34886856079101564, "sampling/sampling_logp_difference/mean": 0.012738966010510921, "step": 395, "step_time": 14.849660068005324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666865348817, "completions/max_length": 1575.0, "completions/max_terminated_length": 948.8, "completions/mean_length": 935.7000366210938, "completions/mean_terminated_length": 702.1866821289062, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "entropy": 0.2041997093707323, "epoch": 0.14571948998178508, "frac_reward_zero_std": 0.2, "grad_norm": 0.10930619388818741, "kl": 0.0005768983881959381, "learning_rate": 4.497699386503067e-07, "loss": -0.004532752931118012, "num_tokens": 2626437.0, "reward": 1.7466667890548706, "reward_std": 0.8040140867233276, "rewards/boxed_rate/mean": 0.8333333253860473, "rewards/boxed_rate/std": 0.2728438377380371, "rewards/correctness/mean": 0.33333333134651183, "rewards/correctness/std": 0.35449349880218506, "rewards/multi_component_reward/mean": 0.34666669368743896, "rewards/multi_component_reward/std": 0.5015990376472473, "rewards/no_answer_rate/mean": 0.16666666865348817, "rewards/no_answer_rate/std": 0.2728438377380371, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.10327955484390258, "sampling/importance_sampling_ratio/max": 0.09340296909213067, "sampling/importance_sampling_ratio/mean": 0.03166428431868553, "sampling/importance_sampling_ratio/min": 0.007079391362589009, "sampling/sampling_logp_difference/max": 0.4040722370147705, "sampling/sampling_logp_difference/mean": 0.013007087912410498, "step": 400, "step_time": 13.86912056710571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13333333730697633, "completions/max_length": 1673.6, "completions/max_terminated_length": 1493.4, "completions/mean_length": 1166.2000366210937, "completions/mean_terminated_length": 1052.9500122070312, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "entropy": 0.1420179379483064, "epoch": 0.14754098360655737, "frac_reward_zero_std": 0.4, "grad_norm": 0.021460620686411858, "kl": 0.0005037313070109425, "learning_rate": 4.488113496932515e-07, "loss": -0.0034239646047353745, "num_tokens": 2665191.0, "reward": 1.6500000953674316, "reward_std": 0.5699952363967895, "rewards/boxed_rate/mean": 0.8666666746139526, "rewards/boxed_rate/std": 0.20655910968780516, "rewards/correctness/mean": 0.2666666656732559, "rewards/correctness/std": 0.2665788769721985, "rewards/multi_component_reward/mean": 0.2833333633840084, "rewards/multi_component_reward/std": 0.40620826482772826, "rewards/no_answer_rate/mean": 0.13333333730697633, "rewards/no_answer_rate/std": 0.20655910968780516, "rewards/repetition_rate/mean": 0.10000000298023223, "rewards/repetition_rate/std": 0.18492921590805053, "sampling/importance_sampling_ratio/max": 0.047187263565137985, "sampling/importance_sampling_ratio/mean": 0.016343344282358886, "sampling/importance_sampling_ratio/min": 0.0005224805446630398, "sampling/sampling_logp_difference/max": 0.33698827028274536, "sampling/sampling_logp_difference/mean": 0.009091775678098202, "step": 405, "step_time": 14.932769652456045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2333333373069763, "completions/max_length": 1674.6, "completions/max_terminated_length": 1293.0, "completions/mean_length": 1194.5333679199218, "completions/mean_terminated_length": 945.5333374023437, "completions/min_length": 665.2, "completions/min_terminated_length": 665.2, "entropy": 0.19754986638824146, "epoch": 0.1493624772313297, "frac_reward_zero_std": 0.4, "grad_norm": 0.12479820102453232, "kl": 0.000494376394393233, "learning_rate": 4.4785276073619634e-07, "loss": -0.0048382353037595745, "num_tokens": 2707231.0, "reward": 1.240000104904175, "reward_std": 0.5429712414741517, "rewards/boxed_rate/mean": 0.7666666746139527, "rewards/boxed_rate/std": 0.3161036252975464, "rewards/correctness/mean": 0.13333333432674407, "rewards/correctness/std": 0.19119417667388916, "rewards/multi_component_reward/mean": 0.10666667819023132, "rewards/multi_component_reward/std": 0.35998966693878176, "rewards/no_answer_rate/mean": 0.2333333373069763, "rewards/no_answer_rate/std": 0.3161036252975464, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.10651108641177416, "sampling/importance_sampling_ratio/mean": 0.03249359540641308, "sampling/importance_sampling_ratio/min": 2.793159140067336e-05, "sampling/sampling_logp_difference/max": 0.34277451038360596, "sampling/sampling_logp_difference/mean": 0.01319133285433054, "step": 410, "step_time": 15.258313446864486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20000000596046447, "completions/max_length": 2048.0, "completions/max_terminated_length": 1373.4, "completions/mean_length": 1174.6666870117188, "completions/mean_terminated_length": 982.5400207519531, "completions/min_length": 717.8, "completions/min_terminated_length": 717.8, "entropy": 0.17395039709905782, "epoch": 0.151183970856102, "frac_reward_zero_std": 0.0, "grad_norm": 0.1571768969297409, "kl": 0.0006294176423883376, "learning_rate": 4.468941717791411e-07, "loss": -0.013487279415130615, "num_tokens": 2744805.0, "reward": 1.6100000858306884, "reward_std": 0.9736106634140015, "rewards/boxed_rate/mean": 0.799999988079071, "rewards/boxed_rate/std": 0.4298781991004944, "rewards/correctness/mean": 0.3000000089406967, "rewards/correctness/std": 0.3914883255958557, "rewards/multi_component_reward/mean": 0.31000002175569535, "rewards/multi_component_reward/std": 0.5971025705337525, "rewards/no_answer_rate/mean": 0.20000000596046447, "rewards/no_answer_rate/std": 0.4298781991004944, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.09501664619892836, "sampling/importance_sampling_ratio/mean": 0.02351754056289792, "sampling/importance_sampling_ratio/min": 1.3345446579433804e-05, "sampling/sampling_logp_difference/max": 0.35430474281311036, "sampling/sampling_logp_difference/mean": 0.012131126876920462, "step": 415, "step_time": 17.18821715861559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1233.4, "completions/max_terminated_length": 934.0, "completions/mean_length": 838.5667053222656, "completions/mean_terminated_length": 744.7300109863281, "completions/min_length": 594.4, "completions/min_terminated_length": 594.4, "entropy": 0.17896714533368746, "epoch": 0.15300546448087432, "frac_reward_zero_std": 0.0, "grad_norm": 0.08533581346273422, "kl": 0.00047911926230881363, "learning_rate": 4.4593558282208586e-07, "loss": -0.00437382236123085, "num_tokens": 2771858.0, "reward": 1.5300000667572022, "reward_std": 0.8672845721244812, "rewards/boxed_rate/mean": 0.9, "rewards/boxed_rate/std": 0.18492921590805053, "rewards/correctness/mean": 0.2333333373069763, "rewards/correctness/std": 0.3761233925819397, "rewards/multi_component_reward/mean": 0.29666668176651, "rewards/multi_component_reward/std": 0.49711505472660067, "rewards/no_answer_rate/mean": 0.10000000298023223, "rewards/no_answer_rate/std": 0.18492921590805053, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.07356768772006035, "sampling/importance_sampling_ratio/mean": 0.026540174335241317, "sampling/importance_sampling_ratio/min": 0.0027435134909879364, "sampling/sampling_logp_difference/max": 0.2972338914871216, "sampling/sampling_logp_difference/mean": 0.011641639284789562, "step": 420, "step_time": 11.845613337121904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13333333730697633, "completions/max_length": 1578.6, "completions/max_terminated_length": 1112.6, "completions/mean_length": 957.2333740234375, "completions/mean_terminated_length": 789.5533447265625, "completions/min_length": 506.8, "completions/min_terminated_length": 506.8, "entropy": 0.22455186421672504, "epoch": 0.15482695810564662, "frac_reward_zero_std": 0.0, "grad_norm": 0.011058821342885494, "kl": 0.0005187941482290625, "learning_rate": 4.449769938650307e-07, "loss": -0.0018246347084641457, "num_tokens": 2803527.0, "reward": 1.6033333539962769, "reward_std": 0.8526834607124328, "rewards/boxed_rate/mean": 0.8666666626930237, "rewards/boxed_rate/std": 0.2665788769721985, "rewards/correctness/mean": 0.26666667461395266, "rewards/correctness/std": 0.36985843181610106, "rewards/multi_component_reward/mean": 0.3033333346247673, "rewards/multi_component_reward/std": 0.5273210048675537, "rewards/no_answer_rate/mean": 0.13333333730697633, "rewards/no_answer_rate/std": 0.2665788769721985, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.040888981963507834, "sampling/importance_sampling_ratio/mean": 0.012350313179194928, "sampling/importance_sampling_ratio/min": 4.1122279969854155e-05, "sampling/sampling_logp_difference/max": 0.3840524673461914, "sampling/sampling_logp_difference/mean": 0.014351606741547585, "step": 425, "step_time": 13.965443047881127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666865348817, "completions/max_length": 1702.0, "completions/max_terminated_length": 1172.4, "completions/mean_length": 1089.1000244140625, "completions/mean_terminated_length": 908.7400146484375, "completions/min_length": 716.8, "completions/min_terminated_length": 716.8, "entropy": 0.1683452881872654, "epoch": 0.15664845173041894, "frac_reward_zero_std": 0.2, "grad_norm": 0.014243739657104015, "kl": 0.0004299223937171822, "learning_rate": 4.4401840490797544e-07, "loss": -5.882064579054713e-05, "num_tokens": 2838744.0, "reward": 1.5133334517478942, "reward_std": 0.6114970922470093, "rewards/boxed_rate/mean": 0.8333333253860473, "rewards/boxed_rate/std": 0.2728438377380371, "rewards/correctness/mean": 0.23333333134651185, "rewards/correctness/std": 0.24494898319244385, "rewards/multi_component_reward/mean": 0.24666669517755507, "rewards/multi_component_reward/std": 0.39071274995803834, "rewards/no_answer_rate/mean": 0.16666666865348817, "rewards/no_answer_rate/std": 0.2728438377380371, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.08568799644708633, "sampling/importance_sampling_ratio/mean": 0.032788149639964105, "sampling/importance_sampling_ratio/min": 0.0002142171015298561, "sampling/sampling_logp_difference/max": 0.3399065971374512, "sampling/sampling_logp_difference/mean": 0.011240929551422595, "step": 430, "step_time": 14.933206058479845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2666666716337204, "completions/max_length": 1818.2, "completions/max_terminated_length": 1232.2, "completions/mean_length": 1174.3333618164063, "completions/mean_terminated_length": 910.4933471679688, "completions/min_length": 678.2, "completions/min_terminated_length": 678.2, "entropy": 0.1602409650882085, "epoch": 0.15846994535519127, "frac_reward_zero_std": 0.0, "grad_norm": 0.07029227167367935, "kl": 0.00047036608011694623, "learning_rate": 4.4305981595092023e-07, "loss": -0.004705151170492172, "num_tokens": 2877094.0, "reward": 1.4300000429153443, "reward_std": 0.7005062401294708, "rewards/boxed_rate/mean": 0.7333333373069764, "rewards/boxed_rate/std": 0.3977532863616943, "rewards/correctness/mean": 0.20000000596046447, "rewards/correctness/std": 0.2665788769721985, "rewards/multi_component_reward/mean": 0.13000000715255738, "rewards/multi_component_reward/std": 0.5194472134113312, "rewards/no_answer_rate/mean": 0.2666666716337204, "rewards/no_answer_rate/std": 0.3977532863616943, "rewards/repetition_rate/mean": 0.10000000298023223, "rewards/repetition_rate/std": 0.24494898319244385, "sampling/importance_sampling_ratio/max": 0.13022951409220695, "sampling/importance_sampling_ratio/mean": 0.05298698227852583, "sampling/importance_sampling_ratio/min": 9.12893192064404e-05, "sampling/sampling_logp_difference/max": 0.3845913648605347, "sampling/sampling_logp_difference/mean": 0.01090236771851778, "step": 435, "step_time": 15.83292675986886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06666666865348816, "completions/max_length": 1450.6, "completions/max_terminated_length": 1089.0, "completions/mean_length": 925.2000366210938, "completions/mean_terminated_length": 851.3733581542969, "completions/min_length": 552.8, "completions/min_terminated_length": 552.8, "entropy": 0.19520388320088386, "epoch": 0.16029143897996356, "frac_reward_zero_std": 0.0, "grad_norm": 0.11756877601146698, "kl": 0.0004913257944281213, "learning_rate": 4.4210122699386497e-07, "loss": -0.009752783924341202, "num_tokens": 2907568.0, "reward": 1.783333420753479, "reward_std": 0.8172009825706482, "rewards/boxed_rate/mean": 0.9333333253860474, "rewards/boxed_rate/std": 0.1632993221282959, "rewards/correctness/mean": 0.33333333134651183, "rewards/correctness/std": 0.35449349880218506, "rewards/multi_component_reward/mean": 0.41666669249534605, "rewards/multi_component_reward/std": 0.4753227561712265, "rewards/no_answer_rate/mean": 0.06666666865348816, "rewards/no_answer_rate/std": 0.1632993221282959, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.07866446217522025, "sampling/importance_sampling_ratio/mean": 0.03093269495293498, "sampling/importance_sampling_ratio/min": 0.002900865976889124, "sampling/sampling_logp_difference/max": 0.3607002258300781, "sampling/sampling_logp_difference/mean": 0.01270847897976637, "step": 440, "step_time": 13.279750570654869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03333333432674408, "completions/max_length": 1239.0, "completions/max_terminated_length": 991.2, "completions/mean_length": 793.8333618164063, "completions/mean_terminated_length": 740.3466857910156, "completions/min_length": 537.4, "completions/min_terminated_length": 537.4, "entropy": 0.172708131869634, "epoch": 0.1621129326047359, "frac_reward_zero_std": 0.4, "grad_norm": 0.1338849663734436, "kl": 0.0005728138513707866, "learning_rate": 4.411426380368098e-07, "loss": -0.0031532272696495056, "num_tokens": 2937617.0, "reward": 1.6400001287460326, "reward_std": 0.6522666096687317, "rewards/boxed_rate/mean": 0.9666666626930237, "rewards/boxed_rate/std": 0.08164966106414795, "rewards/correctness/mean": 0.2666666716337204, "rewards/correctness/std": 0.29447373151779177, "rewards/multi_component_reward/mean": 0.37333334386348727, "rewards/multi_component_reward/std": 0.35980162024497986, "rewards/no_answer_rate/mean": 0.03333333432674408, "rewards/no_answer_rate/std": 0.08164966106414795, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.07935073487460613, "sampling/importance_sampling_ratio/mean": 0.024918343313038348, "sampling/importance_sampling_ratio/min": 0.001050433111959137, "sampling/sampling_logp_difference/max": 0.37157559394836426, "sampling/sampling_logp_difference/mean": 0.011105079017579556, "step": 445, "step_time": 11.855768189579248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13333333730697633, "completions/max_length": 1251.4, "completions/max_terminated_length": 829.6, "completions/mean_length": 769.7333618164063, "completions/mean_terminated_length": 598.9666748046875, "completions/min_length": 440.8, "completions/min_terminated_length": 440.8, "entropy": 0.16670421635111174, "epoch": 0.16393442622950818, "frac_reward_zero_std": 0.0, "grad_norm": 0.47747763991355896, "kl": 0.00048284496394141265, "learning_rate": 4.401840490797546e-07, "loss": -0.013033266365528106, "num_tokens": 2963841.0, "reward": 1.9300000429153443, "reward_std": 0.8960777290165425, "rewards/boxed_rate/mean": 0.8666666746139526, "rewards/boxed_rate/std": 0.20655910968780516, "rewards/correctness/mean": 0.40000001192092893, "rewards/correctness/std": 0.41311821937561033, "rewards/multi_component_reward/mean": 0.42999999821186063, "rewards/multi_component_reward/std": 0.601305240392685, "rewards/no_answer_rate/mean": 0.13333333730697633, "rewards/no_answer_rate/std": 0.20655910968780516, "rewards/repetition_rate/mean": 0.10000000298023223, "rewards/repetition_rate/std": 0.18492921590805053, "sampling/importance_sampling_ratio/max": 0.14179300479590892, "sampling/importance_sampling_ratio/mean": 0.06188145540654659, "sampling/importance_sampling_ratio/min": 0.017718057095797223, "sampling/sampling_logp_difference/max": 0.3239980101585388, "sampling/sampling_logp_difference/mean": 0.010319609567523003, "step": 450, "step_time": 11.826861386746168 }, { "epoch": 0.16393442622950818, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.20666667073965073, "eval_completions/max_length": 1688.02, "eval_completions/max_terminated_length": 1185.42, "eval_completions/mean_length": 1123.9933685302735, "eval_completions/mean_terminated_length": 846.4790167236329, "eval_completions/min_length": 676.62, "eval_completions/min_terminated_length": 594.7, "eval_entropy": 0.18630945876240732, "eval_frac_reward_zero_std": 0.16, "eval_kl": 0.0004587844273191877, "eval_loss": -0.0019339508144184947, "eval_num_tokens": 2963841.0, "eval_reward": 1.5870000886917115, "eval_reward_std": 0.7317568999528885, "eval_rewards/boxed_rate/mean": 0.7933333313465118, "eval_rewards/boxed_rate/std": 0.2689453566074371, "eval_rewards/correctness/mean": 0.2733333346247673, "eval_rewards/correctness/std": 0.2873582673072815, "eval_rewards/multi_component_reward/mean": 0.2603333519771695, "eval_rewards/multi_component_reward/std": 0.4510752035677433, "eval_rewards/no_answer_rate/mean": 0.20666667073965073, "eval_rewards/no_answer_rate/std": 0.2689453566074371, "eval_rewards/repetition_rate/mean": 0.05333333492279053, "eval_rewards/repetition_rate/std": 0.09630359530448913, "eval_runtime": 669.3873, "eval_samples_per_second": 0.075, "eval_sampling/importance_sampling_ratio/max": 0.10398212056839838, "eval_sampling/importance_sampling_ratio/mean": 0.035226596340653483, "eval_sampling/importance_sampling_ratio/min": 0.0024896363094269146, "eval_sampling/sampling_logp_difference/max": 0.5326641929149628, "eval_sampling/sampling_logp_difference/mean": 0.012828467870131135, "eval_steps_per_second": 0.013, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13333333730697633, "completions/max_length": 1987.6, "completions/max_terminated_length": 1503.0, "completions/mean_length": 1150.6666870117188, "completions/mean_terminated_length": 990.6999938964843, "completions/min_length": 602.2, "completions/min_terminated_length": 602.2, "entropy": 0.2162185865143935, "epoch": 0.1657559198542805, "frac_reward_zero_std": 0.0, "grad_norm": 0.01271678227931261, "kl": 0.0005497580714290962, "learning_rate": 4.3922546012269933e-07, "loss": 0.0011618516407907009, "num_tokens": 3000941.0, "reward": 1.6966667413711547, "reward_std": 0.9000118732452392, "rewards/boxed_rate/mean": 0.8666666626930237, "rewards/boxed_rate/std": 0.2665788769721985, "rewards/correctness/mean": 0.3000000089406967, "rewards/correctness/std": 0.3914883255958557, "rewards/multi_component_reward/mean": 0.3300000101327896, "rewards/multi_component_reward/std": 0.5810832887887954, "rewards/no_answer_rate/mean": 0.13333333730697633, "rewards/no_answer_rate/std": 0.2665788769721985, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.10327955484390258, "sampling/importance_sampling_ratio/max": 0.11817909283563495, "sampling/importance_sampling_ratio/mean": 0.02859312845394015, "sampling/importance_sampling_ratio/min": 0.00019200471108481247, "sampling/sampling_logp_difference/max": 0.40274404287338256, "sampling/sampling_logp_difference/mean": 0.014035884011536836, "step": 455, "step_time": 16.644585143402217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1666666716337204, "completions/max_length": 1850.4, "completions/max_terminated_length": 1004.8, "completions/mean_length": 914.5666870117187, "completions/mean_terminated_length": 717.0500061035157, "completions/min_length": 396.2, "completions/min_terminated_length": 396.2, "entropy": 0.18004637969036896, "epoch": 0.16757741347905283, "frac_reward_zero_std": 0.0, "grad_norm": 0.051200754940509796, "kl": 0.0005335340790528183, "learning_rate": 4.3826687116564417e-07, "loss": -0.01714204400777817, "num_tokens": 3030898.0, "reward": 1.5366667270660401, "reward_std": 0.7392153739929199, "rewards/boxed_rate/mean": 0.8333333253860473, "rewards/boxed_rate/std": 0.34822853803634646, "rewards/correctness/mean": 0.23333334028720856, "rewards/correctness/std": 0.2882087707519531, "rewards/multi_component_reward/mean": 0.23666667342185974, "rewards/multi_component_reward/std": 0.4870242774486542, "rewards/no_answer_rate/mean": 0.1666666716337204, "rewards/no_answer_rate/std": 0.34822853803634646, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.10327955484390258, "sampling/importance_sampling_ratio/max": 0.1253473086282611, "sampling/importance_sampling_ratio/mean": 0.06275567463599145, "sampling/importance_sampling_ratio/min": 0.0048506295410769285, "sampling/sampling_logp_difference/max": 0.35658023357391355, "sampling/sampling_logp_difference/mean": 0.012955071125179528, "step": 460, "step_time": 15.270139714889229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06666666865348816, "completions/max_length": 1409.8, "completions/max_terminated_length": 850.4, "completions/mean_length": 728.4666931152344, "completions/mean_terminated_length": 629.5000183105469, "completions/min_length": 469.8, "completions/min_terminated_length": 469.8, "entropy": 0.16985683316985767, "epoch": 0.16939890710382513, "frac_reward_zero_std": 0.4, "grad_norm": 0.02098464034497738, "kl": 0.0005443188643160586, "learning_rate": 4.3730828220858896e-07, "loss": -0.007486963272094726, "num_tokens": 3056190.0, "reward": 1.9700001001358032, "reward_std": 0.4863309383392334, "rewards/boxed_rate/mean": 0.9333333253860474, "rewards/boxed_rate/std": 0.1632993221282959, "rewards/correctness/mean": 0.43333333134651186, "rewards/correctness/std": 0.18492921590805053, "rewards/multi_component_reward/mean": 0.5366666868329049, "rewards/multi_component_reward/std": 0.3014017343521118, "rewards/no_answer_rate/mean": 0.06666666865348816, "rewards/no_answer_rate/std": 0.1632993221282959, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.12007005885243416, "sampling/importance_sampling_ratio/mean": 0.05674390830099583, "sampling/importance_sampling_ratio/min": 0.0004069998664532642, "sampling/sampling_logp_difference/max": 0.5247022390365601, "sampling/sampling_logp_difference/mean": 0.012214668095111847, "step": 465, "step_time": 12.395325169526041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13333333730697633, "completions/max_length": 1901.0, "completions/max_terminated_length": 1594.8, "completions/mean_length": 1356.2000244140625, "completions/mean_terminated_length": 1266.056689453125, "completions/min_length": 993.0, "completions/min_terminated_length": 993.0, "entropy": 0.10780975048740706, "epoch": 0.17122040072859745, "frac_reward_zero_std": 0.0, "grad_norm": 0.05852176249027252, "kl": 0.0004035008855377479, "learning_rate": 4.363496932515337e-07, "loss": -0.0037086054682731627, "num_tokens": 3100044.0, "reward": 1.6266667366027832, "reward_std": 0.9011179387569428, "rewards/boxed_rate/mean": 0.8666666626930237, "rewards/boxed_rate/std": 0.2665788769721985, "rewards/correctness/mean": 0.26666666865348815, "rewards/correctness/std": 0.3823883533477783, "rewards/multi_component_reward/mean": 0.2933333463966846, "rewards/multi_component_reward/std": 0.5466254830360413, "rewards/no_answer_rate/mean": 0.13333333730697633, "rewards/no_answer_rate/std": 0.2665788769721985, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.1632993221282959, "sampling/importance_sampling_ratio/max": 0.062121391948312524, "sampling/importance_sampling_ratio/mean": 0.0237305941991508, "sampling/importance_sampling_ratio/min": 0.004560414611228225, "sampling/sampling_logp_difference/max": 0.33158082962036134, "sampling/sampling_logp_difference/mean": 0.007295731455087662, "step": 470, "step_time": 16.880542963929475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1782.4, "completions/max_terminated_length": 1526.4, "completions/mean_length": 1103.5000427246093, "completions/mean_terminated_length": 1016.5600524902344, "completions/min_length": 741.8, "completions/min_terminated_length": 741.8, "entropy": 0.1433272872120142, "epoch": 0.17304189435336975, "frac_reward_zero_std": 0.2, "grad_norm": 0.03871724009513855, "kl": 0.0003871701846946962, "learning_rate": 4.353911042944785e-07, "loss": -0.0024786345660686494, "num_tokens": 3136959.0, "reward": 1.1100000619888306, "reward_std": 0.318433678150177, "rewards/boxed_rate/mean": 0.899999988079071, "rewards/boxed_rate/std": 0.24494898319244385, "rewards/correctness/mean": 0.03333333432674408, "rewards/correctness/std": 0.08164966106414795, "rewards/multi_component_reward/mean": 0.0766666665673256, "rewards/multi_component_reward/std": 0.23678402006626129, "rewards/no_answer_rate/mean": 0.10000000298023223, "rewards/no_answer_rate/std": 0.24494898319244385, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.0617107892408967, "sampling/importance_sampling_ratio/mean": 0.02746242703869939, "sampling/importance_sampling_ratio/min": 8.897909594907105e-05, "sampling/sampling_logp_difference/max": 0.3906729698181152, "sampling/sampling_logp_difference/mean": 0.009637977462261915, "step": 475, "step_time": 15.594477642700076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20000000298023224, "completions/max_length": 1867.2, "completions/max_terminated_length": 1634.6, "completions/mean_length": 1205.7000244140625, "completions/mean_terminated_length": 989.493359375, "completions/min_length": 550.8, "completions/min_terminated_length": 550.8, "entropy": 0.16859658484657605, "epoch": 0.17486338797814208, "frac_reward_zero_std": 0.0, "grad_norm": 0.0642157569527626, "kl": 0.0005781898080992202, "learning_rate": 4.3443251533742333e-07, "loss": -0.0028929352760314942, "num_tokens": 3175776.0, "reward": 1.3066667556762694, "reward_std": 0.6569535873830319, "rewards/boxed_rate/mean": 0.8, "rewards/boxed_rate/std": 0.29447373151779177, "rewards/correctness/mean": 0.13333333730697633, "rewards/correctness/std": 0.2665788769721985, "rewards/multi_component_reward/mean": 0.1066666692495346, "rewards/multi_component_reward/std": 0.4782795637845993, "rewards/no_answer_rate/mean": 0.20000000298023224, "rewards/no_answer_rate/std": 0.29447373151779177, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.10327955484390258, "sampling/importance_sampling_ratio/max": 0.06810643598437309, "sampling/importance_sampling_ratio/mean": 0.01840525190345943, "sampling/importance_sampling_ratio/min": 0.0006505777751514535, "sampling/sampling_logp_difference/max": 0.43449815511703493, "sampling/sampling_logp_difference/mean": 0.011115654464811086, "step": 480, "step_time": 15.972666030749679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666865348817, "completions/max_length": 1663.0, "completions/max_terminated_length": 1303.0, "completions/mean_length": 1173.3333740234375, "completions/mean_terminated_length": 989.41669921875, "completions/min_length": 683.0, "completions/min_terminated_length": 683.0, "entropy": 0.14007456911106905, "epoch": 0.1766848816029144, "frac_reward_zero_std": 0.0, "grad_norm": 0.010275622829794884, "kl": 0.00036965057806810365, "learning_rate": 4.3347392638036806e-07, "loss": 0.0001720041735097766, "num_tokens": 3214522.0, "reward": 1.9766667842864991, "reward_std": 0.9272954225540161, "rewards/boxed_rate/mean": 0.8666666626930237, "rewards/boxed_rate/std": 0.19119417667388916, "rewards/correctness/mean": 0.43333333134651186, "rewards/correctness/std": 0.404018247127533, "rewards/multi_component_reward/mean": 0.47666670083999635, "rewards/multi_component_reward/std": 0.5632504820823669, "rewards/no_answer_rate/mean": 0.13333333432674407, "rewards/no_answer_rate/std": 0.19119417667388916, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.10327955484390258, "sampling/importance_sampling_ratio/max": 0.05611779941245913, "sampling/importance_sampling_ratio/mean": 0.021316760312765838, "sampling/importance_sampling_ratio/min": 0.0011236143288555823, "sampling/sampling_logp_difference/max": 0.3524420738220215, "sampling/sampling_logp_difference/mean": 0.00864975331351161, "step": 485, "step_time": 15.02983694653958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1206.8, "completions/max_terminated_length": 934.0, "completions/mean_length": 728.1666748046875, "completions/mean_terminated_length": 595.9966796875, "completions/min_length": 426.2, "completions/min_terminated_length": 426.2, "entropy": 0.23062201142311095, "epoch": 0.1785063752276867, "frac_reward_zero_std": 0.2, "grad_norm": 0.018157748505473137, "kl": 0.001033562931115739, "learning_rate": 4.3251533742331285e-07, "loss": 0.014268814027309418, "num_tokens": 3238755.0, "reward": 1.833333384990692, "reward_std": 0.6647640764713287, "rewards/boxed_rate/mean": 0.9, "rewards/boxed_rate/std": 0.18492921590805053, "rewards/correctness/mean": 0.3666666686534882, "rewards/correctness/std": 0.2882087707519531, "rewards/multi_component_reward/mean": 0.43333335071802137, "rewards/multi_component_reward/std": 0.4189693987369537, "rewards/no_answer_rate/mean": 0.10000000298023223, "rewards/no_answer_rate/std": 0.18492921590805053, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.12710050344467164, "sampling/importance_sampling_ratio/mean": 0.05628317659720779, "sampling/importance_sampling_ratio/min": 0.015683322838595436, "sampling/sampling_logp_difference/max": 0.36403354406356814, "sampling/sampling_logp_difference/mean": 0.015313766896724701, "step": 490, "step_time": 11.357261310145258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1666666716337204, "completions/max_length": 1498.2, "completions/max_terminated_length": 1076.4, "completions/mean_length": 1013.5000305175781, "completions/mean_terminated_length": 849.3533508300782, "completions/min_length": 602.6, "completions/min_terminated_length": 602.6, "entropy": 0.14790264442563056, "epoch": 0.18032786885245902, "frac_reward_zero_std": 0.4, "grad_norm": 0.034156352281570435, "kl": 0.0005029304617589029, "learning_rate": 4.315567484662577e-07, "loss": -0.010992055386304855, "num_tokens": 3274110.0, "reward": 1.4433334589004516, "reward_std": 0.6818739175796509, "rewards/boxed_rate/mean": 0.8333333313465119, "rewards/boxed_rate/std": 0.18492921590805053, "rewards/correctness/mean": 0.20000000298023224, "rewards/correctness/std": 0.29447373151779177, "rewards/multi_component_reward/mean": 0.21000000536441804, "rewards/multi_component_reward/std": 0.4088459849357605, "rewards/no_answer_rate/mean": 0.1666666716337204, "rewards/no_answer_rate/std": 0.18492921590805053, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.10836826749145985, "sampling/importance_sampling_ratio/mean": 0.04134368039667606, "sampling/importance_sampling_ratio/min": 0.0007355699201900378, "sampling/sampling_logp_difference/max": 0.5285765409469605, "sampling/sampling_logp_difference/mean": 0.00972840515896678, "step": 495, "step_time": 13.717384218610823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1666666716337204, "completions/max_length": 1768.6, "completions/max_terminated_length": 1172.4, "completions/mean_length": 1021.0333618164062, "completions/mean_terminated_length": 829.9766723632813, "completions/min_length": 574.6, "completions/min_terminated_length": 574.6, "entropy": 0.1277496966222922, "epoch": 0.18214936247723132, "frac_reward_zero_std": 0.2, "grad_norm": 0.036998555064201355, "kl": 0.00041782591021425713, "learning_rate": 4.3059815950920243e-07, "loss": -0.014814507961273194, "num_tokens": 3306649.0, "reward": 1.9100001335144043, "reward_std": 0.8856898427009583, "rewards/boxed_rate/mean": 0.8333333373069763, "rewards/boxed_rate/std": 0.2882087707519531, "rewards/correctness/mean": 0.4333333373069763, "rewards/correctness/std": 0.3761233925819397, "rewards/multi_component_reward/mean": 0.4766666904091835, "rewards/multi_component_reward/std": 0.5212879002094268, "rewards/no_answer_rate/mean": 0.1666666716337204, "rewards/no_answer_rate/std": 0.2882087707519531, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.2542514935135841, "sampling/importance_sampling_ratio/mean": 0.09570314064621925, "sampling/importance_sampling_ratio/min": 0.0020768470163739948, "sampling/sampling_logp_difference/max": 0.34144341945648193, "sampling/sampling_logp_difference/mean": 0.010077364556491376, "step": 500, "step_time": 15.174134342558682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1232.6, "completions/max_terminated_length": 1232.6, "completions/mean_length": 928.7333618164063, "completions/mean_terminated_length": 928.7333618164063, "completions/min_length": 612.8, "completions/min_terminated_length": 612.8, "entropy": 0.15919652469456197, "epoch": 0.18397085610200364, "frac_reward_zero_std": 0.4, "grad_norm": 0.03940599039196968, "kl": 0.0004976143633636335, "learning_rate": 4.296395705521472e-07, "loss": 8.051458280533552e-05, "num_tokens": 3338765.0, "reward": 1.6133334398269654, "reward_std": 0.5944257378578186, "rewards/boxed_rate/mean": 1.0, "rewards/boxed_rate/std": 0.0, "rewards/correctness/mean": 0.23333334028720856, "rewards/correctness/std": 0.2882087707519531, "rewards/multi_component_reward/mean": 0.34666666090488435, "rewards/multi_component_reward/std": 0.3269805133342743, "rewards/no_answer_rate/mean": 0.0, "rewards/no_answer_rate/std": 0.0, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.12865251041948794, "sampling/importance_sampling_ratio/mean": 0.046758548077195886, "sampling/importance_sampling_ratio/min": 0.001370607664284762, "sampling/sampling_logp_difference/max": 0.37642626762390136, "sampling/sampling_logp_difference/mean": 0.010083456197753549, "step": 505, "step_time": 12.115066050365566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03333333432674408, "completions/max_length": 1233.6, "completions/max_terminated_length": 1011.6, "completions/mean_length": 741.0666870117187, "completions/mean_terminated_length": 699.6933471679688, "completions/min_length": 502.8, "completions/min_terminated_length": 502.8, "entropy": 0.14653264259298643, "epoch": 0.18579234972677597, "frac_reward_zero_std": 0.0, "grad_norm": 0.022397559136152267, "kl": 0.0004972440418593275, "learning_rate": 4.28680981595092e-07, "loss": -0.010462169349193574, "num_tokens": 3363841.0, "reward": 2.0600001335144045, "reward_std": 0.9187898755073547, "rewards/boxed_rate/mean": 0.9666666626930237, "rewards/boxed_rate/std": 0.08164966106414795, "rewards/correctness/mean": 0.4666666626930237, "rewards/correctness/std": 0.4298781991004944, "rewards/multi_component_reward/mean": 0.5933333843946457, "rewards/multi_component_reward/std": 0.49362565875053405, "rewards/no_answer_rate/mean": 0.03333333432674408, "rewards/no_answer_rate/std": 0.08164966106414795, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.16412589382380247, "sampling/importance_sampling_ratio/mean": 0.0629353643860668, "sampling/importance_sampling_ratio/min": 0.011790907313888965, "sampling/sampling_logp_difference/max": 0.3437652587890625, "sampling/sampling_logp_difference/mean": 0.009886026009917259, "step": 510, "step_time": 11.563566098548472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13333333730697633, "completions/max_length": 1427.2, "completions/max_terminated_length": 666.4, "completions/mean_length": 705.2000061035156, "completions/mean_terminated_length": 511.7700134277344, "completions/min_length": 344.2, "completions/min_terminated_length": 344.2, "entropy": 0.26261382003625233, "epoch": 0.18761384335154827, "frac_reward_zero_std": 0.2, "grad_norm": 0.10859289020299911, "kl": 0.0008684398761639992, "learning_rate": 4.277223926380368e-07, "loss": -0.002386706694960594, "num_tokens": 3386899.0, "reward": 1.3700001001358033, "reward_std": 0.7070007205009461, "rewards/boxed_rate/mean": 0.8666666626930237, "rewards/boxed_rate/std": 0.2665788769721985, "rewards/correctness/mean": 0.1666666716337204, "rewards/correctness/std": 0.2882087707519531, "rewards/multi_component_reward/mean": 0.20333334654569626, "rewards/multi_component_reward/std": 0.4274008393287659, "rewards/no_answer_rate/mean": 0.13333333730697633, "rewards/no_answer_rate/std": 0.2665788769721985, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.07567991334944964, "sampling/importance_sampling_ratio/mean": 0.024421159457415343, "sampling/importance_sampling_ratio/min": 0.001837286539412988, "sampling/sampling_logp_difference/max": 0.34601569175720215, "sampling/sampling_logp_difference/mean": 0.017737336456775665, "step": 515, "step_time": 12.585359536856412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13333333730697633, "completions/max_length": 1618.2, "completions/max_terminated_length": 1045.2, "completions/mean_length": 936.9666870117187, "completions/mean_terminated_length": 758.3166870117187, "completions/min_length": 469.8, "completions/min_terminated_length": 469.8, "entropy": 0.1936116027335326, "epoch": 0.1894353369763206, "frac_reward_zero_std": 0.0, "grad_norm": 0.0316350944340229, "kl": 0.0006832293069843824, "learning_rate": 4.267638036809816e-07, "loss": -0.008644417673349381, "num_tokens": 3417600.0, "reward": 1.5800000786781312, "reward_std": 0.7550430953502655, "rewards/boxed_rate/mean": 0.8666666626930237, "rewards/boxed_rate/std": 0.2665788769721985, "rewards/correctness/mean": 0.2666666716337204, "rewards/correctness/std": 0.29447373151779177, "rewards/multi_component_reward/mean": 0.3133333310484886, "rewards/multi_component_reward/std": 0.46337632834911346, "rewards/no_answer_rate/mean": 0.13333333730697633, "rewards/no_answer_rate/std": 0.2665788769721985, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.0883142702281475, "sampling/importance_sampling_ratio/mean": 0.030587387550622226, "sampling/importance_sampling_ratio/min": 0.0063572791577344466, "sampling/sampling_logp_difference/max": 0.3835047721862793, "sampling/sampling_logp_difference/mean": 0.013410749472677707, "step": 520, "step_time": 13.994114908203482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06666666865348816, "completions/max_length": 1382.6, "completions/max_terminated_length": 1288.6, "completions/mean_length": 903.3000122070313, "completions/mean_terminated_length": 852.4833374023438, "completions/min_length": 593.0, "completions/min_terminated_length": 593.0, "entropy": 0.18077843909462293, "epoch": 0.1912568306010929, "frac_reward_zero_std": 0.2, "grad_norm": 0.024261396378278732, "kl": 0.0005541147484715718, "learning_rate": 4.2580521472392637e-07, "loss": -0.0031598765403032304, "num_tokens": 3448035.0, "reward": 1.8300000786781312, "reward_std": 0.6672061026096344, "rewards/boxed_rate/mean": 0.9333333373069763, "rewards/boxed_rate/std": 0.10327955484390258, "rewards/correctness/mean": 0.3666666686534882, "rewards/correctness/std": 0.2882087707519531, "rewards/multi_component_reward/mean": 0.4633333504199982, "rewards/multi_component_reward/std": 0.378997391462326, "rewards/no_answer_rate/mean": 0.06666666865348816, "rewards/no_answer_rate/std": 0.10327955484390258, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.10047482601366937, "sampling/importance_sampling_ratio/mean": 0.02871947007952258, "sampling/importance_sampling_ratio/min": 0.0007279034301063802, "sampling/sampling_logp_difference/max": 0.41403778791427615, "sampling/sampling_logp_difference/mean": 0.011369843780994416, "step": 525, "step_time": 12.759401306509972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20000000596046447, "completions/max_length": 1729.4, "completions/max_terminated_length": 1090.4, "completions/mean_length": 1117.200048828125, "completions/mean_terminated_length": 875.8833618164062, "completions/min_length": 682.2, "completions/min_terminated_length": 682.2, "entropy": 0.18201239357391993, "epoch": 0.1930783242258652, "frac_reward_zero_std": 0.0, "grad_norm": 0.00892422441393137, "kl": 0.000488846098111632, "learning_rate": 4.2484662576687116e-07, "loss": -0.004933690279722213, "num_tokens": 3483861.0, "reward": 1.3033334016799927, "reward_std": 0.6930439114570618, "rewards/boxed_rate/mean": 0.8333333253860473, "rewards/boxed_rate/std": 0.34822853803634646, "rewards/correctness/mean": 0.13333333730697633, "rewards/correctness/std": 0.2665788769721985, "rewards/multi_component_reward/mean": 0.13666666969656943, "rewards/multi_component_reward/std": 0.4622148424386978, "rewards/no_answer_rate/mean": 0.1666666716337204, "rewards/no_answer_rate/std": 0.34822853803634646, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.06577782724052668, "sampling/importance_sampling_ratio/mean": 0.02595797423273325, "sampling/importance_sampling_ratio/min": 0.0002929158277353877, "sampling/sampling_logp_difference/max": 0.355899715423584, "sampling/sampling_logp_difference/mean": 0.012041485961526632, "step": 530, "step_time": 15.400034189224243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1666666716337204, "completions/max_length": 1707.4, "completions/max_terminated_length": 958.8, "completions/mean_length": 998.1000122070312, "completions/mean_terminated_length": 766.7933349609375, "completions/min_length": 525.4, "completions/min_terminated_length": 525.4, "entropy": 0.22935964514811832, "epoch": 0.19489981785063754, "frac_reward_zero_std": 0.2, "grad_norm": 0.020105650648474693, "kl": 0.000557902626072367, "learning_rate": 4.2388803680981595e-07, "loss": -0.0007442331407219172, "num_tokens": 3518562.0, "reward": 1.2100000977516174, "reward_std": 0.6208420097827911, "rewards/boxed_rate/mean": 0.8333333373069763, "rewards/boxed_rate/std": 0.2882087707519531, "rewards/correctness/mean": 0.10000000298023223, "rewards/correctness/std": 0.24494898319244385, "rewards/multi_component_reward/mean": 0.11000000834465026, "rewards/multi_component_reward/std": 0.38656076192855837, "rewards/no_answer_rate/mean": 0.1666666716337204, "rewards/no_answer_rate/std": 0.2882087707519531, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.015947306994348764, "sampling/importance_sampling_ratio/mean": 0.004562706989236176, "sampling/importance_sampling_ratio/min": 2.5106548244674746e-05, "sampling/sampling_logp_difference/max": 0.36506946086883546, "sampling/sampling_logp_difference/mean": 0.014524615183472634, "step": 535, "step_time": 14.785470444336534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13333333730697633, "completions/max_length": 2011.6, "completions/max_terminated_length": 1252.0, "completions/mean_length": 974.2000366210938, "completions/mean_terminated_length": 799.5733703613281, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "entropy": 0.20504479110240936, "epoch": 0.19672131147540983, "frac_reward_zero_std": 0.2, "grad_norm": 0.016418447718024254, "kl": 0.0007665569457458333, "learning_rate": 4.229294478527607e-07, "loss": -0.004444469138979912, "num_tokens": 3550332.0, "reward": 1.5333333969116212, "reward_std": 0.6269734501838684, "rewards/boxed_rate/mean": 0.8666666507720947, "rewards/boxed_rate/std": 0.3265986442565918, "rewards/correctness/mean": 0.23333333134651185, "rewards/correctness/std": 0.24494898319244385, "rewards/multi_component_reward/mean": 0.266666679084301, "rewards/multi_component_reward/std": 0.4252805233001709, "rewards/no_answer_rate/mean": 0.13333333730697633, "rewards/no_answer_rate/std": 0.3265986442565918, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.0710077840834856, "sampling/importance_sampling_ratio/mean": 0.0242210254073143, "sampling/importance_sampling_ratio/min": 2.3673296637852498e-05, "sampling/sampling_logp_difference/max": 0.37494783401489257, "sampling/sampling_logp_difference/mean": 0.013403672352433204, "step": 540, "step_time": 16.360854005627335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1755.4, "completions/max_terminated_length": 1206.4, "completions/mean_length": 827.3000122070313, "completions/mean_terminated_length": 693.3333374023438, "completions/min_length": 476.2, "completions/min_terminated_length": 476.2, "entropy": 0.20222537256777287, "epoch": 0.19854280510018216, "frac_reward_zero_std": 0.0, "grad_norm": 0.07017825543880463, "kl": 0.0005556545128153327, "learning_rate": 4.219708588957055e-07, "loss": -0.0067356817424297334, "num_tokens": 3577779.0, "reward": 1.8100000619888306, "reward_std": 0.7233119249343872, "rewards/boxed_rate/mean": 0.899999988079071, "rewards/boxed_rate/std": 0.24494898319244385, "rewards/correctness/mean": 0.3666666686534882, "rewards/correctness/std": 0.2882087707519531, "rewards/multi_component_reward/mean": 0.44333335757255554, "rewards/multi_component_reward/std": 0.43861203789711, "rewards/no_answer_rate/mean": 0.10000000298023223, "rewards/no_answer_rate/std": 0.24494898319244385, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.07237468343228101, "sampling/importance_sampling_ratio/mean": 0.02907364722341299, "sampling/importance_sampling_ratio/min": 0.003791734886874308, "sampling/sampling_logp_difference/max": 0.3440469026565552, "sampling/sampling_logp_difference/mean": 0.013698535226285457, "step": 545, "step_time": 14.568762712925672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13333333730697633, "completions/max_length": 1740.6, "completions/max_terminated_length": 1098.0, "completions/mean_length": 972.5000366210937, "completions/mean_terminated_length": 813.2100280761719, "completions/min_length": 548.8, "completions/min_terminated_length": 548.8, "entropy": 0.15388319492340088, "epoch": 0.20036429872495445, "frac_reward_zero_std": 0.0, "grad_norm": 0.09412358701229095, "kl": 0.00047789589831760774, "learning_rate": 4.210122699386503e-07, "loss": 0.0004236143082380295, "num_tokens": 3610650.0, "reward": 1.7666667222976684, "reward_std": 0.748459929227829, "rewards/boxed_rate/mean": 0.8666666626930237, "rewards/boxed_rate/std": 0.2665788769721985, "rewards/correctness/mean": 0.3333333432674408, "rewards/correctness/std": 0.3098386645317078, "rewards/multi_component_reward/mean": 0.3666666656732559, "rewards/multi_component_reward/std": 0.5095339387655258, "rewards/no_answer_rate/mean": 0.13333333730697633, "rewards/no_answer_rate/std": 0.2665788769721985, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.1632993221282959, "sampling/importance_sampling_ratio/max": 0.07994540650397539, "sampling/importance_sampling_ratio/mean": 0.0390994424931705, "sampling/importance_sampling_ratio/min": 0.0009056368981219976, "sampling/sampling_logp_difference/max": 0.3753530740737915, "sampling/sampling_logp_difference/mean": 0.009989727940410375, "step": 550, "step_time": 15.042225486785174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13333333730697633, "completions/max_length": 1604.4, "completions/max_terminated_length": 979.4, "completions/mean_length": 832.6666870117188, "completions/mean_terminated_length": 665.1333374023437, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "entropy": 0.2048481193681558, "epoch": 0.20218579234972678, "frac_reward_zero_std": 0.0, "grad_norm": 0.08716036379337311, "kl": 0.000548629568462881, "learning_rate": 4.2005368098159505e-07, "loss": -0.005523241683840751, "num_tokens": 3638024.0, "reward": 1.6033334016799927, "reward_std": 0.797308748960495, "rewards/boxed_rate/mean": 0.8666666626930237, "rewards/boxed_rate/std": 0.2665788769721985, "rewards/correctness/mean": 0.26666666865348815, "rewards/correctness/std": 0.32236858606338503, "rewards/multi_component_reward/mean": 0.30333334803581236, "rewards/multi_component_reward/std": 0.5069476455450058, "rewards/no_answer_rate/mean": 0.13333333730697633, "rewards/no_answer_rate/std": 0.2665788769721985, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.07022089585661888, "sampling/importance_sampling_ratio/mean": 0.022322331066243352, "sampling/importance_sampling_ratio/min": 0.000656013962759859, "sampling/sampling_logp_difference/max": 0.3337650179862976, "sampling/sampling_logp_difference/mean": 0.013166856206953525, "step": 555, "step_time": 13.850998834148049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06666666865348816, "completions/max_length": 1198.2, "completions/max_terminated_length": 1077.2, "completions/mean_length": 757.1333618164062, "completions/mean_terminated_length": 690.2466918945313, "completions/min_length": 448.2, "completions/min_terminated_length": 448.2, "entropy": 0.18011298303802808, "epoch": 0.2040072859744991, "frac_reward_zero_std": 0.0, "grad_norm": 0.01096227765083313, "kl": 0.0006569843031077956, "learning_rate": 4.1909509202453984e-07, "loss": -0.007073364406824112, "num_tokens": 3663726.0, "reward": 1.9000000715255738, "reward_std": 0.8417381644248962, "rewards/boxed_rate/mean": 0.9333333253860474, "rewards/boxed_rate/std": 0.1632993221282959, "rewards/correctness/mean": 0.40000000298023225, "rewards/correctness/std": 0.36985843181610106, "rewards/multi_component_reward/mean": 0.5000000268220901, "rewards/multi_component_reward/std": 0.4765937089920044, "rewards/no_answer_rate/mean": 0.06666666865348816, "rewards/no_answer_rate/std": 0.1632993221282959, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.09752159416675568, "sampling/importance_sampling_ratio/mean": 0.05281178932636976, "sampling/importance_sampling_ratio/min": 0.003194372950383695, "sampling/sampling_logp_difference/max": 0.3607391357421875, "sampling/sampling_logp_difference/mean": 0.011687950044870377, "step": 560, "step_time": 11.587789290212095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20000000298023224, "completions/max_length": 1599.4, "completions/max_terminated_length": 1067.8, "completions/mean_length": 1085.8000244140626, "completions/mean_terminated_length": 867.1133544921875, "completions/min_length": 651.6, "completions/min_terminated_length": 651.6, "entropy": 0.12313016926248868, "epoch": 0.2058287795992714, "frac_reward_zero_std": 0.2, "grad_norm": 0.05608590692281723, "kl": 0.00031730876992999886, "learning_rate": 4.181365030674847e-07, "loss": 0.0087483711540699, "num_tokens": 3699090.0, "reward": 1.7966667652130126, "reward_std": 0.8426974892616272, "rewards/boxed_rate/mean": 0.8, "rewards/boxed_rate/std": 0.29447373151779177, "rewards/correctness/mean": 0.36666667461395264, "rewards/correctness/std": 0.3161036252975464, "rewards/multi_component_reward/mean": 0.36333334594964983, "rewards/multi_component_reward/std": 0.5058179676532746, "rewards/no_answer_rate/mean": 0.20000000298023224, "rewards/no_answer_rate/std": 0.29447373151779177, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.10327955484390258, "sampling/importance_sampling_ratio/max": 0.10868240725831128, "sampling/importance_sampling_ratio/mean": 0.04146801241877256, "sampling/importance_sampling_ratio/min": 0.008836518857719966, "sampling/sampling_logp_difference/max": 0.3564323544502258, "sampling/sampling_logp_difference/mean": 0.007883247546851635, "step": 565, "step_time": 14.454759103059768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.33333333432674406, "completions/max_length": 1746.6, "completions/max_terminated_length": 973.4, "completions/mean_length": 1263.9000244140625, "completions/mean_terminated_length": 730.0800231933594, "completions/min_length": 952.8, "completions/min_terminated_length": 543.2, "entropy": 0.14238746762275695, "epoch": 0.20765027322404372, "frac_reward_zero_std": 0.4, "grad_norm": 0.01621309295296669, "kl": 0.0003668556613168524, "learning_rate": 4.171779141104294e-07, "loss": 0.00620177686214447, "num_tokens": 3740091.0, "reward": 1.4800001382827759, "reward_std": 0.5089709401130676, "rewards/boxed_rate/mean": 0.6666666626930237, "rewards/boxed_rate/std": 0.19119417667388916, "rewards/correctness/mean": 0.2666666626930237, "rewards/correctness/std": 0.19119417667388916, "rewards/multi_component_reward/mean": 0.17999999970197678, "rewards/multi_component_reward/std": 0.30394219756126406, "rewards/no_answer_rate/mean": 0.33333333432674406, "rewards/no_answer_rate/std": 0.19119417667388916, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.09319847710430622, "sampling/importance_sampling_ratio/mean": 0.04575341045856476, "sampling/importance_sampling_ratio/min": 0.004630656010958226, "sampling/sampling_logp_difference/max": 0.3321978569030762, "sampling/sampling_logp_difference/mean": 0.009725131979212164, "step": 570, "step_time": 15.637936590984463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13333333432674407, "completions/max_length": 1493.2, "completions/max_terminated_length": 1230.4, "completions/mean_length": 1086.3333618164063, "completions/mean_terminated_length": 990.8466918945312, "completions/min_length": 756.6, "completions/min_terminated_length": 756.6, "entropy": 0.1577371135354042, "epoch": 0.20947176684881602, "frac_reward_zero_std": 0.2, "grad_norm": 0.017710594460368156, "kl": 0.0003999709859878446, "learning_rate": 4.162193251533742e-07, "loss": -0.0008659601211547851, "num_tokens": 3778573.0, "reward": 1.2533334612846374, "reward_std": 0.6350648760795593, "rewards/boxed_rate/mean": 0.8666666626930237, "rewards/boxed_rate/std": 0.19119417667388916, "rewards/correctness/mean": 0.10000000298023223, "rewards/correctness/std": 0.24494898319244385, "rewards/multi_component_reward/mean": 0.12000000327825547, "rewards/multi_component_reward/std": 0.34919504523277284, "rewards/no_answer_rate/mean": 0.13333333432674407, "rewards/no_answer_rate/std": 0.19119417667388916, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.021646471600979565, "sampling/importance_sampling_ratio/mean": 0.009971876302734018, "sampling/importance_sampling_ratio/min": 0.0019394792594557403, "sampling/sampling_logp_difference/max": 0.331281840801239, "sampling/sampling_logp_difference/mean": 0.01047133533284068, "step": 575, "step_time": 13.984978386759758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03333333432674408, "completions/max_length": 1077.0, "completions/max_terminated_length": 843.4, "completions/mean_length": 618.633349609375, "completions/mean_terminated_length": 570.0800170898438, "completions/min_length": 312.4, "completions/min_terminated_length": 312.4, "entropy": 0.18354702678819498, "epoch": 0.21129326047358835, "frac_reward_zero_std": 0.2, "grad_norm": 0.01734095625579357, "kl": 0.0007040846471985181, "learning_rate": 4.15260736196319e-07, "loss": -0.034613341093063354, "num_tokens": 3799994.0, "reward": 1.733333420753479, "reward_std": 0.8469935655593872, "rewards/boxed_rate/mean": 0.9666666626930237, "rewards/boxed_rate/std": 0.08164966106414795, "rewards/correctness/mean": 0.3000000029802322, "rewards/correctness/std": 0.404018247127533, "rewards/multi_component_reward/mean": 0.4000000163912773, "rewards/multi_component_reward/std": 0.481517493724823, "rewards/no_answer_rate/mean": 0.03333333432674408, "rewards/no_answer_rate/std": 0.08164966106414795, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.2370181828737259, "sampling/importance_sampling_ratio/mean": 0.07491586618125438, "sampling/importance_sampling_ratio/min": 0.017679853770823685, "sampling/sampling_logp_difference/max": 0.2730631709098816, "sampling/sampling_logp_difference/mean": 0.011196557991206646, "step": 580, "step_time": 10.380646949820221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1423.0, "completions/max_terminated_length": 1241.8, "completions/mean_length": 999.9000366210937, "completions/mean_terminated_length": 926.8766967773438, "completions/min_length": 742.8, "completions/min_terminated_length": 742.8, "entropy": 0.17347905735174815, "epoch": 0.21311475409836064, "frac_reward_zero_std": 0.2, "grad_norm": 0.019545914605259895, "kl": 0.0004457860350764046, "learning_rate": 4.143021472392638e-07, "loss": -0.0001349434838630259, "num_tokens": 3834629.0, "reward": 1.4100001215934754, "reward_std": 0.6349397718906402, "rewards/boxed_rate/mean": 0.9333333373069763, "rewards/boxed_rate/std": 0.10327955484390258, "rewards/correctness/mean": 0.16666666865348817, "rewards/correctness/std": 0.2728438377380371, "rewards/multi_component_reward/mean": 0.2433333396911621, "rewards/multi_component_reward/std": 0.36209596991539, "rewards/no_answer_rate/mean": 0.06666666865348816, "rewards/no_answer_rate/std": 0.10327955484390258, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.04343400467187166, "sampling/importance_sampling_ratio/mean": 0.013024529488757253, "sampling/importance_sampling_ratio/min": 0.001408253621775657, "sampling/sampling_logp_difference/max": 0.5604913473129273, "sampling/sampling_logp_difference/mean": 0.010900301579385995, "step": 585, "step_time": 13.423828819952906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1522.6, "completions/max_terminated_length": 1304.6, "completions/mean_length": 1048.300048828125, "completions/mean_terminated_length": 953.2133544921875, "completions/min_length": 677.2, "completions/min_terminated_length": 677.2, "entropy": 0.13777906013031802, "epoch": 0.21493624772313297, "frac_reward_zero_std": 0.2, "grad_norm": 0.020055213943123817, "kl": 0.00046802494471194224, "learning_rate": 4.1334355828220857e-07, "loss": 0.0007817570120096206, "num_tokens": 3868946.0, "reward": 1.3900001525878907, "reward_std": 0.7840524554252625, "rewards/boxed_rate/mean": 0.9, "rewards/boxed_rate/std": 0.18492921590805053, "rewards/correctness/mean": 0.1666666716337204, "rewards/correctness/std": 0.34822853803634646, "rewards/multi_component_reward/mean": 0.22333333939313887, "rewards/multi_component_reward/std": 0.44443279504776, "rewards/no_answer_rate/mean": 0.10000000298023223, "rewards/no_answer_rate/std": 0.18492921590805053, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.05379525385797024, "sampling/importance_sampling_ratio/mean": 0.019492477364838123, "sampling/importance_sampling_ratio/min": 0.0019421720592784908, "sampling/sampling_logp_difference/max": 0.36746039390563967, "sampling/sampling_logp_difference/mean": 0.009275554586201907, "step": 590, "step_time": 13.933117091469466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1637.0, "completions/max_terminated_length": 1173.6, "completions/mean_length": 936.9000122070313, "completions/mean_terminated_length": 814.3366760253906, "completions/min_length": 551.2, "completions/min_terminated_length": 551.2, "entropy": 0.1666003334025542, "epoch": 0.2167577413479053, "frac_reward_zero_std": 0.2, "grad_norm": 0.011394626460969448, "kl": 0.00048624937092730153, "learning_rate": 4.1238496932515336e-07, "loss": -6.75798102747649e-05, "num_tokens": 3899387.0, "reward": 1.483333444595337, "reward_std": 0.6897397696971893, "rewards/boxed_rate/mean": 0.9, "rewards/boxed_rate/std": 0.18492921590805053, "rewards/correctness/mean": 0.20000000298023224, "rewards/correctness/std": 0.29447373151779177, "rewards/multi_component_reward/mean": 0.2500000074505806, "rewards/multi_component_reward/std": 0.42727351784706114, "rewards/no_answer_rate/mean": 0.10000000298023223, "rewards/no_answer_rate/std": 0.18492921590805053, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.03627656940370798, "sampling/importance_sampling_ratio/mean": 0.014781518559902906, "sampling/importance_sampling_ratio/min": 0.0008682889689286911, "sampling/sampling_logp_difference/max": 0.3049093961715698, "sampling/sampling_logp_difference/mean": 0.010832818690687418, "step": 595, "step_time": 14.285567329451442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1666666716337204, "completions/max_length": 1661.6, "completions/max_terminated_length": 1382.2, "completions/mean_length": 1118.2000244140625, "completions/mean_terminated_length": 938.136669921875, "completions/min_length": 657.6, "completions/min_terminated_length": 657.6, "entropy": 0.17573260565598806, "epoch": 0.2185792349726776, "frac_reward_zero_std": 0.2, "grad_norm": 0.020734934136271477, "kl": 0.0004598392901243642, "learning_rate": 4.1142638036809815e-07, "loss": -0.002467634528875351, "num_tokens": 3936269.0, "reward": 1.0933334231376648, "reward_std": 0.33367283940315245, "rewards/boxed_rate/mean": 0.8333333373069763, "rewards/boxed_rate/std": 0.2882087707519531, "rewards/correctness/mean": 0.03333333432674408, "rewards/correctness/std": 0.08164966106414795, "rewards/multi_component_reward/mean": 0.02666666656732559, "rewards/multi_component_reward/std": 0.2805217713117599, "rewards/no_answer_rate/mean": 0.1666666716337204, "rewards/no_answer_rate/std": 0.2882087707519531, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.044962253980338575, "sampling/importance_sampling_ratio/mean": 0.011017974140122533, "sampling/importance_sampling_ratio/min": 0.0003941454909745216, "sampling/sampling_logp_difference/max": 0.38812339305877686, "sampling/sampling_logp_difference/mean": 0.01122636515647173, "step": 600, "step_time": 14.878706483915447 }, { "epoch": 0.2185792349726776, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.18333333790302275, "eval_completions/max_length": 1633.0, "eval_completions/max_terminated_length": 1261.48, "eval_completions/mean_length": 1074.5033642578126, "eval_completions/mean_terminated_length": 884.9103503417969, "eval_completions/min_length": 608.76, "eval_completions/min_terminated_length": 608.76, "eval_entropy": 0.1780784809589386, "eval_frac_reward_zero_std": 0.1, "eval_kl": 0.000503081139177084, "eval_loss": -0.002404627623036504, "eval_num_tokens": 3936269.0, "eval_reward": 1.5616667437553406, "eval_reward_std": 0.7698906905949116, "eval_rewards/boxed_rate/mean": 0.816666665673256, "eval_rewards/boxed_rate/std": 0.28143630146980286, "eval_rewards/correctness/mean": 0.260000002682209, "eval_rewards/correctness/std": 0.3175121474266052, "eval_rewards/multi_component_reward/mean": 0.26500001683831215, "eval_rewards/multi_component_reward/std": 0.49374800592660906, "eval_rewards/no_answer_rate/mean": 0.18333333790302275, "eval_rewards/no_answer_rate/std": 0.28143630146980286, "eval_rewards/repetition_rate/mean": 0.036666667759418486, "eval_rewards/repetition_rate/std": 0.08381265044212341, "eval_runtime": 648.1074, "eval_samples_per_second": 0.077, "eval_sampling/importance_sampling_ratio/max": 0.08480258989613504, "eval_sampling/importance_sampling_ratio/mean": 0.03125391862238757, "eval_sampling/importance_sampling_ratio/min": 0.0025257051857090485, "eval_sampling/sampling_logp_difference/max": 0.5640374529361725, "eval_sampling/sampling_logp_difference/mean": 0.01284005825407803, "eval_steps_per_second": 0.014, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1519.2, "completions/max_terminated_length": 1256.6, "completions/mean_length": 946.06669921875, "completions/mean_terminated_length": 839.1633544921875, "completions/min_length": 563.2, "completions/min_terminated_length": 563.2, "entropy": 0.14678182030717532, "epoch": 0.2204007285974499, "frac_reward_zero_std": 0.4, "grad_norm": 0.023046690970659256, "kl": 0.0005303417919397664, "learning_rate": 4.1046779141104294e-07, "loss": -0.0009497999213635922, "num_tokens": 3967489.0, "reward": 1.2500001072883606, "reward_std": 0.46636478304862977, "rewards/boxed_rate/mean": 0.9, "rewards/boxed_rate/std": 0.18492921590805053, "rewards/correctness/mean": 0.10000000298023223, "rewards/correctness/std": 0.18492921590805053, "rewards/multi_component_reward/mean": 0.15000001192092896, "rewards/multi_component_reward/std": 0.28614950776100156, "rewards/no_answer_rate/mean": 0.10000000298023223, "rewards/no_answer_rate/std": 0.18492921590805053, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.15675322916358708, "sampling/importance_sampling_ratio/mean": 0.057061844225972894, "sampling/importance_sampling_ratio/min": 0.0010045379561626475, "sampling/sampling_logp_difference/max": 0.3354309320449829, "sampling/sampling_logp_difference/mean": 0.010376049391925334, "step": 605, "step_time": 13.543810763396323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03333333432674408, "completions/max_length": 1038.6, "completions/max_terminated_length": 971.6, "completions/mean_length": 741.6333618164062, "completions/mean_terminated_length": 710.4466918945312, "completions/min_length": 523.2, "completions/min_terminated_length": 523.2, "entropy": 0.17811203648646673, "epoch": 0.2222222222222222, "frac_reward_zero_std": 0.0, "grad_norm": 1.1156023740768433, "kl": 0.0007145934030025577, "learning_rate": 4.095092024539877e-07, "loss": 0.001612265408039093, "num_tokens": 3992006.0, "reward": 1.8733334302902223, "reward_std": 0.8630595266819, "rewards/boxed_rate/mean": 0.9666666626930237, "rewards/boxed_rate/std": 0.08164966106414795, "rewards/correctness/mean": 0.3666666656732559, "rewards/correctness/std": 0.3761233925819397, "rewards/multi_component_reward/mean": 0.4733333714306355, "rewards/multi_component_reward/std": 0.4589902624487877, "rewards/no_answer_rate/mean": 0.03333333432674408, "rewards/no_answer_rate/std": 0.08164966106414795, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.12649268209934234, "sampling/importance_sampling_ratio/mean": 0.06289250953122974, "sampling/importance_sampling_ratio/min": 0.03086609955687436, "sampling/sampling_logp_difference/max": 0.3575249195098877, "sampling/sampling_logp_difference/mean": 0.011591816507279872, "step": 610, "step_time": 10.539130839891731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1739.8, "completions/max_terminated_length": 1545.4, "completions/mean_length": 1192.700048828125, "completions/mean_terminated_length": 1113.4033569335938, "completions/min_length": 698.0, "completions/min_terminated_length": 698.0, "entropy": 0.1430244820813338, "epoch": 0.22404371584699453, "frac_reward_zero_std": 0.2, "grad_norm": 0.008428337052464485, "kl": 0.00040708660574940346, "learning_rate": 4.0855061349693246e-07, "loss": 0.0004372816067188978, "num_tokens": 4030967.0, "reward": 2.020000123977661, "reward_std": 0.8252323269844055, "rewards/boxed_rate/mean": 0.9, "rewards/boxed_rate/std": 0.18492921590805053, "rewards/correctness/mean": 0.4666666746139526, "rewards/correctness/std": 0.36985843181610106, "rewards/multi_component_reward/mean": 0.5533333420753479, "rewards/multi_component_reward/std": 0.4648365914821625, "rewards/no_answer_rate/mean": 0.10000000298023223, "rewards/no_answer_rate/std": 0.18492921590805053, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.06130031887441874, "sampling/importance_sampling_ratio/mean": 0.01900877393782139, "sampling/importance_sampling_ratio/min": 0.00034280085452338406, "sampling/sampling_logp_difference/max": 0.37778170108795167, "sampling/sampling_logp_difference/mean": 0.009492585808038712, "step": 615, "step_time": 15.555601417645812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1618.8, "completions/max_terminated_length": 932.0, "completions/mean_length": 796.36669921875, "completions/mean_terminated_length": 658.6666809082031, "completions/min_length": 447.4, "completions/min_terminated_length": 447.4, "entropy": 0.24623483493924142, "epoch": 0.22586520947176686, "frac_reward_zero_std": 0.4, "grad_norm": 0.031970929354429245, "kl": 0.0006379926751833409, "learning_rate": 4.075920245398773e-07, "loss": -0.0030921820551157, "num_tokens": 4057612.0, "reward": 1.3433334350585937, "reward_std": 0.5004158258438111, "rewards/boxed_rate/mean": 0.899999988079071, "rewards/boxed_rate/std": 0.24494898319244385, "rewards/correctness/mean": 0.13333333730697633, "rewards/correctness/std": 0.20655910968780516, "rewards/multi_component_reward/mean": 0.1766666740179062, "rewards/multi_component_reward/std": 0.33977961242198945, "rewards/no_answer_rate/mean": 0.10000000298023223, "rewards/no_answer_rate/std": 0.24494898319244385, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.06918634744361044, "sampling/importance_sampling_ratio/mean": 0.02087077551987022, "sampling/importance_sampling_ratio/min": 0.0005975837658579486, "sampling/sampling_logp_difference/max": 0.34571409225463867, "sampling/sampling_logp_difference/mean": 0.01637477772310376, "step": 620, "step_time": 13.71192037537694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666865348817, "completions/max_length": 1644.8, "completions/max_terminated_length": 1458.6, "completions/mean_length": 1135.6000244140625, "completions/mean_terminated_length": 976.4267211914063, "completions/min_length": 573.8, "completions/min_terminated_length": 573.8, "entropy": 0.15917381308972836, "epoch": 0.22768670309653916, "frac_reward_zero_std": 0.2, "grad_norm": 0.03386072441935539, "kl": 0.0005424051094450987, "learning_rate": 4.066334355828221e-07, "loss": 0.0008449406363070011, "num_tokens": 4094980.0, "reward": 1.4666667938232423, "reward_std": 0.5768125057220459, "rewards/boxed_rate/mean": 0.8333333253860473, "rewards/boxed_rate/std": 0.2728438377380371, "rewards/correctness/mean": 0.20000000596046447, "rewards/correctness/std": 0.20655910968780516, "rewards/multi_component_reward/mean": 0.20000001415610313, "rewards/multi_component_reward/std": 0.3994357705116272, "rewards/no_answer_rate/mean": 0.16666666865348817, "rewards/no_answer_rate/std": 0.2728438377380371, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.1632993221282959, "sampling/importance_sampling_ratio/max": 0.054127467796206474, "sampling/importance_sampling_ratio/mean": 0.016182091273367404, "sampling/importance_sampling_ratio/min": 0.002016719186418556, "sampling/sampling_logp_difference/max": 0.3768587350845337, "sampling/sampling_logp_difference/mean": 0.009851652942597867, "step": 625, "step_time": 14.74142508301884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03333333432674408, "completions/max_length": 1218.6, "completions/max_terminated_length": 1002.4, "completions/mean_length": 759.3666870117188, "completions/mean_terminated_length": 716.2800170898438, "completions/min_length": 491.2, "completions/min_terminated_length": 491.2, "entropy": 0.19249504134058953, "epoch": 0.22950819672131148, "frac_reward_zero_std": 0.2, "grad_norm": 0.040883224457502365, "kl": 0.0008160913091463347, "learning_rate": 4.0567484662576683e-07, "loss": -0.02089705318212509, "num_tokens": 4121865.0, "reward": 1.6600001096725463, "reward_std": 0.8352818369865418, "rewards/boxed_rate/mean": 1.0, "rewards/boxed_rate/std": 0.0, "rewards/correctness/mean": 0.2666666716337204, "rewards/correctness/std": 0.3977532863616943, "rewards/multi_component_reward/mean": 0.3933333531022072, "rewards/multi_component_reward/std": 0.4375286281108856, "rewards/no_answer_rate/mean": 0.0, "rewards/no_answer_rate/std": 0.0, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.12170692235231399, "sampling/importance_sampling_ratio/mean": 0.031014200672507285, "sampling/importance_sampling_ratio/min": 0.001588601156254299, "sampling/sampling_logp_difference/max": 0.3645863771438599, "sampling/sampling_logp_difference/mean": 0.01216810904443264, "step": 630, "step_time": 11.676486384868621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1104.8, "completions/max_terminated_length": 1104.8, "completions/mean_length": 809.733349609375, "completions/mean_terminated_length": 809.733349609375, "completions/min_length": 576.0, "completions/min_terminated_length": 576.0, "entropy": 0.17598983322580655, "epoch": 0.23132969034608378, "frac_reward_zero_std": 0.4, "grad_norm": 0.016296500340104103, "kl": 0.0005092226871056482, "learning_rate": 4.0471625766871167e-07, "loss": -0.004767249897122383, "num_tokens": 4148941.0, "reward": 1.520000123977661, "reward_std": 0.6183947920799255, "rewards/boxed_rate/mean": 1.0, "rewards/boxed_rate/std": 0.0, "rewards/correctness/mean": 0.20000000298023224, "rewards/correctness/std": 0.29447373151779177, "rewards/multi_component_reward/mean": 0.3200000077486038, "rewards/multi_component_reward/std": 0.3239211142063141, "rewards/no_answer_rate/mean": 0.0, "rewards/no_answer_rate/std": 0.0, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.17697620438411832, "sampling/importance_sampling_ratio/mean": 0.06787280989810825, "sampling/importance_sampling_ratio/min": 0.03228051500591391, "sampling/sampling_logp_difference/max": 0.34911930561065674, "sampling/sampling_logp_difference/mean": 0.011234280280768871, "step": 635, "step_time": 10.972807624004782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03333333432674408, "completions/max_length": 1261.6, "completions/max_terminated_length": 1218.8, "completions/mean_length": 815.0000244140625, "completions/mean_terminated_length": 784.780029296875, "completions/min_length": 532.2, "completions/min_terminated_length": 532.2, "entropy": 0.14789476320147515, "epoch": 0.2331511839708561, "frac_reward_zero_std": 0.2, "grad_norm": 0.08125295490026474, "kl": 0.0006361469063752641, "learning_rate": 4.037576687116564e-07, "loss": -0.008612464368343353, "num_tokens": 4176637.0, "reward": 1.7100001096725463, "reward_std": 0.8422193884849548, "rewards/boxed_rate/mean": 0.9666666626930237, "rewards/boxed_rate/std": 0.08164966106414795, "rewards/correctness/mean": 0.3000000089406967, "rewards/correctness/std": 0.3914883255958557, "rewards/multi_component_reward/mean": 0.41000000983476637, "rewards/multi_component_reward/std": 0.4542399704456329, "rewards/no_answer_rate/mean": 0.03333333432674408, "rewards/no_answer_rate/std": 0.08164966106414795, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.14553753491491078, "sampling/importance_sampling_ratio/mean": 0.0640607855282724, "sampling/importance_sampling_ratio/min": 0.0038686898650384326, "sampling/sampling_logp_difference/max": 0.3177733182907104, "sampling/sampling_logp_difference/mean": 0.00996503084897995, "step": 640, "step_time": 11.762126618809997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1666666716337204, "completions/max_length": 1707.4, "completions/max_terminated_length": 1151.8, "completions/mean_length": 1026.000048828125, "completions/mean_terminated_length": 833.5666870117187, "completions/min_length": 601.6, "completions/min_terminated_length": 601.6, "entropy": 0.15211746419469516, "epoch": 0.23497267759562843, "frac_reward_zero_std": 0.0, "grad_norm": 0.04184797778725624, "kl": 0.0004896697394239406, "learning_rate": 4.027990797546012e-07, "loss": -0.0011240935884416104, "num_tokens": 4210123.0, "reward": 1.7233334302902221, "reward_std": 0.8806208968162537, "rewards/boxed_rate/mean": 0.8333333373069763, "rewards/boxed_rate/std": 0.2882087707519531, "rewards/correctness/mean": 0.33333333432674406, "rewards/correctness/std": 0.36985843181610106, "rewards/multi_component_reward/mean": 0.35666669607162477, "rewards/multi_component_reward/std": 0.5503794223070144, "rewards/no_answer_rate/mean": 0.1666666716337204, "rewards/no_answer_rate/std": 0.2882087707519531, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.10542454989627004, "sampling/importance_sampling_ratio/mean": 0.03725480753928423, "sampling/importance_sampling_ratio/min": 0.0006887080645215871, "sampling/sampling_logp_difference/max": 0.3569879293441772, "sampling/sampling_logp_difference/mean": 0.011003307159990072, "step": 645, "step_time": 14.911480633541942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06666666865348816, "completions/max_length": 1574.6, "completions/max_terminated_length": 1237.0, "completions/mean_length": 920.3000244140625, "completions/mean_terminated_length": 845.8533569335938, "completions/min_length": 600.0, "completions/min_terminated_length": 600.0, "entropy": 0.2023964905490478, "epoch": 0.23679417122040072, "frac_reward_zero_std": 0.4, "grad_norm": 0.023950839415192604, "kl": 0.0005954526874120347, "learning_rate": 4.01840490797546e-07, "loss": 0.0016510456800460816, "num_tokens": 4241086.0, "reward": 1.4766667366027832, "reward_std": 0.5016432344913483, "rewards/boxed_rate/mean": 0.9666666626930237, "rewards/boxed_rate/std": 0.08164966106414795, "rewards/correctness/mean": 0.16666666865348817, "rewards/correctness/std": 0.2128240704536438, "rewards/multi_component_reward/mean": 0.24333334639668464, "rewards/multi_component_reward/std": 0.29857338666915895, "rewards/no_answer_rate/mean": 0.03333333432674408, "rewards/no_answer_rate/std": 0.08164966106414795, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.1632993221282959, "sampling/importance_sampling_ratio/max": 0.02175317257642746, "sampling/importance_sampling_ratio/mean": 0.007160785421729088, "sampling/importance_sampling_ratio/min": 0.00033338346147502305, "sampling/sampling_logp_difference/max": 0.387990665435791, "sampling/sampling_logp_difference/mean": 0.012314715608954429, "step": 650, "step_time": 13.970132818445563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03333333432674408, "completions/max_length": 1193.4, "completions/max_terminated_length": 1015.4, "completions/mean_length": 716.5000244140625, "completions/mean_terminated_length": 674.5733520507813, "completions/min_length": 464.6, "completions/min_terminated_length": 464.6, "entropy": 0.16268871029218038, "epoch": 0.23861566484517305, "frac_reward_zero_std": 0.2, "grad_norm": 0.06329948455095291, "kl": 0.0005586101295193657, "learning_rate": 4.0088190184049077e-07, "loss": -0.0020235760137438776, "num_tokens": 4265005.0, "reward": 1.5000001192092896, "reward_std": 0.7927483677864074, "rewards/boxed_rate/mean": 0.9666666626930237, "rewards/boxed_rate/std": 0.08164966106414795, "rewards/correctness/mean": 0.20000000596046447, "rewards/correctness/std": 0.36985843181610106, "rewards/multi_component_reward/mean": 0.30000000447034836, "rewards/multi_component_reward/std": 0.4276039183139801, "rewards/no_answer_rate/mean": 0.03333333432674408, "rewards/no_answer_rate/std": 0.08164966106414795, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.08887637332081795, "sampling/importance_sampling_ratio/mean": 0.04327146988362074, "sampling/importance_sampling_ratio/min": 0.01085926350569757, "sampling/sampling_logp_difference/max": 0.34261035919189453, "sampling/sampling_logp_difference/mean": 0.010840693116188049, "step": 655, "step_time": 11.254522551037372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1448.4, "completions/max_terminated_length": 1037.6, "completions/mean_length": 916.8000366210938, "completions/mean_terminated_length": 796.9666870117187, "completions/min_length": 576.2, "completions/min_terminated_length": 576.2, "entropy": 0.1381769967575868, "epoch": 0.24043715846994534, "frac_reward_zero_std": 0.2, "grad_norm": 0.018603617325425148, "kl": 0.0004810667795633587, "learning_rate": 3.9992331288343556e-07, "loss": -0.00833008587360382, "num_tokens": 4294873.0, "reward": 1.530000126361847, "reward_std": 0.5924062907695771, "rewards/boxed_rate/mean": 0.9, "rewards/boxed_rate/std": 0.18492921590805053, "rewards/correctness/mean": 0.23333333134651185, "rewards/correctness/std": 0.24494898319244385, "rewards/multi_component_reward/mean": 0.29666669070720675, "rewards/multi_component_reward/std": 0.35217125415802003, "rewards/no_answer_rate/mean": 0.10000000298023223, "rewards/no_answer_rate/std": 0.18492921590805053, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.19538256973028184, "sampling/importance_sampling_ratio/mean": 0.0801917064934969, "sampling/importance_sampling_ratio/min": 0.002801872471235356, "sampling/sampling_logp_difference/max": 0.36407305002212526, "sampling/sampling_logp_difference/mean": 0.009397336840629577, "step": 660, "step_time": 13.188869021646678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 915.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 611.6000183105468, "completions/mean_terminated_length": 611.6000183105468, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "entropy": 0.21465270320574442, "epoch": 0.24225865209471767, "frac_reward_zero_std": 0.2, "grad_norm": 0.021906884387135506, "kl": 0.0005874778813449666, "learning_rate": 3.9896472392638035e-07, "loss": 0.0034822095185518265, "num_tokens": 4315237.0, "reward": 1.9400000810623168, "reward_std": 0.8352818369865418, "rewards/boxed_rate/mean": 1.0, "rewards/boxed_rate/std": 0.0, "rewards/correctness/mean": 0.4000000089406967, "rewards/correctness/std": 0.3977532863616943, "rewards/multi_component_reward/mean": 0.5399999991059303, "rewards/multi_component_reward/std": 0.4375286281108856, "rewards/no_answer_rate/mean": 0.0, "rewards/no_answer_rate/std": 0.0, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.08231882937252522, "sampling/importance_sampling_ratio/mean": 0.03466434141155332, "sampling/importance_sampling_ratio/min": 0.0024839662786689588, "sampling/sampling_logp_difference/max": 0.35751450061798096, "sampling/sampling_logp_difference/mean": 0.013147631753236055, "step": 665, "step_time": 9.326043193787337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1493.4, "completions/max_terminated_length": 1139.0, "completions/mean_length": 914.633349609375, "completions/mean_terminated_length": 779.0366821289062, "completions/min_length": 560.8, "completions/min_terminated_length": 560.8, "entropy": 0.15737735082705814, "epoch": 0.24408014571949, "frac_reward_zero_std": 0.2, "grad_norm": 0.01330585964024067, "kl": 0.0005065599815376724, "learning_rate": 3.9800613496932514e-07, "loss": -0.006443501263856888, "num_tokens": 4345802.0, "reward": 1.32000013589859, "reward_std": 0.6418773233890533, "rewards/boxed_rate/mean": 0.9, "rewards/boxed_rate/std": 0.18492921590805053, "rewards/correctness/mean": 0.13333333730697633, "rewards/correctness/std": 0.2665788769721985, "rewards/multi_component_reward/mean": 0.18666667491197586, "rewards/multi_component_reward/std": 0.3788073122501373, "rewards/no_answer_rate/mean": 0.10000000298023223, "rewards/no_answer_rate/std": 0.18492921590805053, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.10360531127080322, "sampling/importance_sampling_ratio/mean": 0.05636316137388349, "sampling/importance_sampling_ratio/min": 0.022174429874797168, "sampling/sampling_logp_difference/max": 0.37078907489776614, "sampling/sampling_logp_difference/mean": 0.010791171807795763, "step": 670, "step_time": 13.329693194106222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06666666865348816, "completions/max_length": 1439.8, "completions/max_terminated_length": 1202.4, "completions/mean_length": 803.2666870117188, "completions/mean_terminated_length": 723.4000122070313, "completions/min_length": 409.0, "completions/min_terminated_length": 409.0, "entropy": 0.17263933196663855, "epoch": 0.2459016393442623, "frac_reward_zero_std": 0.2, "grad_norm": 0.1460752934217453, "kl": 0.0005690245316751923, "learning_rate": 3.970475460122699e-07, "loss": -0.012128566205501557, "num_tokens": 4373710.0, "reward": 1.6166667461395263, "reward_std": 0.8088707447052002, "rewards/boxed_rate/mean": 0.9666666626930237, "rewards/boxed_rate/std": 0.08164966106414795, "rewards/correctness/mean": 0.23333334028720856, "rewards/correctness/std": 0.3914883255958557, "rewards/multi_component_reward/mean": 0.31666667610406873, "rewards/multi_component_reward/std": 0.4805600345134735, "rewards/no_answer_rate/mean": 0.03333333432674408, "rewards/no_answer_rate/std": 0.08164966106414795, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.1632993221282959, "sampling/importance_sampling_ratio/max": 0.10961102806031704, "sampling/importance_sampling_ratio/mean": 0.043116622418165204, "sampling/importance_sampling_ratio/min": 0.001660405399086784, "sampling/sampling_logp_difference/max": 0.3272708892822266, "sampling/sampling_logp_difference/mean": 0.011050505377352238, "step": 675, "step_time": 12.905224220454693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23333333432674408, "completions/max_length": 1583.4, "completions/max_terminated_length": 1101.4, "completions/mean_length": 1081.7000366210937, "completions/mean_terminated_length": 832.26669921875, "completions/min_length": 648.0, "completions/min_terminated_length": 648.0, "entropy": 0.17594740043083826, "epoch": 0.24772313296903462, "frac_reward_zero_std": 0.0, "grad_norm": 0.07136747241020203, "kl": 0.0005246107791511653, "learning_rate": 3.960889570552147e-07, "loss": 0.003443557769060135, "num_tokens": 4409875.0, "reward": 1.4266667485237121, "reward_std": 0.781731303036213, "rewards/boxed_rate/mean": 0.7666666626930236, "rewards/boxed_rate/std": 0.30073869228363037, "rewards/correctness/mean": 0.20000000298023224, "rewards/correctness/std": 0.29447373151779177, "rewards/multi_component_reward/mean": 0.16000001206994058, "rewards/multi_component_reward/std": 0.509483140707016, "rewards/no_answer_rate/mean": 0.23333333432674408, "rewards/no_answer_rate/std": 0.30073869228363037, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.1632993221282959, "sampling/importance_sampling_ratio/max": 0.07207991853356362, "sampling/importance_sampling_ratio/mean": 0.018585072737187146, "sampling/importance_sampling_ratio/min": 0.00033181315377639464, "sampling/sampling_logp_difference/max": 0.35654211044311523, "sampling/sampling_logp_difference/mean": 0.011251153517514467, "step": 680, "step_time": 14.236122718639672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23333334028720856, "completions/max_length": 1406.2, "completions/max_terminated_length": 905.2, "completions/mean_length": 937.7333770751953, "completions/mean_terminated_length": 666.7800079345703, "completions/min_length": 499.4, "completions/min_terminated_length": 499.4, "entropy": 0.15866130826373895, "epoch": 0.2495446265938069, "frac_reward_zero_std": 0.2, "grad_norm": 0.019822267815470695, "kl": 0.0025144809878838714, "learning_rate": 3.9513036809815945e-07, "loss": 0.0014299599453806878, "num_tokens": 4441367.0, "reward": 1.5433334589004517, "reward_std": 0.8197273433208465, "rewards/boxed_rate/mean": 0.7666666686534882, "rewards/boxed_rate/std": 0.2882087707519531, "rewards/correctness/mean": 0.26666666865348815, "rewards/correctness/std": 0.32236858606338503, "rewards/multi_component_reward/mean": 0.2433333471417427, "rewards/multi_component_reward/std": 0.5061476826667786, "rewards/no_answer_rate/mean": 0.23333334028720856, "rewards/no_answer_rate/std": 0.2882087707519531, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.19941046759486197, "sampling/importance_sampling_ratio/mean": 0.10512657705694436, "sampling/importance_sampling_ratio/min": 0.052634853626081814, "sampling/sampling_logp_difference/max": 0.39468674659729003, "sampling/sampling_logp_difference/mean": 0.010894744656980038, "step": 685, "step_time": 13.195306334272027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06666666865348816, "completions/max_length": 1319.4, "completions/max_terminated_length": 1112.6, "completions/mean_length": 827.1333801269532, "completions/mean_terminated_length": 760.5666931152343, "completions/min_length": 536.4, "completions/min_terminated_length": 536.4, "entropy": 0.18860391999284426, "epoch": 0.25136612021857924, "frac_reward_zero_std": 0.2, "grad_norm": 0.028540268540382385, "kl": 0.0006184172428523501, "learning_rate": 3.941717791411043e-07, "loss": -0.008557839691638947, "num_tokens": 4468635.0, "reward": 2.013333463668823, "reward_std": 0.8281045794487, "rewards/boxed_rate/mean": 0.9333333253860474, "rewards/boxed_rate/std": 0.1632993221282959, "rewards/correctness/mean": 0.4333333343267441, "rewards/correctness/std": 0.3761233925819397, "rewards/multi_component_reward/mean": 0.513333360850811, "rewards/multi_component_reward/std": 0.4702867269515991, "rewards/no_answer_rate/mean": 0.06666666865348816, "rewards/no_answer_rate/std": 0.1632993221282959, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.10327955484390258, "sampling/importance_sampling_ratio/max": 0.06727873869240283, "sampling/importance_sampling_ratio/mean": 0.03433487480506301, "sampling/importance_sampling_ratio/min": 0.00048347263269847286, "sampling/sampling_logp_difference/max": 0.42853312492370604, "sampling/sampling_logp_difference/mean": 0.012394568137824535, "step": 690, "step_time": 12.268561779148877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1604.2, "completions/max_terminated_length": 1084.2, "completions/mean_length": 907.400048828125, "completions/mean_terminated_length": 778.053369140625, "completions/min_length": 563.2, "completions/min_terminated_length": 563.2, "entropy": 0.14893935720125834, "epoch": 0.25318761384335153, "frac_reward_zero_std": 0.2, "grad_norm": 0.018119433894753456, "kl": 0.00045477717115621394, "learning_rate": 3.932131901840491e-07, "loss": -0.002763097546994686, "num_tokens": 4499319.0, "reward": 1.3900001049041748, "reward_std": 0.6540532588958741, "rewards/boxed_rate/mean": 0.899999988079071, "rewards/boxed_rate/std": 0.24494898319244385, "rewards/correctness/mean": 0.16666666865348817, "rewards/correctness/std": 0.2728438377380371, "rewards/multi_component_reward/mean": 0.2233333483338356, "rewards/multi_component_reward/std": 0.3906372755765915, "rewards/no_answer_rate/mean": 0.10000000298023223, "rewards/no_answer_rate/std": 0.24494898319244385, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.09045649766921997, "sampling/importance_sampling_ratio/mean": 0.034694204293191436, "sampling/importance_sampling_ratio/min": 0.0024610218745878347, "sampling/sampling_logp_difference/max": 0.3914134740829468, "sampling/sampling_logp_difference/mean": 0.010633220244199038, "step": 695, "step_time": 13.952417024224996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1497.8, "completions/max_terminated_length": 771.4, "completions/mean_length": 722.333349609375, "completions/mean_terminated_length": 584.6333435058593, "completions/min_length": 428.8, "completions/min_terminated_length": 428.8, "entropy": 0.20499998504916828, "epoch": 0.2550091074681239, "frac_reward_zero_std": 0.0, "grad_norm": 0.06937753409147263, "kl": 0.0006178093899507076, "learning_rate": 3.922546012269938e-07, "loss": -0.01590590924024582, "num_tokens": 4523047.0, "reward": 1.4600000619888305, "reward_std": 0.7687346935272217, "rewards/boxed_rate/mean": 0.899999988079071, "rewards/boxed_rate/std": 0.24494898319244385, "rewards/correctness/mean": 0.20000000596046447, "rewards/correctness/std": 0.3098386645317078, "rewards/multi_component_reward/mean": 0.2600000202655792, "rewards/multi_component_reward/std": 0.4624049305915833, "rewards/no_answer_rate/mean": 0.10000000298023223, "rewards/no_answer_rate/std": 0.24494898319244385, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.16114517226815223, "sampling/importance_sampling_ratio/mean": 0.06391621977090836, "sampling/importance_sampling_ratio/min": 0.010175443676988779, "sampling/sampling_logp_difference/max": 0.3058194637298584, "sampling/sampling_logp_difference/mean": 0.013935025222599507, "step": 700, "step_time": 13.013086147420108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666865348817, "completions/max_length": 1429.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 947.1667053222657, "completions/mean_terminated_length": 811.6733581542969, "completions/min_length": 627.2, "completions/min_terminated_length": 627.2, "entropy": 0.17254207134246827, "epoch": 0.2568306010928962, "frac_reward_zero_std": 0.0, "grad_norm": 0.012989591807126999, "kl": 0.0013695822664885782, "learning_rate": 3.9129601226993866e-07, "loss": 0.02462577521800995, "num_tokens": 4554762.0, "reward": 2.000000071525574, "reward_std": 1.0529863953590393, "rewards/boxed_rate/mean": 0.8333333253860473, "rewards/boxed_rate/std": 0.2728438377380371, "rewards/correctness/mean": 0.4666666626930237, "rewards/correctness/std": 0.4298781991004944, "rewards/multi_component_reward/mean": 0.5000000357627868, "rewards/multi_component_reward/std": 0.5574271440505981, "rewards/no_answer_rate/mean": 0.16666666865348817, "rewards/no_answer_rate/std": 0.2728438377380371, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.1705483404919505, "sampling/importance_sampling_ratio/mean": 0.062803098349832, "sampling/importance_sampling_ratio/min": 0.011600531370969901, "sampling/sampling_logp_difference/max": 0.36405808925628663, "sampling/sampling_logp_difference/mean": 0.01186208426952362, "step": 705, "step_time": 13.669213125482202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1666666716337204, "completions/max_length": 1826.4, "completions/max_terminated_length": 1155.4, "completions/mean_length": 1059.6667114257812, "completions/mean_terminated_length": 841.7533569335938, "completions/min_length": 515.2, "completions/min_terminated_length": 515.2, "entropy": 0.23051666493217152, "epoch": 0.2586520947176685, "frac_reward_zero_std": 0.0, "grad_norm": 0.04497559741139412, "kl": 0.000664207028845946, "learning_rate": 3.9033742331288344e-07, "loss": -0.00020618487615138293, "num_tokens": 4589378.0, "reward": 1.8600000977516173, "reward_std": 0.9444810330867768, "rewards/boxed_rate/mean": 0.8333333373069763, "rewards/boxed_rate/std": 0.2882087707519531, "rewards/correctness/mean": 0.40000000298023225, "rewards/correctness/std": 0.36985843181610106, "rewards/multi_component_reward/mean": 0.4266666829586029, "rewards/multi_component_reward/std": 0.552643096446991, "rewards/no_answer_rate/mean": 0.1666666716337204, "rewards/no_answer_rate/std": 0.2882087707519531, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.04061034563928843, "sampling/importance_sampling_ratio/mean": 0.008759467327035964, "sampling/importance_sampling_ratio/min": 0.0002094253270666409, "sampling/sampling_logp_difference/max": 0.3215909957885742, "sampling/sampling_logp_difference/mean": 0.014831000193953513, "step": 710, "step_time": 15.66096483822912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06666666865348816, "completions/max_length": 1843.8, "completions/max_terminated_length": 1615.2, "completions/mean_length": 1085.3667114257812, "completions/mean_terminated_length": 1008.9400390625, "completions/min_length": 637.2, "completions/min_terminated_length": 637.2, "entropy": 0.19345977554718655, "epoch": 0.2604735883424408, "frac_reward_zero_std": 0.2, "grad_norm": 0.04418517276644707, "kl": 0.0008112916897516697, "learning_rate": 3.893788343558282e-07, "loss": -0.0014110441319644452, "num_tokens": 4626127.0, "reward": 1.5000000953674317, "reward_std": 0.7173020958900451, "rewards/boxed_rate/mean": 0.9333333253860474, "rewards/boxed_rate/std": 0.1632993221282959, "rewards/correctness/mean": 0.20000000298023224, "rewards/correctness/std": 0.29447373151779177, "rewards/multi_component_reward/mean": 0.266666679084301, "rewards/multi_component_reward/std": 0.3763957768678665, "rewards/no_answer_rate/mean": 0.06666666865348816, "rewards/no_answer_rate/std": 0.1632993221282959, "rewards/repetition_rate/mean": 0.03333333432674408, "rewards/repetition_rate/std": 0.08164966106414795, "sampling/importance_sampling_ratio/max": 0.04309340715408325, "sampling/importance_sampling_ratio/mean": 0.012624496826902032, "sampling/importance_sampling_ratio/min": 0.00014569541402083248, "sampling/sampling_logp_difference/max": 0.3676969051361084, "sampling/sampling_logp_difference/mean": 0.01275388365611434, "step": 715, "step_time": 15.848505285196007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1666666716337204, "completions/max_length": 1794.8, "completions/max_terminated_length": 1234.6, "completions/mean_length": 973.4000183105469, "completions/mean_terminated_length": 763.0933410644532, "completions/min_length": 480.8, "completions/min_terminated_length": 480.8, "entropy": 0.19203649039069812, "epoch": 0.26229508196721313, "frac_reward_zero_std": 0.2, "grad_norm": 0.01788559928536415, "kl": 0.0007585312666681906, "learning_rate": 3.8842024539877297e-07, "loss": -0.0016064226627349853, "num_tokens": 4659067.0, "reward": 1.3966667771339416, "reward_std": 0.3670642614364624, "rewards/boxed_rate/mean": 0.8333333373069763, "rewards/boxed_rate/std": 0.2882087707519531, "rewards/correctness/mean": 0.1666666626930237, "rewards/correctness/std": 0.08164966106414795, "rewards/multi_component_reward/mean": 0.16333335414528846, "rewards/multi_component_reward/std": 0.28173157572746277, "rewards/no_answer_rate/mean": 0.1666666716337204, "rewards/no_answer_rate/std": 0.2882087707519531, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.1632993221282959, "sampling/importance_sampling_ratio/max": 0.2071637876331806, "sampling/importance_sampling_ratio/mean": 0.051915486436337234, "sampling/importance_sampling_ratio/min": 3.873565276598517e-06, "sampling/sampling_logp_difference/max": 0.34898459911346436, "sampling/sampling_logp_difference/mean": 0.013108029030263424, "step": 720, "step_time": 15.256685120798647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20000000596046447, "completions/max_length": 1793.6, "completions/max_terminated_length": 1117.6, "completions/mean_length": 1055.9667114257813, "completions/mean_terminated_length": 758.4666870117187, "completions/min_length": 497.0, "completions/min_terminated_length": 497.0, "entropy": 0.20671888664364815, "epoch": 0.2641165755919854, "frac_reward_zero_std": 0.4, "grad_norm": 0.023016415536403656, "kl": 0.0006531716018798762, "learning_rate": 3.874616564417178e-07, "loss": 0.0004923004657030105, "num_tokens": 4692996.0, "reward": 1.6100001096725465, "reward_std": 0.5153546214103699, "rewards/boxed_rate/mean": 0.7999999940395355, "rewards/boxed_rate/std": 0.2665788769721985, "rewards/correctness/mean": 0.3000000029802322, "rewards/correctness/std": 0.18492921590805053, "rewards/multi_component_reward/mean": 0.310000017285347, "rewards/multi_component_reward/std": 0.3351393073797226, "rewards/no_answer_rate/mean": 0.20000000596046447, "rewards/no_answer_rate/std": 0.2665788769721985, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.050453129876405, "sampling/importance_sampling_ratio/mean": 0.014345021429471671, "sampling/importance_sampling_ratio/min": 0.0007461899496814892, "sampling/sampling_logp_difference/max": 0.4133040189743042, "sampling/sampling_logp_difference/mean": 0.012775494437664747, "step": 725, "step_time": 15.257122083753348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20000000298023224, "completions/max_length": 1528.6, "completions/max_terminated_length": 1156.6, "completions/mean_length": 1054.86669921875, "completions/mean_terminated_length": 859.533349609375, "completions/min_length": 625.0, "completions/min_terminated_length": 625.0, "entropy": 0.21140047113100688, "epoch": 0.2659380692167577, "frac_reward_zero_std": 0.0, "grad_norm": 0.018997710198163986, "kl": 0.0006346999696688726, "learning_rate": 3.8650306748466255e-07, "loss": 0.0016422737389802933, "num_tokens": 4727378.0, "reward": 1.5333333969116212, "reward_std": 0.7559072077274323, "rewards/boxed_rate/mean": 0.8333333253860473, "rewards/boxed_rate/std": 0.2728438377380371, "rewards/correctness/mean": 0.23333333432674408, "rewards/correctness/std": 0.30073869228363037, "rewards/multi_component_reward/mean": 0.23333334922790527, "rewards/multi_component_reward/std": 0.4953298598527908, "rewards/no_answer_rate/mean": 0.16666666865348817, "rewards/no_answer_rate/std": 0.2728438377380371, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.10327955484390258, "sampling/importance_sampling_ratio/max": 0.08731604730710388, "sampling/importance_sampling_ratio/mean": 0.020629775966517626, "sampling/importance_sampling_ratio/min": 0.0005315004007400171, "sampling/sampling_logp_difference/max": 0.3350215911865234, "sampling/sampling_logp_difference/mean": 0.013690494932234287, "step": 730, "step_time": 13.985034779645503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13333333730697633, "completions/max_length": 1815.2, "completions/max_terminated_length": 1340.4, "completions/mean_length": 1031.46669921875, "completions/mean_terminated_length": 883.0133483886718, "completions/min_length": 518.6, "completions/min_terminated_length": 518.6, "entropy": 0.18139945759127538, "epoch": 0.2677595628415301, "frac_reward_zero_std": 0.0, "grad_norm": 0.021211547777056694, "kl": 0.000629005294952852, "learning_rate": 3.8554447852760733e-07, "loss": 0.0014220052398741245, "num_tokens": 4761916.0, "reward": 1.4866667628288268, "reward_std": 0.8185165703296662, "rewards/boxed_rate/mean": 0.8666666626930237, "rewards/boxed_rate/std": 0.2665788769721985, "rewards/correctness/mean": 0.20000000298023224, "rewards/correctness/std": 0.35449349880218506, "rewards/multi_component_reward/mean": 0.2200000137090683, "rewards/multi_component_reward/std": 0.5127273321151733, "rewards/no_answer_rate/mean": 0.13333333730697633, "rewards/no_answer_rate/std": 0.2665788769721985, "rewards/repetition_rate/mean": 0.06666666865348816, "rewards/repetition_rate/std": 0.10327955484390258, "sampling/importance_sampling_ratio/max": 0.06554706878960133, "sampling/importance_sampling_ratio/mean": 0.02508344785310328, "sampling/importance_sampling_ratio/min": 0.0035423538342001966, "sampling/sampling_logp_difference/max": 0.31146485805511476, "sampling/sampling_logp_difference/mean": 0.0113022543489933, "step": 735, "step_time": 15.48011973593384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 883.8, "completions/max_terminated_length": 883.8, "completions/mean_length": 542.2000122070312, "completions/mean_terminated_length": 542.2000122070312, "completions/min_length": 352.4, "completions/min_terminated_length": 352.4, "entropy": 0.19473386829098066, "epoch": 0.26958105646630237, "frac_reward_zero_std": 0.4, "grad_norm": 2.118746042251587, "kl": 0.0021321156071887042, "learning_rate": 3.845858895705521e-07, "loss": -0.0064903564751148226, "num_tokens": 4781002.0, "reward": 1.7300001382827759, "reward_std": 0.5729720234870911, "rewards/boxed_rate/mean": 1.0, "rewards/boxed_rate/std": 0.0, "rewards/correctness/mean": 0.29999999701976776, "rewards/correctness/std": 0.2728438377380371, "rewards/multi_component_reward/mean": 0.430000028014183, "rewards/multi_component_reward/std": 0.3001282274723053, "rewards/no_answer_rate/mean": 0.0, "rewards/no_answer_rate/std": 0.0, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.28090705554932355, "sampling/importance_sampling_ratio/mean": 0.15755705069750547, "sampling/importance_sampling_ratio/min": 0.0855626948821282, "sampling/sampling_logp_difference/max": 0.3150986671447754, "sampling/sampling_logp_difference/mean": 0.01262940764427185, "step": 740, "step_time": 9.168580058775841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10000000298023223, "completions/max_length": 1972.8, "completions/max_terminated_length": 1486.4, "completions/mean_length": 1172.000048828125, "completions/mean_terminated_length": 1050.6600280761718, "completions/min_length": 720.8, "completions/min_terminated_length": 720.8, "entropy": 0.11767480584482352, "epoch": 0.27140255009107467, "frac_reward_zero_std": 0.0, "grad_norm": 0.08145014941692352, "kl": 0.0006028921416145749, "learning_rate": 3.836273006134969e-07, "loss": 0.0025216279551386832, "num_tokens": 4818496.0, "reward": 1.6700001001358031, "reward_std": 0.8164093613624572, "rewards/boxed_rate/mean": 0.899999988079071, "rewards/boxed_rate/std": 0.24494898319244385, "rewards/correctness/mean": 0.3, "rewards/correctness/std": 0.34822853803634646, "rewards/multi_component_reward/mean": 0.37000002562999723, "rewards/multi_component_reward/std": 0.47640362083911897, "rewards/no_answer_rate/mean": 0.10000000298023223, "rewards/no_answer_rate/std": 0.24494898319244385, "rewards/repetition_rate/mean": 0.0, "rewards/repetition_rate/std": 0.0, "sampling/importance_sampling_ratio/max": 0.14499615281820297, "sampling/importance_sampling_ratio/mean": 0.04840177483856678, "sampling/importance_sampling_ratio/min": 0.0013758533679049771, "sampling/sampling_logp_difference/max": 0.32971014976501467, "sampling/sampling_logp_difference/mean": 0.008220831863582134, "step": 745, "step_time": 16.791488209925593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.36666667759418486, "completions/max_length": 1719.0, "completions/max_terminated_length": 1232.4, "completions/mean_length": 1217.600048828125, "completions/mean_terminated_length": 822.3233520507813, "completions/min_length": 564.6, "completions/min_terminated_length": 564.6, "entropy": 0.19243225703636804, "epoch": 0.273224043715847, "frac_reward_zero_std": 0.0, "grad_norm": 0.03210171312093735, "kl": 0.0007616788071269791, "learning_rate": 3.826687116564417e-07, "loss": -0.005124400556087494, "num_tokens": 4864732.0, "reward": 1.7300000667572022, "reward_std": 0.9617222905158996, "rewards/boxed_rate/mean": 0.7666666626930236, "rewards/boxed_rate/std": 0.3761233925819397, "rewards/correctness/mean": 0.33333334028720857, "rewards/correctness/std": 0.3977532863616943, "rewards/multi_component_reward/mean": 0.29666668772697447, "rewards/multi_component_reward/std": 0.6426073163747787, "rewards/no_answer_rate/mean": 0.2333333373069763, "rewards/no_answer_rate/std": 0.3761233925819397, "rewards/repetition_rate/mean": 0.1, "rewards/repetition_rate/std": 0.10954451560974121, "sampling/importance_sampling_ratio/max": 0.11619223132729531, "sampling/importance_sampling_ratio/mean": 0.03387247350765392, "sampling/importance_sampling_ratio/min": 0.0005780367053725346, "sampling/sampling_logp_difference/max": 0.3284996747970581, "sampling/sampling_logp_difference/mean": 0.012111451011151075, "step": 750, "step_time": 15.568404103443026 }, { "epoch": 0.273224043715847, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.17000000417232514, "eval_completions/max_length": 1756.9, "eval_completions/max_terminated_length": 1208.56, "eval_completions/mean_length": 1074.366694946289, "eval_completions/mean_terminated_length": 874.3350158691406, "eval_completions/min_length": 579.92, "eval_completions/min_terminated_length": 579.92, "eval_entropy": 0.18485607638955115, "eval_frac_reward_zero_std": 0.1, "eval_kl": 0.0007376758259488269, "eval_loss": -0.0014031080063432455, "eval_num_tokens": 4864732.0, "eval_reward": 1.6413334274291993, "eval_reward_std": 0.7854073830693961, "eval_rewards/boxed_rate/mean": 0.8366666615009308, "eval_rewards/boxed_rate/std": 0.27878632068634035, "eval_rewards/correctness/mean": 0.2933333370089531, "eval_rewards/correctness/std": 0.32616410493850706, "eval_rewards/multi_component_reward/mean": 0.3146666841953993, "eval_rewards/multi_component_reward/std": 0.4994544792175293, "eval_rewards/no_answer_rate/mean": 0.16333333730697633, "eval_rewards/no_answer_rate/std": 0.27878632068634035, "eval_rewards/repetition_rate/mean": 0.03333333432674408, "eval_rewards/repetition_rate/std": 0.06964570760726929, "eval_runtime": 681.9794, "eval_samples_per_second": 0.073, "eval_sampling/importance_sampling_ratio/max": 0.09388977923779748, "eval_sampling/importance_sampling_ratio/mean": 0.029817550851148553, "eval_sampling/importance_sampling_ratio/min": 0.003447885300730625, "eval_sampling/sampling_logp_difference/max": 0.7785206460952758, "eval_sampling/sampling_logp_difference/mean": 0.014042644733563066, "eval_steps_per_second": 0.013, "step": 750 } ], "logging_steps": 5, "max_steps": 2745, "num_input_tokens_seen": 4864732, "num_train_epochs": 1, "save_steps": 150, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }