{ "best_global_step": 1000, "best_metric": 1.6066529799881295, "best_model_checkpoint": "/root/autodl-tmp/output_grpo/v2-20250628-104314/checkpoint-1000", "epoch": 0.041437036423155015, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/mean_length": 567.5, "completions/min_length": 116.0, "epoch": 4.1437036423155016e-05, "grad_norm": 4.251134312123561, "kl": 0.0, "learning_rate": 6.666666666666667e-09, "loss": -9.934107758624577e-09, "memory(GiB)": 53.72, "reward": 0.5, "reward_std": 0.5640760660171509, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.25, "rewards/FormatCorrectnessReward/std": 0.26111647486686707, "step": 1, "train_speed(iter/s)": 0.014924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1396.0833740234375, "completions/min_length": 491.0, "epoch": 8.287407284631003e-05, "grad_norm": 3.480297371684561, "kl": 0.0, "learning_rate": 1.3333333333333334e-08, "loss": -0.051413606852293015, "memory(GiB)": 66.05, "reward": 0.375, "reward_std": 0.6077155470848083, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.125, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 2, "train_speed(iter/s)": 0.010149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/mean_length": 467.91668701171875, "completions/min_length": 22.0, "epoch": 0.00012431110926946505, "grad_norm": 16.667770655647345, "kl": 0.0008344650268554688, "learning_rate": 2e-08, "loss": 3.3398471714463085e-05, "memory(GiB)": 66.07, "reward": 0.9166666865348816, "reward_std": 0.6336522102355957, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 0.25, "rewards/FormatCorrectnessReward/std": 0.26111647486686707, "step": 3, "train_speed(iter/s)": 0.012952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1035.0, "completions/mean_length": 696.0833740234375, "completions/min_length": 397.0, "epoch": 0.00016574814569262006, "grad_norm": 3.956443955937704, "kl": 0.0006437301635742188, "learning_rate": 2.6666666666666667e-08, "loss": 2.5788944185478613e-05, "memory(GiB)": 66.07, "reward": 1.0, "reward_std": 0.6030226945877075, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 0.3333333432674408, "rewards/FormatCorrectnessReward/std": 0.24618299305438995, "step": 4, "train_speed(iter/s)": 0.014749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 932.0, "completions/mean_length": 580.5833740234375, "completions/min_length": 27.0, "epoch": 0.00020718518211577507, "grad_norm": 5.291226659990936, "kl": 0.00078582763671875, "learning_rate": 3.3333333333333334e-08, "loss": 3.148118776152842e-05, "memory(GiB)": 66.07, "reward": 0.4583333432674408, "reward_std": 0.5418123602867126, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.2916666567325592, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 5, "train_speed(iter/s)": 0.016233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1054.3333740234375, "completions/min_length": 7.0, "epoch": 0.0002486222185389301, "grad_norm": 10.28074511370301, "kl": 0.0008563995361328125, "learning_rate": 4e-08, "loss": -0.06470134854316711, "memory(GiB)": 66.07, "reward": 0.375, "reward_std": 0.4826536476612091, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.2083333283662796, "rewards/FormatCorrectnessReward/std": 0.25746434926986694, "step": 6, "train_speed(iter/s)": 0.013712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1335.8333740234375, "completions/min_length": 132.0, "epoch": 0.0002900592549620851, "grad_norm": 4.022004438824417, "kl": 0.0007686614990234375, "learning_rate": 4.666666666666667e-08, "loss": -0.08479788154363632, "memory(GiB)": 66.07, "reward": 0.4583333432674408, "reward_std": 0.45016831159591675, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.45016831159591675, "step": 7, "train_speed(iter/s)": 0.012321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/mean_length": 635.25, "completions/min_length": 470.0, "epoch": 0.0003314962913852401, "grad_norm": 3.2795717644849223, "kl": 0.00093841552734375, "learning_rate": 5.3333333333333334e-08, "loss": 3.770987314055674e-05, "memory(GiB)": 66.07, "reward": 0.4583333432674408, "reward_std": 0.3964807391166687, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.375, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 8, "train_speed(iter/s)": 0.013227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 8001.0, "completions/mean_length": 1910.0, "completions/min_length": 416.0, "epoch": 0.00037293332780839516, "grad_norm": 3.0525421997992783, "kl": 0.0007457733154296875, "learning_rate": 6e-08, "loss": -0.15948531031608582, "memory(GiB)": 66.07, "reward": 0.25, "reward_std": 0.26111647486686707, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.25, "rewards/FormatCorrectnessReward/std": 0.26111647486686707, "step": 9, "train_speed(iter/s)": 0.012112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2164.0, "completions/mean_length": 787.75, "completions/min_length": 216.0, "epoch": 0.00041437036423155015, "grad_norm": 4.509607197260831, "kl": 0.0006356239318847656, "learning_rate": 6.666666666666667e-08, "loss": 2.545615097915288e-05, "memory(GiB)": 66.07, "reward": 0.9166666865348816, "reward_std": 0.7334021925926208, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.4166666567325592, "rewards/FormatCorrectnessReward/std": 0.2886751592159271, "step": 10, "train_speed(iter/s)": 0.012541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2405.0, "completions/mean_length": 913.25, "completions/min_length": 427.0, "epoch": 0.0004558074006547052, "grad_norm": 7.026908950769386, "kl": 0.0007610321044921875, "learning_rate": 7.333333333333333e-08, "loss": 3.041823765670415e-05, "memory(GiB)": 66.07, "reward": 0.75, "reward_std": 0.690849244594574, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.25, "rewards/FormatCorrectnessReward/std": 0.33709993958473206, "step": 11, "train_speed(iter/s)": 0.012881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 8001.0, "completions/mean_length": 1863.0, "completions/min_length": 417.0, "epoch": 0.0004972444370778602, "grad_norm": 2.618198486992356, "kl": 0.0009012222290039062, "learning_rate": 8e-08, "loss": -0.10793150961399078, "memory(GiB)": 66.07, "reward": 0.2916666865348816, "reward_std": 0.45016831159591675, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.2083333283662796, "rewards/FormatCorrectnessReward/std": 0.25746434926986694, "step": 12, "train_speed(iter/s)": 0.012114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1294.0, "completions/mean_length": 554.9166870117188, "completions/min_length": 59.0, "epoch": 0.0005386814735010153, "grad_norm": 15.706879681747642, "kl": 0.0010938644409179688, "learning_rate": 8.666666666666666e-08, "loss": 4.379948222776875e-05, "memory(GiB)": 66.07, "reward": 0.25, "reward_std": 0.33709993958473206, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.25, "rewards/FormatCorrectnessReward/std": 0.33709993958473206, "step": 13, "train_speed(iter/s)": 0.012576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/mean_length": 683.0, "completions/min_length": 551.0, "epoch": 0.0005801185099241702, "grad_norm": 4.66018872226416, "kl": 0.0007047653198242188, "learning_rate": 9.333333333333334e-08, "loss": 2.8272470444790088e-05, "memory(GiB)": 66.07, "reward": 1.2083333730697632, "reward_std": 0.49810245633125305, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.2916666567325592, "rewards/FormatCorrectnessReward/std": 0.33427897095680237, "step": 14, "train_speed(iter/s)": 0.013071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1084.0, "completions/mean_length": 718.0, "completions/min_length": 86.0, "epoch": 0.0006215555463473252, "grad_norm": 15.337260610808604, "kl": 0.0009489059448242188, "learning_rate": 1e-07, "loss": 3.788372123381123e-05, "memory(GiB)": 66.07, "reward": 0.375, "reward_std": 0.22613351047039032, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.375, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 15, "train_speed(iter/s)": 0.01342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1078.0, "completions/mean_length": 717.1666870117188, "completions/min_length": 207.0, "epoch": 0.0006629925827704803, "grad_norm": 4.4434145112592836, "kl": 0.0008907318115234375, "learning_rate": 1.0666666666666667e-07, "loss": 3.5752855183091015e-05, "memory(GiB)": 66.07, "reward": 0.4166666865348816, "reward_std": 0.5967081785202026, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.1666666716337204, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 16, "train_speed(iter/s)": 0.013808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 741.0, "completions/mean_length": 566.5833740234375, "completions/min_length": 47.0, "epoch": 0.0007044296191936353, "grad_norm": 5.301075895169836, "kl": 0.0007352828979492188, "learning_rate": 1.1333333333333332e-07, "loss": 2.9434761017910205e-05, "memory(GiB)": 66.07, "reward": 0.875, "reward_std": 0.7111130952835083, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.2916666567325592, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 17, "train_speed(iter/s)": 0.014247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1410.0, "completions/mean_length": 933.3333740234375, "completions/min_length": 428.0, "epoch": 0.0007458666556167903, "grad_norm": 3.588857281729449, "kl": 0.0006628036499023438, "learning_rate": 1.2e-07, "loss": 2.656877040863037e-05, "memory(GiB)": 66.07, "reward": 0.2916666865348816, "reward_std": 0.25746431946754456, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.2916666567325592, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 18, "train_speed(iter/s)": 0.014531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/mean_length": 633.9166870117188, "completions/min_length": 487.0, "epoch": 0.0007873036920399453, "grad_norm": 3.241113477703851, "kl": 0.0008869171142578125, "learning_rate": 1.2666666666666666e-07, "loss": 3.5559140087570995e-05, "memory(GiB)": 66.07, "reward": 0.5833333730697632, "reward_std": 0.358870267868042, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.4166666567325592, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 19, "train_speed(iter/s)": 0.01488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1182.75, "completions/min_length": 357.0, "epoch": 0.0008287407284631003, "grad_norm": 4.0789520410779, "kl": 0.0007047653198242188, "learning_rate": 1.3333333333333334e-07, "loss": -0.08640722930431366, "memory(GiB)": 66.07, "reward": 0.75, "reward_std": 0.7229987978935242, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.25, "rewards/FormatCorrectnessReward/std": 0.26111647486686707, "step": 20, "train_speed(iter/s)": 0.014222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1192.75, "completions/min_length": 123.0, "epoch": 0.0008701777648862553, "grad_norm": 3.9460188940386276, "kl": 0.0007143020629882812, "learning_rate": 1.4e-07, "loss": -0.07266230881214142, "memory(GiB)": 66.07, "reward": 0.2916666865348816, "reward_std": 0.33427897095680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.2916666567325592, "rewards/FormatCorrectnessReward/std": 0.33427897095680237, "step": 21, "train_speed(iter/s)": 0.013607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1118.0, "completions/mean_length": 632.0, "completions/min_length": 108.0, "epoch": 0.0009116148013094104, "grad_norm": 6.711771964047038, "kl": 0.000797271728515625, "learning_rate": 1.4666666666666666e-07, "loss": 3.197789192199707e-05, "memory(GiB)": 66.07, "reward": 0.4166666865348816, "reward_std": 0.46871843934059143, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.25, "rewards/FormatCorrectnessReward/std": 0.26111647486686707, "step": 22, "train_speed(iter/s)": 0.013886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 911.0, "completions/mean_length": 581.75, "completions/min_length": 129.0, "epoch": 0.0009530518377325654, "grad_norm": 3.7607148348198742, "kl": 0.0009088516235351562, "learning_rate": 1.533333333333333e-07, "loss": 3.6398570955498144e-05, "memory(GiB)": 66.07, "reward": 0.5, "reward_std": 0.6396021246910095, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.25, "rewards/FormatCorrectnessReward/std": 0.26111647486686707, "step": 23, "train_speed(iter/s)": 0.014006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2478.0, "completions/mean_length": 825.9166870117188, "completions/min_length": 103.0, "epoch": 0.0009944888741557204, "grad_norm": 4.577675174121393, "kl": 0.0006895065307617188, "learning_rate": 1.6e-07, "loss": 2.7671456336975098e-05, "memory(GiB)": 66.07, "reward": 0.2916666865348816, "reward_std": 0.25746431946754456, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.2916666567325592, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 24, "train_speed(iter/s)": 0.014117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/mean_length": 529.0833740234375, "completions/min_length": 31.0, "epoch": 0.0010359259105788754, "grad_norm": 23.99411453475438, "kl": 0.001010894775390625, "learning_rate": 1.6666666666666665e-07, "loss": 4.049142444273457e-05, "memory(GiB)": 66.07, "reward": 0.3333333432674408, "reward_std": 0.44381269812583923, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.25, "rewards/FormatCorrectnessReward/std": 0.26111647486686707, "step": 25, "train_speed(iter/s)": 0.014184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2794.0, "completions/mean_length": 1007.25, "completions/min_length": 180.0, "epoch": 0.0010773629470020305, "grad_norm": 29.637074698350197, "kl": 0.0008993148803710938, "learning_rate": 1.7333333333333332e-07, "loss": 3.6075711250305176e-05, "memory(GiB)": 66.07, "reward": 0.3333333432674408, "reward_std": 0.24618299305438995, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.3333333432674408, "rewards/FormatCorrectnessReward/std": 0.24618299305438995, "step": 26, "train_speed(iter/s)": 0.014225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8000.0, "completions/mean_length": 1189.166748046875, "completions/min_length": 79.0, "epoch": 0.0011187999834251854, "grad_norm": 5.392748181264161, "kl": 0.0015878677368164062, "learning_rate": 1.8e-07, "loss": -0.07440056651830673, "memory(GiB)": 66.07, "reward": 0.75, "reward_std": 0.8393720388412476, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.25, "rewards/FormatCorrectnessReward/std": 0.39886200428009033, "step": 27, "train_speed(iter/s)": 0.013784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5072.0, "completions/mean_length": 1120.75, "completions/min_length": 220.0, "epoch": 0.0011602370198483404, "grad_norm": 4.276093226276709, "kl": 0.0006837844848632812, "learning_rate": 1.8666666666666667e-07, "loss": 2.7487676561577246e-05, "memory(GiB)": 66.07, "reward": 1.0833333730697632, "reward_std": 0.7017294764518738, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.3333333432674408, "rewards/FormatCorrectnessReward/std": 0.32566946744918823, "step": 28, "train_speed(iter/s)": 0.013646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1076.0, "completions/mean_length": 707.9166870117188, "completions/min_length": 124.0, "epoch": 0.0012016740562714955, "grad_norm": 4.0991030870477445, "kl": 0.0010471343994140625, "learning_rate": 1.9333333333333332e-07, "loss": 4.184246063232422e-05, "memory(GiB)": 66.07, "reward": 0.375, "reward_std": 0.4826536476612091, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.2916666567325592, "rewards/FormatCorrectnessReward/std": 0.33427897095680237, "step": 29, "train_speed(iter/s)": 0.013857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1336.0, "completions/mean_length": 660.9166870117188, "completions/min_length": 466.0, "epoch": 0.0012431110926946504, "grad_norm": 3.241678270645231, "kl": 0.0008077621459960938, "learning_rate": 2e-07, "loss": 3.235539043089375e-05, "memory(GiB)": 66.07, "reward": 0.7916666865348816, "reward_std": 0.65568608045578, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.375, "rewards/FormatCorrectnessReward/std": 0.376889169216156, "step": 30, "train_speed(iter/s)": 0.014055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2654.0, "completions/mean_length": 787.3333740234375, "completions/min_length": 382.0, "epoch": 0.0012845481291178056, "grad_norm": 3.917837661114276, "kl": 0.0006670951843261719, "learning_rate": 2.0666666666666666e-07, "loss": 2.6673078536987305e-05, "memory(GiB)": 66.07, "reward": 0.75, "reward_std": 0.7833494544029236, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.3333333432674408, "rewards/FormatCorrectnessReward/std": 0.32566946744918823, "step": 31, "train_speed(iter/s)": 0.014125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5938.0, "completions/mean_length": 1040.416748046875, "completions/min_length": 97.0, "epoch": 0.0013259851655409605, "grad_norm": 6.683716474848267, "kl": 0.0009717941284179688, "learning_rate": 2.1333333333333334e-07, "loss": 3.894170367857441e-05, "memory(GiB)": 66.07, "reward": 1.0, "reward_std": 0.5640760660171509, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.25, "rewards/FormatCorrectnessReward/std": 0.26111647486686707, "step": 32, "train_speed(iter/s)": 0.013929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 947.0, "completions/mean_length": 660.8333740234375, "completions/min_length": 513.0, "epoch": 0.0013674222019641154, "grad_norm": 3.140195933465325, "kl": 0.0008077621459960938, "learning_rate": 2.1999999999999998e-07, "loss": 3.2444797398056835e-05, "memory(GiB)": 66.07, "reward": 0.7083333730697632, "reward_std": 0.49810245633125305, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 33, "train_speed(iter/s)": 0.014135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/mean_length": 657.1666870117188, "completions/min_length": 171.0, "epoch": 0.0014088592383872706, "grad_norm": 3.3904361943506833, "kl": 0.0007925033569335938, "learning_rate": 2.2666666666666663e-07, "loss": 3.177921098540537e-05, "memory(GiB)": 66.07, "reward": 0.1666666716337204, "reward_std": 0.24618299305438995, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.1666666716337204, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 34, "train_speed(iter/s)": 0.014101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/mean_length": 629.1666870117188, "completions/min_length": 397.0, "epoch": 0.0014502962748104255, "grad_norm": 5.147607087527442, "kl": 0.001102447509765625, "learning_rate": 2.3333333333333333e-07, "loss": 4.418691241880879e-05, "memory(GiB)": 66.07, "reward": 0.9166666865348816, "reward_std": 0.5967081785202026, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.3333333432674408, "rewards/FormatCorrectnessReward/std": 0.24618299305438995, "step": 35, "train_speed(iter/s)": 0.014296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/mean_length": 462.25, "completions/min_length": 113.0, "epoch": 0.0014917333112335807, "grad_norm": 5.668448155383745, "kl": 0.00099945068359375, "learning_rate": 2.4e-07, "loss": 3.99996861233376e-05, "memory(GiB)": 66.07, "reward": 0.625, "reward_std": 0.5690901875495911, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.2083333283662796, "rewards/FormatCorrectnessReward/std": 0.25746434926986694, "step": 36, "train_speed(iter/s)": 0.014497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/mean_length": 606.75, "completions/min_length": 106.0, "epoch": 0.0015331703476567356, "grad_norm": 6.189142197552896, "kl": 0.0009021759033203125, "learning_rate": 2.4666666666666665e-07, "loss": 3.6209821701049805e-05, "memory(GiB)": 66.07, "reward": 0.8333333730697632, "reward_std": 0.5773502588272095, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.4166666567325592, "rewards/FormatCorrectnessReward/std": 0.2886751592159271, "step": 37, "train_speed(iter/s)": 0.014685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 952.0, "completions/mean_length": 710.6666870117188, "completions/min_length": 508.0, "epoch": 0.0015746073840798905, "grad_norm": 2.5073877944080434, "kl": 0.0009222030639648438, "learning_rate": 2.533333333333333e-07, "loss": 3.691514575621113e-05, "memory(GiB)": 66.07, "reward": 1.25, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.3333333432674408, "rewards/FormatCorrectnessReward/std": 0.24618299305438995, "step": 38, "train_speed(iter/s)": 0.014852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 907.0, "completions/mean_length": 654.0833740234375, "completions/min_length": 444.0, "epoch": 0.0016160444205030457, "grad_norm": 3.582268265681403, "kl": 0.0013818740844726562, "learning_rate": 2.6e-07, "loss": 5.543231964111328e-05, "memory(GiB)": 66.07, "reward": 0.4166666865348816, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.4166666567325592, "rewards/FormatCorrectnessReward/std": 0.2886751592159271, "step": 39, "train_speed(iter/s)": 0.015021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4659.0, "completions/mean_length": 1008.4166870117188, "completions/min_length": 517.0, "epoch": 0.0016574814569262006, "grad_norm": 4.708817674277068, "kl": 0.0009174346923828125, "learning_rate": 2.6666666666666667e-07, "loss": 3.674626350402832e-05, "memory(GiB)": 66.07, "reward": 0.2916666865348816, "reward_std": 0.25746431946754456, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.2916666567325592, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 40, "train_speed(iter/s)": 0.014904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 947.0, "completions/mean_length": 699.0, "completions/min_length": 524.0, "epoch": 0.0016989184933493557, "grad_norm": 2.6751887685023066, "kl": 0.0010738372802734375, "learning_rate": 2.733333333333333e-07, "loss": 4.3054424168076366e-05, "memory(GiB)": 66.07, "reward": 1.0, "reward_std": 0.42640143632888794, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.1666666716337204, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 41, "train_speed(iter/s)": 0.015055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1126.8333740234375, "completions/min_length": 109.0, "epoch": 0.0017403555297725107, "grad_norm": 6.405001723622605, "kl": 0.0011043548583984375, "learning_rate": 2.8e-07, "loss": -0.11577056348323822, "memory(GiB)": 66.07, "reward": 0.9583333730697632, "reward_std": 0.6894772052764893, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 0.2916666567325592, "rewards/FormatCorrectnessReward/std": 0.33427897095680237, "step": 42, "train_speed(iter/s)": 0.01472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 932.0, "completions/mean_length": 769.6666870117188, "completions/min_length": 619.0, "epoch": 0.0017817925661956656, "grad_norm": 3.802307294688045, "kl": 0.0009403228759765625, "learning_rate": 2.866666666666667e-07, "loss": 3.780921542784199e-05, "memory(GiB)": 66.07, "reward": 1.0833333730697632, "reward_std": 0.5967081785202026, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 0.4166666567325592, "rewards/FormatCorrectnessReward/std": 0.2886751592159271, "step": 43, "train_speed(iter/s)": 0.014865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 421.0833435058594, "completions/min_length": 34.0, "epoch": 0.0018232296026188207, "grad_norm": 9.005292961633561, "kl": 0.008897781372070312, "learning_rate": 2.933333333333333e-07, "loss": 0.0003568629617802799, "memory(GiB)": 66.07, "reward": 0.5, "reward_std": 0.5222329497337341, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.1666666716337204, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 44, "train_speed(iter/s)": 0.01504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1469.0, "completions/mean_length": 779.8333740234375, "completions/min_length": 551.0, "epoch": 0.0018646666390419757, "grad_norm": 4.100614521566508, "kl": 0.0011348724365234375, "learning_rate": 3e-07, "loss": 4.561742389341816e-05, "memory(GiB)": 66.07, "reward": 0.7916666865348816, "reward_std": 0.65568608045578, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.2083333283662796, "rewards/FormatCorrectnessReward/std": 0.25746434926986694, "step": 45, "train_speed(iter/s)": 0.015147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/mean_length": 592.1666870117188, "completions/min_length": 276.0, "epoch": 0.0019061036754651308, "grad_norm": 3.106798377587334, "kl": 0.0013580322265625, "learning_rate": 3.066666666666666e-07, "loss": 5.443394184112549e-05, "memory(GiB)": 66.07, "reward": 0.5416666865348816, "reward_std": 0.49810245633125305, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.375, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 46, "train_speed(iter/s)": 0.015295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/mean_length": 583.3333740234375, "completions/min_length": 196.0, "epoch": 0.0019475407118882857, "grad_norm": 4.874934819624548, "kl": 0.0012979507446289062, "learning_rate": 3.1333333333333333e-07, "loss": 5.2099429012741894e-05, "memory(GiB)": 66.07, "reward": 1.0416667461395264, "reward_std": 0.6200562119483948, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 0.375, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 47, "train_speed(iter/s)": 0.015444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/mean_length": 568.0833740234375, "completions/min_length": 176.0, "epoch": 0.001988977748311441, "grad_norm": 13.093124954068271, "kl": 0.0020599365234375, "learning_rate": 3.2e-07, "loss": 8.267661178251728e-05, "memory(GiB)": 66.07, "reward": 0.4166666865348816, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.4166666567325592, "rewards/FormatCorrectnessReward/std": 0.2886751592159271, "step": 48, "train_speed(iter/s)": 0.015589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1590.0, "completions/mean_length": 752.3333740234375, "completions/min_length": 168.0, "epoch": 0.002030414784734596, "grad_norm": 5.728973363966976, "kl": 0.0014476776123046875, "learning_rate": 3.2666666666666663e-07, "loss": 5.801519000669941e-05, "memory(GiB)": 66.07, "reward": 0.5, "reward_std": 0.6741998791694641, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.25, "rewards/FormatCorrectnessReward/std": 0.33709993958473206, "step": 49, "train_speed(iter/s)": 0.015674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/mean_length": 531.75, "completions/min_length": 153.0, "epoch": 0.0020718518211577507, "grad_norm": 5.413695280591241, "kl": 0.009260177612304688, "learning_rate": 3.333333333333333e-07, "loss": 0.00037175416946411133, "memory(GiB)": 66.07, "reward": 1.125, "reward_std": 0.7423856258392334, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.375, "rewards/FormatCorrectnessReward/std": 0.376889169216156, "step": 50, "train_speed(iter/s)": 0.015804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5255.0, "completions/mean_length": 1005.75, "completions/min_length": 487.0, "epoch": 0.0021132888575809057, "grad_norm": 2.520381323826232, "kl": 0.001415252685546875, "learning_rate": 3.4000000000000003e-07, "loss": 5.658467853209004e-05, "memory(GiB)": 66.07, "reward": 1.1666667461395264, "reward_std": 0.5365433692932129, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.4166666567325592, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 51, "train_speed(iter/s)": 0.015654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/mean_length": 660.6666870117188, "completions/min_length": 448.0, "epoch": 0.002154725894004061, "grad_norm": 2.6006777496564983, "kl": 0.00151824951171875, "learning_rate": 3.4666666666666665e-07, "loss": 6.104509520810097e-05, "memory(GiB)": 66.07, "reward": 1.4583333730697632, "reward_std": 0.33427897095680237, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.5416666865348816, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 52, "train_speed(iter/s)": 0.015776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 943.0, "completions/mean_length": 749.9166870117188, "completions/min_length": 538.0, "epoch": 0.002196162930427216, "grad_norm": 2.835724880684551, "kl": 0.0018405914306640625, "learning_rate": 3.533333333333333e-07, "loss": 7.368624210357666e-05, "memory(GiB)": 66.07, "reward": 1.25, "reward_std": 0.6215815544128418, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.4166666567325592, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 53, "train_speed(iter/s)": 0.015887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/mean_length": 679.0833740234375, "completions/min_length": 445.0, "epoch": 0.002237599966850371, "grad_norm": 3.980980649654866, "kl": 0.0022430419921875, "learning_rate": 3.6e-07, "loss": 8.970499038696289e-05, "memory(GiB)": 66.07, "reward": 1.0, "reward_std": 0.5640760660171509, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.4166666567325592, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 54, "train_speed(iter/s)": 0.015999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/mean_length": 478.5833435058594, "completions/min_length": 100.0, "epoch": 0.002279037003273526, "grad_norm": 7.096851684454993, "kl": 0.03232002258300781, "learning_rate": 3.666666666666666e-07, "loss": 0.0012935003032907844, "memory(GiB)": 66.07, "reward": 1.0416667461395264, "reward_std": 0.65568608045578, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.2916666567325592, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 55, "train_speed(iter/s)": 0.016125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/mean_length": 495.16668701171875, "completions/min_length": 243.0, "epoch": 0.0023204740396966807, "grad_norm": 3.99369103317408, "kl": 0.002788543701171875, "learning_rate": 3.7333333333333334e-07, "loss": 0.00011165937030455098, "memory(GiB)": 66.07, "reward": 0.9583333730697632, "reward_std": 0.6200562119483948, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.375, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 56, "train_speed(iter/s)": 0.016244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1096.0, "completions/mean_length": 776.5, "completions/min_length": 497.0, "epoch": 0.002361911076119836, "grad_norm": 10.3909169688467, "kl": 0.00406646728515625, "learning_rate": 3.7999999999999996e-07, "loss": 0.000162382930284366, "memory(GiB)": 66.07, "reward": 1.625, "reward_std": 0.22613351047039032, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.625, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 57, "train_speed(iter/s)": 0.016333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1101.0, "completions/mean_length": 781.0833740234375, "completions/min_length": 534.0, "epoch": 0.002403348112542991, "grad_norm": 2.549978166585729, "kl": 0.00272369384765625, "learning_rate": 3.8666666666666664e-07, "loss": 0.0001091758458642289, "memory(GiB)": 66.07, "reward": 1.0833333730697632, "reward_std": 0.5573204159736633, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 0.4166666567325592, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 58, "train_speed(iter/s)": 0.016433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/mean_length": 523.8333740234375, "completions/min_length": 76.0, "epoch": 0.002444785148966146, "grad_norm": 22.796455137538743, "kl": 0.019138336181640625, "learning_rate": 3.933333333333333e-07, "loss": 0.0007684330339543521, "memory(GiB)": 66.07, "reward": 1.125, "reward_std": 0.8561276793479919, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.3964807391166687, "step": 59, "train_speed(iter/s)": 0.016539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/mean_length": 646.4166870117188, "completions/min_length": 269.0, "epoch": 0.002486222185389301, "grad_norm": 6.658686049874029, "kl": 0.00283050537109375, "learning_rate": 4e-07, "loss": 0.00011326869571348652, "memory(GiB)": 66.07, "reward": 0.4166666865348816, "reward_std": 0.19462473690509796, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.4166666567325592, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 60, "train_speed(iter/s)": 0.016638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/mean_length": 667.5, "completions/min_length": 371.0, "epoch": 0.002527659221812456, "grad_norm": 5.749147118249718, "kl": 0.003665924072265625, "learning_rate": 4.0666666666666666e-07, "loss": 0.00014700493193231523, "memory(GiB)": 66.07, "reward": 0.9166666865348816, "reward_std": 0.6685579419136047, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.30151134729385376, "step": 61, "train_speed(iter/s)": 0.016737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/mean_length": 557.8333740234375, "completions/min_length": 324.0, "epoch": 0.002569096258235611, "grad_norm": 4.379066618080734, "kl": 0.002925872802734375, "learning_rate": 4.1333333333333333e-07, "loss": 0.00011754035949707031, "memory(GiB)": 66.07, "reward": 0.5833333730697632, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.4166666567325592, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 62, "train_speed(iter/s)": 0.016845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/mean_length": 695.9166870117188, "completions/min_length": 531.0, "epoch": 0.002610533294658766, "grad_norm": 2.6899006962779484, "kl": 0.005306243896484375, "learning_rate": 4.1999999999999995e-07, "loss": 0.0002126296458300203, "memory(GiB)": 66.07, "reward": 1.5, "reward_std": 0.36927446722984314, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.5833333134651184, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 63, "train_speed(iter/s)": 0.016938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1127.0, "completions/mean_length": 681.5833740234375, "completions/min_length": 368.0, "epoch": 0.002651970331081921, "grad_norm": 5.223416427726243, "kl": 0.003631591796875, "learning_rate": 4.266666666666667e-07, "loss": 0.00014507770538330078, "memory(GiB)": 66.07, "reward": 1.25, "reward_std": 0.5, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.4166666567325592, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 64, "train_speed(iter/s)": 0.017021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/mean_length": 607.1666870117188, "completions/min_length": 442.0, "epoch": 0.002693407367505076, "grad_norm": 2.971422843100217, "kl": 0.0033111572265625, "learning_rate": 4.3333333333333335e-07, "loss": 0.00013267993927001953, "memory(GiB)": 66.07, "reward": 0.625, "reward_std": 0.22613351047039032, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.5416666865348816, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 65, "train_speed(iter/s)": 0.017094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/mean_length": 630.75, "completions/min_length": 468.0, "epoch": 0.002734844403928231, "grad_norm": 3.119537523668892, "kl": 0.004974365234375, "learning_rate": 4.3999999999999997e-07, "loss": 0.00019977490592282265, "memory(GiB)": 66.07, "reward": 1.5833333730697632, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.5833333134651184, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 66, "train_speed(iter/s)": 0.01719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/mean_length": 528.1666870117188, "completions/min_length": 173.0, "epoch": 0.0027762814403513862, "grad_norm": 6.380964537171133, "kl": 0.005924224853515625, "learning_rate": 4.4666666666666664e-07, "loss": 0.00023743510246276855, "memory(GiB)": 66.07, "reward": 0.8333333730697632, "reward_std": 0.6154574751853943, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.4166666567325592, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 67, "train_speed(iter/s)": 0.017296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/mean_length": 600.75, "completions/min_length": 478.0, "epoch": 0.002817718476774541, "grad_norm": 2.879034665446175, "kl": 0.0089263916015625, "learning_rate": 4.5333333333333326e-07, "loss": 0.00035685300827026367, "memory(GiB)": 66.07, "reward": 1.25, "reward_std": 0.5, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.21320071816444397, "step": 68, "train_speed(iter/s)": 0.017402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 952.0, "completions/mean_length": 631.3333740234375, "completions/min_length": 505.0, "epoch": 0.002859155513197696, "grad_norm": 2.6406154106161583, "kl": 0.004638671875, "learning_rate": 4.6e-07, "loss": 0.00018599629402160645, "memory(GiB)": 66.07, "reward": 1.4166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.0, "step": 69, "train_speed(iter/s)": 0.017487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 940.0, "completions/mean_length": 526.4166870117188, "completions/min_length": 112.0, "epoch": 0.002900592549620851, "grad_norm": 6.0916444963242355, "kl": 0.0066070556640625, "learning_rate": 4.6666666666666666e-07, "loss": 0.00026523074484430254, "memory(GiB)": 66.07, "reward": 0.7916666865348816, "reward_std": 0.7525210380554199, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.375, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 70, "train_speed(iter/s)": 0.017573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/mean_length": 647.9166870117188, "completions/min_length": 463.0, "epoch": 0.002942029586044006, "grad_norm": 3.968304315474521, "kl": 0.0046539306640625, "learning_rate": 4.733333333333333e-07, "loss": 0.00018650293350219727, "memory(GiB)": 66.07, "reward": 0.4583333432674408, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 71, "train_speed(iter/s)": 0.017662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/mean_length": 484.25, "completions/min_length": 104.0, "epoch": 0.0029834666224671613, "grad_norm": 15.150893744744794, "kl": 0.02649688720703125, "learning_rate": 4.8e-07, "loss": 0.0010597507935017347, "memory(GiB)": 66.07, "reward": 0.9583333730697632, "reward_std": 0.6200562119483948, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.2083333283662796, "rewards/FormatCorrectnessReward/std": 0.25746434926986694, "step": 72, "train_speed(iter/s)": 0.017754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1376.0, "completions/mean_length": 832.6666870117188, "completions/min_length": 290.0, "epoch": 0.0030249036588903162, "grad_norm": 4.297154995458008, "kl": 0.005619049072265625, "learning_rate": 4.866666666666666e-07, "loss": 0.00022564332175534219, "memory(GiB)": 66.07, "reward": 0.4166666865348816, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.4166666567325592, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 73, "train_speed(iter/s)": 0.01781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4899.0, "completions/mean_length": 990.4166870117188, "completions/min_length": 488.0, "epoch": 0.003066340695313471, "grad_norm": 2.2254967648370942, "kl": 0.00585174560546875, "learning_rate": 4.933333333333333e-07, "loss": 0.00023430585861206055, "memory(GiB)": 66.07, "reward": 1.4583333730697632, "reward_std": 0.49810245633125305, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.5416666865348816, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 74, "train_speed(iter/s)": 0.017663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1077.0, "completions/mean_length": 649.0, "completions/min_length": 515.0, "epoch": 0.003107777731736626, "grad_norm": 3.5978369740059977, "kl": 0.005462646484375, "learning_rate": 5e-07, "loss": 0.00021849572658538818, "memory(GiB)": 66.07, "reward": 1.25, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.0, "step": 75, "train_speed(iter/s)": 0.017732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/mean_length": 587.1666870117188, "completions/min_length": 502.0, "epoch": 0.003149214768159781, "grad_norm": 3.216587659511788, "kl": 0.0064697265625, "learning_rate": 5.066666666666667e-07, "loss": 0.00025932988501153886, "memory(GiB)": 66.07, "reward": 1.5416667461395264, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.5416666865348816, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 76, "train_speed(iter/s)": 0.017817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 899.0, "completions/mean_length": 656.75, "completions/min_length": 470.0, "epoch": 0.0031906518045829364, "grad_norm": 3.0602107194517982, "kl": 0.00669097900390625, "learning_rate": 5.133333333333333e-07, "loss": 0.000267157971393317, "memory(GiB)": 66.07, "reward": 0.625, "reward_std": 0.3107907772064209, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 77, "train_speed(iter/s)": 0.017892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/mean_length": 615.9166870117188, "completions/min_length": 444.0, "epoch": 0.0032320888410060913, "grad_norm": 7.907617519965772, "kl": 0.008544921875, "learning_rate": 5.2e-07, "loss": 0.00034206113195978105, "memory(GiB)": 66.07, "reward": 1.4583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 78, "train_speed(iter/s)": 0.017957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/mean_length": 550.5, "completions/min_length": 414.0, "epoch": 0.0032735258774292462, "grad_norm": 3.6856333658334566, "kl": 0.00864410400390625, "learning_rate": 5.266666666666666e-07, "loss": 0.00034634274197742343, "memory(GiB)": 66.07, "reward": 1.1666667461395264, "reward_std": 0.44381269812583923, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.25, "rewards/FormatCorrectnessReward/std": 0.26111647486686707, "step": 79, "train_speed(iter/s)": 0.018038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/mean_length": 536.75, "completions/min_length": 156.0, "epoch": 0.003314962913852401, "grad_norm": 7.58146532204521, "kl": 0.0096435546875, "learning_rate": 5.333333333333333e-07, "loss": 0.0003863175807055086, "memory(GiB)": 66.07, "reward": 1.3333333730697632, "reward_std": 0.44381269812583923, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.4166666567325592, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 80, "train_speed(iter/s)": 0.01812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/mean_length": 557.25, "completions/min_length": 412.0, "epoch": 0.003356399950275556, "grad_norm": 2.932344538040059, "kl": 0.0109710693359375, "learning_rate": 5.4e-07, "loss": 0.0004397531447466463, "memory(GiB)": 66.07, "reward": 1.5833333730697632, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.5833333134651184, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 81, "train_speed(iter/s)": 0.018205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1972.0, "completions/mean_length": 785.6666870117188, "completions/min_length": 505.0, "epoch": 0.0033978369866987114, "grad_norm": 4.990141559999906, "kl": 0.0121002197265625, "learning_rate": 5.466666666666666e-07, "loss": 0.0004835675354115665, "memory(GiB)": 66.07, "reward": 0.5416666865348816, "reward_std": 0.33427897095680237, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 82, "train_speed(iter/s)": 0.018225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 804.0, "completions/mean_length": 644.4166870117188, "completions/min_length": 465.0, "epoch": 0.0034392740231218664, "grad_norm": 3.114596600039455, "kl": 0.0129547119140625, "learning_rate": 5.533333333333334e-07, "loss": 0.0005184511537663639, "memory(GiB)": 66.07, "reward": 0.5833333730697632, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.0, "step": 83, "train_speed(iter/s)": 0.018298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/mean_length": 654.0, "completions/min_length": 486.0, "epoch": 0.0034807110595450213, "grad_norm": 2.5994775357579565, "kl": 0.0124359130859375, "learning_rate": 5.6e-07, "loss": 0.000498871027957648, "memory(GiB)": 66.07, "reward": 1.4166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.0, "step": 84, "train_speed(iter/s)": 0.018365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 904.0, "completions/mean_length": 637.1666870117188, "completions/min_length": 472.0, "epoch": 0.0035221480959681762, "grad_norm": 3.1615463426626538, "kl": 0.0103302001953125, "learning_rate": 5.666666666666666e-07, "loss": 0.00041269761277362704, "memory(GiB)": 66.07, "reward": 1.3333333730697632, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.0, "step": 85, "train_speed(iter/s)": 0.018437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/mean_length": 503.3333435058594, "completions/min_length": 111.0, "epoch": 0.003563585132391331, "grad_norm": 4.771699729827081, "kl": 0.013580322265625, "learning_rate": 5.733333333333334e-07, "loss": 0.000542630790732801, "memory(GiB)": 66.07, "reward": 0.5416666865348816, "reward_std": 0.49810245633125305, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.375, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 86, "train_speed(iter/s)": 0.018506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/mean_length": 631.75, "completions/min_length": 445.0, "epoch": 0.0036050221688144865, "grad_norm": 4.0737128024586795, "kl": 0.019805908203125, "learning_rate": 5.8e-07, "loss": 0.0007919868221506476, "memory(GiB)": 66.07, "reward": 0.625, "reward_std": 0.4330126941204071, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.38924944400787354, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 87, "train_speed(iter/s)": 0.018575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/mean_length": 513.4166870117188, "completions/min_length": 370.0, "epoch": 0.0036464592052376415, "grad_norm": 0.11631106082682097, "kl": 0.0143890380859375, "learning_rate": 5.866666666666666e-07, "loss": 0.0005760723724961281, "memory(GiB)": 66.07, "reward": 1.5, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.0, "step": 88, "train_speed(iter/s)": 0.018653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1290.666748046875, "completions/min_length": 422.0, "epoch": 0.0036878962416607964, "grad_norm": 2.8479731636667815, "kl": 0.01220703125, "learning_rate": 5.933333333333334e-07, "loss": -0.1276664435863495, "memory(GiB)": 66.07, "reward": 0.875, "reward_std": 0.5690901875495911, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 89, "train_speed(iter/s)": 0.018358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/mean_length": 546.1666870117188, "completions/min_length": 468.0, "epoch": 0.0037293332780839513, "grad_norm": 2.9307960970159046, "kl": 0.016265869140625, "learning_rate": 6e-07, "loss": 0.0006521543255075812, "memory(GiB)": 66.07, "reward": 1.4166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.0, "step": 90, "train_speed(iter/s)": 0.018434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/mean_length": 587.4166870117188, "completions/min_length": 429.0, "epoch": 0.0037707703145071067, "grad_norm": 3.193619217616899, "kl": 0.0205841064453125, "learning_rate": 6.066666666666666e-07, "loss": 0.0008220424642786384, "memory(GiB)": 66.07, "reward": 1.1666667461395264, "reward_std": 0.6154574751853943, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.21320071816444397, "step": 91, "train_speed(iter/s)": 0.018499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/mean_length": 564.25, "completions/min_length": 37.0, "epoch": 0.0038122073509302616, "grad_norm": 11.529973419237903, "kl": 0.0183563232421875, "learning_rate": 6.133333333333332e-07, "loss": 0.0007351140375249088, "memory(GiB)": 66.07, "reward": 1.25, "reward_std": 0.5838742256164551, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.21320071816444397, "step": 92, "train_speed(iter/s)": 0.018565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 848.0, "completions/mean_length": 635.0, "completions/min_length": 427.0, "epoch": 0.0038536443873534165, "grad_norm": 0.093793043799284, "kl": 0.0116119384765625, "learning_rate": 6.2e-07, "loss": 0.00046533718705177307, "memory(GiB)": 66.07, "reward": 0.5, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.0, "step": 93, "train_speed(iter/s)": 0.018628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 890.0, "completions/mean_length": 630.9166870117188, "completions/min_length": 534.0, "epoch": 0.0038950814237765715, "grad_norm": 4.433546656858424, "kl": 0.01312255859375, "learning_rate": 6.266666666666667e-07, "loss": 0.0005258222809061408, "memory(GiB)": 66.07, "reward": 0.4166666865348816, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.4166666567325592, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 94, "train_speed(iter/s)": 0.018684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 899.0, "completions/mean_length": 686.0, "completions/min_length": 466.0, "epoch": 0.003936518460199726, "grad_norm": 2.98551395591005, "kl": 0.0157470703125, "learning_rate": 6.333333333333332e-07, "loss": 0.0006310641765594482, "memory(GiB)": 66.07, "reward": 0.4583333432674408, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 95, "train_speed(iter/s)": 0.018739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/mean_length": 556.5833740234375, "completions/min_length": 129.0, "epoch": 0.003977955496622882, "grad_norm": 17.04752070532852, "kl": 0.0480194091796875, "learning_rate": 6.4e-07, "loss": 0.0019158025970682502, "memory(GiB)": 66.07, "reward": 1.4583333730697632, "reward_std": 0.5418123602867126, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.5416666865348816, "rewards/FormatCorrectnessReward/std": 0.33427897095680237, "step": 96, "train_speed(iter/s)": 0.018798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1251.8333740234375, "completions/min_length": 537.0, "epoch": 0.004019392533046036, "grad_norm": 1.6706634848693542, "kl": 0.0148162841796875, "learning_rate": 6.466666666666666e-07, "loss": -0.26389080286026, "memory(GiB)": 66.07, "reward": 0.4583333432674408, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 97, "train_speed(iter/s)": 0.01852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/mean_length": 675.4166870117188, "completions/min_length": 468.0, "epoch": 0.004060829569469192, "grad_norm": 2.7967214204320117, "kl": 0.016998291015625, "learning_rate": 6.533333333333333e-07, "loss": 0.0006808936595916748, "memory(GiB)": 66.07, "reward": 1.25, "reward_std": 0.39886200428009033, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 0.5833333134651184, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 98, "train_speed(iter/s)": 0.018583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/mean_length": 600.5833740234375, "completions/min_length": 435.0, "epoch": 0.004102266605892347, "grad_norm": 3.76425580278462, "kl": 0.0165252685546875, "learning_rate": 6.6e-07, "loss": 0.0006601115455850959, "memory(GiB)": 66.07, "reward": 1.0416667461395264, "reward_std": 0.45016831159591675, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 99, "train_speed(iter/s)": 0.018642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/mean_length": 595.3333740234375, "completions/min_length": 345.0, "epoch": 0.0041437036423155015, "grad_norm": 3.3925870657328505, "kl": 0.0143280029296875, "learning_rate": 6.666666666666666e-07, "loss": 0.0005736897583119571, "memory(GiB)": 66.07, "reward": 1.2083333730697632, "reward_std": 0.45016831159591675, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 100, "train_speed(iter/s)": 0.018693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/mean_length": 628.0, "completions/min_length": 535.0, "epoch": 0.004185140678738657, "grad_norm": 3.1117402551912776, "kl": 0.01904296875, "learning_rate": 6.733333333333333e-07, "loss": 0.0007610222091898322, "memory(GiB)": 66.07, "reward": 1.3333333730697632, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.0, "step": 101, "train_speed(iter/s)": 0.018754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/mean_length": 617.1666870117188, "completions/min_length": 527.0, "epoch": 0.004226577715161811, "grad_norm": 5.251419169419873, "kl": 0.022369384765625, "learning_rate": 6.800000000000001e-07, "loss": 0.0008959969272837043, "memory(GiB)": 66.07, "reward": 1.2916667461395264, "reward_std": 0.49810245633125305, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 102, "train_speed(iter/s)": 0.018814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/mean_length": 635.4166870117188, "completions/min_length": 537.0, "epoch": 0.004268014751584967, "grad_norm": 3.0174265711678685, "kl": 0.0189208984375, "learning_rate": 6.866666666666666e-07, "loss": 0.0007577340002171695, "memory(GiB)": 66.07, "reward": 1.0833333730697632, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.0, "step": 103, "train_speed(iter/s)": 0.01887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/mean_length": 595.5, "completions/min_length": 463.0, "epoch": 0.004309451788008122, "grad_norm": 0.1645444525319701, "kl": 0.020355224609375, "learning_rate": 6.933333333333333e-07, "loss": 0.0008141921716742218, "memory(GiB)": 66.07, "reward": 1.5, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.0, "step": 104, "train_speed(iter/s)": 0.018929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1106.0, "completions/mean_length": 743.9166870117188, "completions/min_length": 513.0, "epoch": 0.0043508888244312765, "grad_norm": 2.865869341048057, "kl": 0.0208740234375, "learning_rate": 7e-07, "loss": 0.0008315245504491031, "memory(GiB)": 66.07, "reward": 1.125, "reward_std": 0.5690901875495911, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 105, "train_speed(iter/s)": 0.01897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/mean_length": 559.3333740234375, "completions/min_length": 398.0, "epoch": 0.004392325860854432, "grad_norm": 3.612311431113161, "kl": 0.020263671875, "learning_rate": 7.066666666666666e-07, "loss": 0.0008110702037811279, "memory(GiB)": 66.07, "reward": 1.5833333730697632, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.5833333134651184, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 106, "train_speed(iter/s)": 0.019035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 894.0, "completions/mean_length": 677.4166870117188, "completions/min_length": 473.0, "epoch": 0.004433762897277586, "grad_norm": 3.7860328064974462, "kl": 0.017669677734375, "learning_rate": 7.133333333333333e-07, "loss": 0.0007055699825286865, "memory(GiB)": 66.07, "reward": 1.125, "reward_std": 0.5690901875495911, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 107, "train_speed(iter/s)": 0.019087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/mean_length": 584.5833740234375, "completions/min_length": 387.0, "epoch": 0.004475199933700742, "grad_norm": 3.629988910541084, "kl": 0.0182037353515625, "learning_rate": 7.2e-07, "loss": 0.000728373765014112, "memory(GiB)": 66.07, "reward": 0.9166666865348816, "reward_std": 0.6336522102355957, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.4166666567325592, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 108, "train_speed(iter/s)": 0.019144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1068.0, "completions/mean_length": 573.25, "completions/min_length": 164.0, "epoch": 0.004516636970123897, "grad_norm": 5.728455402089688, "kl": 0.0264892578125, "learning_rate": 7.266666666666667e-07, "loss": 0.0010626118164509535, "memory(GiB)": 66.07, "reward": 1.25, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.3333333432674408, "rewards/FormatCorrectnessReward/std": 0.24618299305438995, "step": 109, "train_speed(iter/s)": 0.019188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1225.5833740234375, "completions/min_length": 205.0, "epoch": 0.004558074006547052, "grad_norm": 6.522672730931974, "kl": 0.036834716796875, "learning_rate": 7.333333333333332e-07, "loss": -0.030299659818410873, "memory(GiB)": 66.07, "reward": 1.25, "reward_std": 0.6571287512779236, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.4166666567325592, "rewards/FormatCorrectnessReward/std": 0.3588702976703644, "step": 110, "train_speed(iter/s)": 0.018935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/mean_length": 704.25, "completions/min_length": 509.0, "epoch": 0.004599511042970207, "grad_norm": 3.883047969540562, "kl": 0.022369384765625, "learning_rate": 7.4e-07, "loss": 0.0008935531368479133, "memory(GiB)": 66.07, "reward": 0.7916666865348816, "reward_std": 0.45016831159591675, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.5416666865348816, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 111, "train_speed(iter/s)": 0.018979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1035.0, "completions/mean_length": 613.9166870117188, "completions/min_length": 401.0, "epoch": 0.0046409480793933615, "grad_norm": 3.537700255895571, "kl": 0.019012451171875, "learning_rate": 7.466666666666667e-07, "loss": 0.0007591446628794074, "memory(GiB)": 66.07, "reward": 0.9583333730697632, "reward_std": 0.5822500586509705, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 112, "train_speed(iter/s)": 0.019026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 868.0, "completions/mean_length": 660.0, "completions/min_length": 449.0, "epoch": 0.004682385115816517, "grad_norm": 2.721489341642119, "kl": 0.02130126953125, "learning_rate": 7.533333333333332e-07, "loss": 0.0008497635717503726, "memory(GiB)": 66.07, "reward": 1.4583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 113, "train_speed(iter/s)": 0.019075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/mean_length": 636.5, "completions/min_length": 526.0, "epoch": 0.004723822152239672, "grad_norm": 3.5479515174665996, "kl": 0.02264404296875, "learning_rate": 7.599999999999999e-07, "loss": 0.0009052356472238898, "memory(GiB)": 66.07, "reward": 1.2916667461395264, "reward_std": 0.49810245633125305, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.5416666865348816, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 114, "train_speed(iter/s)": 0.019124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 847.0, "completions/mean_length": 620.0, "completions/min_length": 439.0, "epoch": 0.004765259188662827, "grad_norm": 39.86429341708531, "kl": 0.1536865234375, "learning_rate": 7.666666666666667e-07, "loss": 0.006150202825665474, "memory(GiB)": 66.07, "reward": 0.5, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.0, "step": 115, "train_speed(iter/s)": 0.019174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/mean_length": 570.6666870117188, "completions/min_length": 85.0, "epoch": 0.004806696225085982, "grad_norm": 12.148964912691604, "kl": 0.020172119140625, "learning_rate": 7.733333333333333e-07, "loss": 0.0008050104370340705, "memory(GiB)": 66.07, "reward": 1.3333333730697632, "reward_std": 0.4923659861087799, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.4166666567325592, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 116, "train_speed(iter/s)": 0.019223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/mean_length": 645.4166870117188, "completions/min_length": 434.0, "epoch": 0.0048481332615091365, "grad_norm": 0.14122559806131313, "kl": 0.026275634765625, "learning_rate": 7.799999999999999e-07, "loss": 0.0010503333760425448, "memory(GiB)": 66.07, "reward": 0.5, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.0, "step": 117, "train_speed(iter/s)": 0.019267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 908.0, "completions/mean_length": 717.1666870117188, "completions/min_length": 489.0, "epoch": 0.004889570297932292, "grad_norm": 4.738923029271301, "kl": 0.028228759765625, "learning_rate": 7.866666666666666e-07, "loss": 0.0011288325767964125, "memory(GiB)": 66.07, "reward": 1.5, "reward_std": 0.36927446722984314, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.5833333134651184, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 118, "train_speed(iter/s)": 0.019307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/mean_length": 607.1666870117188, "completions/min_length": 305.0, "epoch": 0.004931007334355447, "grad_norm": 3.690256474550571, "kl": 0.025482177734375, "learning_rate": 7.933333333333333e-07, "loss": 0.0010206103324890137, "memory(GiB)": 66.07, "reward": 0.5416666865348816, "reward_std": 0.33427897095680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.5416666865348816, "rewards/FormatCorrectnessReward/std": 0.33427897095680237, "step": 119, "train_speed(iter/s)": 0.019355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/mean_length": 545.5, "completions/min_length": 43.0, "epoch": 0.004972444370778602, "grad_norm": 10.537427928208707, "kl": 0.032470703125, "learning_rate": 8e-07, "loss": 0.0012996842851862311, "memory(GiB)": 66.07, "reward": 1.3333333730697632, "reward_std": 0.4923659861087799, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.4166666567325592, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 120, "train_speed(iter/s)": 0.019401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/mean_length": 547.8333740234375, "completions/min_length": 468.0, "epoch": 0.005013881407201757, "grad_norm": 13.175608320447324, "kl": 0.194732666015625, "learning_rate": 8.066666666666666e-07, "loss": 0.007812162395566702, "memory(GiB)": 66.07, "reward": 1.4583333730697632, "reward_std": 0.25746431946754456, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 121, "train_speed(iter/s)": 0.019447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/mean_length": 665.8333740234375, "completions/min_length": 434.0, "epoch": 0.005055318443624912, "grad_norm": 2.908893775276258, "kl": 0.02740478515625, "learning_rate": 8.133333333333333e-07, "loss": 0.0010974109172821045, "memory(GiB)": 66.07, "reward": 0.6666666865348816, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.0, "step": 122, "train_speed(iter/s)": 0.019494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/mean_length": 623.1666870117188, "completions/min_length": 451.0, "epoch": 0.005096755480048067, "grad_norm": 3.3980546161394107, "kl": 0.02789306640625, "learning_rate": 8.199999999999999e-07, "loss": 0.0011176567059010267, "memory(GiB)": 66.07, "reward": 0.5833333730697632, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.0, "step": 123, "train_speed(iter/s)": 0.01947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/mean_length": 623.4166870117188, "completions/min_length": 475.0, "epoch": 0.005138192516471222, "grad_norm": 4.233280454218966, "kl": 0.029052734375, "learning_rate": 8.266666666666667e-07, "loss": 0.0011614065151661634, "memory(GiB)": 66.07, "reward": 1.0833333730697632, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.0, "step": 124, "train_speed(iter/s)": 0.01951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/mean_length": 563.1666870117188, "completions/min_length": 387.0, "epoch": 0.005179629552894377, "grad_norm": 3.331392609921104, "kl": 0.035369873046875, "learning_rate": 8.333333333333333e-07, "loss": 0.0014179646968841553, "memory(GiB)": 66.07, "reward": 0.5833333730697632, "reward_std": 0.358870267868042, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.21320071816444397, "step": 125, "train_speed(iter/s)": 0.01956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1254.166748046875, "completions/min_length": 459.0, "epoch": 0.005221066589317532, "grad_norm": 2.4423539971307497, "kl": 0.032257080078125, "learning_rate": 8.399999999999999e-07, "loss": -0.16351088881492615, "memory(GiB)": 66.07, "reward": 1.125, "reward_std": 0.5690901875495911, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 126, "train_speed(iter/s)": 0.01932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 783.0, "completions/mean_length": 615.0833740234375, "completions/min_length": 484.0, "epoch": 0.005262503625740687, "grad_norm": 3.495298415231956, "kl": 0.031646728515625, "learning_rate": 8.466666666666667e-07, "loss": 0.001262883422896266, "memory(GiB)": 66.07, "reward": 0.5833333730697632, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.21320071816444397, "step": 127, "train_speed(iter/s)": 0.019367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/mean_length": 733.5, "completions/min_length": 536.0, "epoch": 0.005303940662163842, "grad_norm": 3.4224601956880347, "kl": 0.0347900390625, "learning_rate": 8.533333333333334e-07, "loss": 0.0013934423914179206, "memory(GiB)": 66.07, "reward": 0.8333333730697632, "reward_std": 0.4923659861087799, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.0, "step": 128, "train_speed(iter/s)": 0.019405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/mean_length": 575.6666870117188, "completions/min_length": 472.0, "epoch": 0.005345377698586997, "grad_norm": 20.34240114673033, "kl": 0.095458984375, "learning_rate": 8.599999999999999e-07, "loss": 0.003813415765762329, "memory(GiB)": 66.07, "reward": 1.5, "reward_std": 0.30151134729385376, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.5833333134651184, "rewards/FormatCorrectnessReward/std": 0.2886751592159271, "step": 129, "train_speed(iter/s)": 0.019457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 908.0, "completions/mean_length": 652.25, "completions/min_length": 500.0, "epoch": 0.005386814735010152, "grad_norm": 3.0673381274898244, "kl": 0.035125732421875, "learning_rate": 8.666666666666667e-07, "loss": 0.0014051845064386725, "memory(GiB)": 66.07, "reward": 1.2083333730697632, "reward_std": 0.45016831159591675, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 0.5416666865348816, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 130, "train_speed(iter/s)": 0.019497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/mean_length": 752.8333740234375, "completions/min_length": 447.0, "epoch": 0.005428251771433307, "grad_norm": 61.869237938340234, "kl": 0.092864990234375, "learning_rate": 8.733333333333333e-07, "loss": 0.00371290254406631, "memory(GiB)": 66.07, "reward": 0.5833333730697632, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.0, "step": 131, "train_speed(iter/s)": 0.019532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/mean_length": 592.0, "completions/min_length": 415.0, "epoch": 0.005469688807856462, "grad_norm": 3.5296080969883428, "kl": 0.03472900390625, "learning_rate": 8.799999999999999e-07, "loss": 0.001385678886435926, "memory(GiB)": 66.07, "reward": 0.8333333730697632, "reward_std": 0.4923659861087799, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.0, "step": 132, "train_speed(iter/s)": 0.01958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1247.0833740234375, "completions/min_length": 118.0, "epoch": 0.005511125844279617, "grad_norm": 9.07333202958478, "kl": 0.0386962890625, "learning_rate": 8.866666666666667e-07, "loss": -0.14688026905059814, "memory(GiB)": 66.07, "reward": 0.4583333432674408, "reward_std": 0.25746431946754456, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 133, "train_speed(iter/s)": 0.019355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/mean_length": 636.25, "completions/min_length": 277.0, "epoch": 0.0055525628807027725, "grad_norm": 3.4954765055703723, "kl": 0.045684814453125, "learning_rate": 8.933333333333333e-07, "loss": 0.0018233160953968763, "memory(GiB)": 66.07, "reward": 1.2916667461395264, "reward_std": 0.49810245633125305, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 134, "train_speed(iter/s)": 0.019394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/mean_length": 578.1666870117188, "completions/min_length": 128.0, "epoch": 0.005593999917125927, "grad_norm": 29.014944923170976, "kl": 0.039947509765625, "learning_rate": 9e-07, "loss": 0.0015988945960998535, "memory(GiB)": 66.07, "reward": 1.4166667461395264, "reward_std": 0.46871843934059143, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.21320071816444397, "step": 135, "train_speed(iter/s)": 0.019398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 917.0, "completions/mean_length": 694.0, "completions/min_length": 536.0, "epoch": 0.005635436953549082, "grad_norm": 3.318883819605055, "kl": 0.04339599609375, "learning_rate": 9.066666666666665e-07, "loss": 0.0017341971397399902, "memory(GiB)": 66.07, "reward": 1.5416667461395264, "reward_std": 0.5822500586509705, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 136, "train_speed(iter/s)": 0.019436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4796.0, "completions/mean_length": 986.3333740234375, "completions/min_length": 469.0, "epoch": 0.005676873989972237, "grad_norm": 3.508837229544901, "kl": 0.03424072265625, "learning_rate": 9.133333333333333e-07, "loss": 0.0013690193882212043, "memory(GiB)": 66.07, "reward": 1.25, "reward_std": 0.5435572862625122, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.30151134729385376, "step": 137, "train_speed(iter/s)": 0.019339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/mean_length": 659.1666870117188, "completions/min_length": 523.0, "epoch": 0.005718311026395392, "grad_norm": 2.401680106128053, "kl": 0.0369873046875, "learning_rate": 9.2e-07, "loss": 0.0014791786670684814, "memory(GiB)": 66.07, "reward": 1.5416667461395264, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.5416666865348816, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 138, "train_speed(iter/s)": 0.019382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/mean_length": 676.0833740234375, "completions/min_length": 521.0, "epoch": 0.0057597480628185475, "grad_norm": 7.659663933791879, "kl": 0.0439453125, "learning_rate": 9.266666666666665e-07, "loss": 0.0017611980438232422, "memory(GiB)": 66.07, "reward": 1.4583333730697632, "reward_std": 0.33427897095680237, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.5416666865348816, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 139, "train_speed(iter/s)": 0.019422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/mean_length": 636.8333740234375, "completions/min_length": 468.0, "epoch": 0.005801185099241702, "grad_norm": 3.0126857820090067, "kl": 0.04144287109375, "learning_rate": 9.333333333333333e-07, "loss": 0.001662199734710157, "memory(GiB)": 66.07, "reward": 1.0833333730697632, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.0, "step": 140, "train_speed(iter/s)": 0.019458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 940.0, "completions/mean_length": 704.75, "completions/min_length": 472.0, "epoch": 0.005842622135664857, "grad_norm": 3.018896044230485, "kl": 0.04156494140625, "learning_rate": 9.399999999999999e-07, "loss": 0.001663828152231872, "memory(GiB)": 66.07, "reward": 0.5, "reward_std": 0.21320071816444397, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.21320071816444397, "step": 141, "train_speed(iter/s)": 0.019491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/mean_length": 621.25, "completions/min_length": 407.0, "epoch": 0.005884059172088012, "grad_norm": 2.9271525053841483, "kl": 0.0484619140625, "learning_rate": 9.466666666666666e-07, "loss": 0.001942416070960462, "memory(GiB)": 66.07, "reward": 1.7083333730697632, "reward_std": 0.25746431946754456, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.7083333134651184, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 142, "train_speed(iter/s)": 0.01953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 837.0, "completions/mean_length": 667.25, "completions/min_length": 509.0, "epoch": 0.005925496208511167, "grad_norm": 3.509369232675446, "kl": 0.0423583984375, "learning_rate": 9.533333333333333e-07, "loss": 0.0016921162605285645, "memory(GiB)": 66.07, "reward": 0.7916666865348816, "reward_std": 0.5418123602867126, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 143, "train_speed(iter/s)": 0.019568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/mean_length": 712.0833740234375, "completions/min_length": 550.0, "epoch": 0.005966933244934323, "grad_norm": 4.2564632891755245, "kl": 0.04913330078125, "learning_rate": 9.6e-07, "loss": 0.0019713491201400757, "memory(GiB)": 66.07, "reward": 0.875, "reward_std": 0.4826536476612091, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.5416666865348816, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 144, "train_speed(iter/s)": 0.019599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/mean_length": 663.25, "completions/min_length": 495.0, "epoch": 0.006008370281357477, "grad_norm": 29.451963083934988, "kl": 0.07623291015625, "learning_rate": 9.666666666666666e-07, "loss": 0.0030578970909118652, "memory(GiB)": 66.07, "reward": 1.5833333730697632, "reward_std": 0.358870267868042, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.5833333134651184, "rewards/FormatCorrectnessReward/std": 0.3588702976703644, "step": 145, "train_speed(iter/s)": 0.019637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/mean_length": 623.1666870117188, "completions/min_length": 477.0, "epoch": 0.0060498073177806325, "grad_norm": 2.7928842436659473, "kl": 0.05426025390625, "learning_rate": 9.733333333333333e-07, "loss": 0.0021690726280212402, "memory(GiB)": 66.07, "reward": 0.4166666865348816, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.4166666567325592, "rewards/FormatCorrectnessReward/std": 0.2886751592159271, "step": 146, "train_speed(iter/s)": 0.019677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8006.0, "completions/mean_length": 1248.166748046875, "completions/min_length": 511.0, "epoch": 0.006091244354203787, "grad_norm": 2.958471192571064, "kl": 0.04931640625, "learning_rate": 9.8e-07, "loss": -0.11842846870422363, "memory(GiB)": 66.07, "reward": 0.4166666865348816, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.4166666567325592, "rewards/FormatCorrectnessReward/std": 0.2886751592159271, "step": 147, "train_speed(iter/s)": 0.019473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1069.0, "completions/mean_length": 668.25, "completions/min_length": 431.0, "epoch": 0.006132681390626942, "grad_norm": 3.5095234570564617, "kl": 0.05120849609375, "learning_rate": 9.866666666666666e-07, "loss": 0.002050866838544607, "memory(GiB)": 66.07, "reward": 1.0416667461395264, "reward_std": 0.49810245633125305, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 148, "train_speed(iter/s)": 0.019503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1179.916748046875, "completions/min_length": 84.0, "epoch": 0.006174118427050098, "grad_norm": 5.90380998299578, "kl": 0.05108642578125, "learning_rate": 9.933333333333333e-07, "loss": 0.02832694910466671, "memory(GiB)": 66.07, "reward": 0.7916666865348816, "reward_std": 0.65568608045578, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.3964807391166687, "step": 149, "train_speed(iter/s)": 0.019302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/mean_length": 578.9166870117188, "completions/min_length": 425.0, "epoch": 0.006215555463473252, "grad_norm": 3.220663619194467, "kl": 0.05413818359375, "learning_rate": 1e-06, "loss": 0.002162789460271597, "memory(GiB)": 66.07, "reward": 0.7916666865348816, "reward_std": 0.3964807391166687, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.625, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 150, "train_speed(iter/s)": 0.019325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 979.0, "completions/mean_length": 703.25, "completions/min_length": 551.0, "epoch": 0.0062569924998964075, "grad_norm": 8.002879382872598, "kl": 0.04925537109375, "learning_rate": 9.999999888111142e-07, "loss": 0.0019697449170053005, "memory(GiB)": 66.07, "reward": 0.8333333730697632, "reward_std": 0.5365433692932129, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.4166666567325592, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 151, "train_speed(iter/s)": 0.01936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/mean_length": 621.5833740234375, "completions/min_length": 417.0, "epoch": 0.006298429536319562, "grad_norm": 2.786744411852846, "kl": 0.05908203125, "learning_rate": 9.99999955244457e-07, "loss": 0.0023594796657562256, "memory(GiB)": 66.07, "reward": 1.4166667461395264, "reward_std": 0.46871843934059143, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.5833333134651184, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 152, "train_speed(iter/s)": 0.019396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/mean_length": 591.9166870117188, "completions/min_length": 439.0, "epoch": 0.006339866572742717, "grad_norm": 3.18138494688907, "kl": 0.05987548828125, "learning_rate": 9.999998993000298e-07, "loss": 0.002396325347945094, "memory(GiB)": 66.07, "reward": 1.5833333730697632, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.5833333134651184, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 153, "train_speed(iter/s)": 0.019433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/mean_length": 609.1666870117188, "completions/min_length": 441.0, "epoch": 0.006381303609165873, "grad_norm": 0.2123631127176238, "kl": 0.06231689453125, "learning_rate": 9.999998209778355e-07, "loss": 0.0024910932406783104, "memory(GiB)": 66.07, "reward": 1.5, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.0, "step": 154, "train_speed(iter/s)": 0.01947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/mean_length": 498.16668701171875, "completions/min_length": 111.0, "epoch": 0.006422740645589027, "grad_norm": 6.619106861992892, "kl": 0.0556640625, "learning_rate": 9.999997202778774e-07, "loss": 0.0022285382729023695, "memory(GiB)": 66.07, "reward": 0.5416666865348816, "reward_std": 0.33427897095680237, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 155, "train_speed(iter/s)": 0.019503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/mean_length": 517.9166870117188, "completions/min_length": 344.0, "epoch": 0.006464177682012183, "grad_norm": 9.131122864950678, "kl": 0.05792236328125, "learning_rate": 9.999995972001601e-07, "loss": 0.0023124616127461195, "memory(GiB)": 66.07, "reward": 1.4583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 156, "train_speed(iter/s)": 0.019536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 836.0, "completions/mean_length": 593.6666870117188, "completions/min_length": 445.0, "epoch": 0.006505614718435337, "grad_norm": 3.3869084182790274, "kl": 0.0546875, "learning_rate": 9.999994517446891e-07, "loss": 0.002190639730542898, "memory(GiB)": 66.07, "reward": 0.6666666865348816, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.0, "step": 157, "train_speed(iter/s)": 0.01957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/mean_length": 649.9166870117188, "completions/min_length": 507.0, "epoch": 0.0065470517548584925, "grad_norm": 3.3319102994820717, "kl": 0.0572509765625, "learning_rate": 9.999992839114706e-07, "loss": 0.0022899708710610867, "memory(GiB)": 66.07, "reward": 1.125, "reward_std": 0.5690901875495911, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.5416666865348816, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 158, "train_speed(iter/s)": 0.019599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/mean_length": 608.9166870117188, "completions/min_length": 229.0, "epoch": 0.006588488791281648, "grad_norm": 11.731643198272518, "kl": 0.08013916015625, "learning_rate": 9.999990937005123e-07, "loss": 0.003205239772796631, "memory(GiB)": 66.07, "reward": 1.0, "reward_std": 0.6396021246910095, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.4166666567325592, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 159, "train_speed(iter/s)": 0.019634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/mean_length": 591.6666870117188, "completions/min_length": 482.0, "epoch": 0.006629925827704802, "grad_norm": 222.4250604962598, "kl": 12.1171875, "learning_rate": 9.99998881111823e-07, "loss": 0.48857176303863525, "memory(GiB)": 66.07, "reward": 1.25, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.0, "step": 160, "train_speed(iter/s)": 0.019672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/mean_length": 526.3333740234375, "completions/min_length": 242.0, "epoch": 0.006671362864127958, "grad_norm": 4.292989533103235, "kl": 0.06561279296875, "learning_rate": 9.999986461454118e-07, "loss": 0.0026299606543034315, "memory(GiB)": 66.07, "reward": 0.4583333432674408, "reward_std": 0.3964807391166687, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.375, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 161, "train_speed(iter/s)": 0.019712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 949.0, "completions/mean_length": 611.0833740234375, "completions/min_length": 41.0, "epoch": 0.006712799900551112, "grad_norm": 11.462087938336312, "kl": 0.0606689453125, "learning_rate": 9.999983888012896e-07, "loss": 0.002428616164252162, "memory(GiB)": 66.07, "reward": 1.125, "reward_std": 0.7111130952835083, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.625, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 162, "train_speed(iter/s)": 0.019747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/mean_length": 546.1666870117188, "completions/min_length": 351.0, "epoch": 0.0067542369369742675, "grad_norm": 3.4314641214989954, "kl": 0.06549072265625, "learning_rate": 9.999981090794675e-07, "loss": 0.002620796440169215, "memory(GiB)": 66.07, "reward": 0.7083333730697632, "reward_std": 0.33427897095680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.7083333134651184, "rewards/FormatCorrectnessReward/std": 0.33427897095680237, "step": 163, "train_speed(iter/s)": 0.019784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 877.0, "completions/mean_length": 591.8333740234375, "completions/min_length": 377.0, "epoch": 0.006795673973397423, "grad_norm": 3.0605337670059507, "kl": 0.06549072265625, "learning_rate": 9.999978069799583e-07, "loss": 0.002618690486997366, "memory(GiB)": 66.07, "reward": 1.4166667461395264, "reward_std": 0.358870267868042, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.5833333134651184, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 164, "train_speed(iter/s)": 0.019808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 899.0, "completions/mean_length": 671.8333740234375, "completions/min_length": 569.0, "epoch": 0.006837111009820577, "grad_norm": 2.7767285571173046, "kl": 0.05743408203125, "learning_rate": 9.999974825027754e-07, "loss": 0.002299189567565918, "memory(GiB)": 66.07, "reward": 0.9166666865348816, "reward_std": 0.5573204159736633, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.5833333134651184, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 165, "train_speed(iter/s)": 0.019842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/mean_length": 627.9166870117188, "completions/min_length": 406.0, "epoch": 0.006878548046243733, "grad_norm": 3.35587433668121, "kl": 0.07281494140625, "learning_rate": 9.999971356479335e-07, "loss": 0.0029074649792164564, "memory(GiB)": 66.07, "reward": 1.75, "reward_std": 0.26111647486686707, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.26111647486686707, "step": 166, "train_speed(iter/s)": 0.019869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/mean_length": 625.25, "completions/min_length": 449.0, "epoch": 0.006919985082666887, "grad_norm": 10.494392225743324, "kl": 0.0950927734375, "learning_rate": 9.99996766415448e-07, "loss": 0.0038180947303771973, "memory(GiB)": 66.07, "reward": 0.75, "reward_std": 0.33709993958473206, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.6666666865348816, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 167, "train_speed(iter/s)": 0.019906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 989.0, "completions/mean_length": 697.9166870117188, "completions/min_length": 528.0, "epoch": 0.006961422119090043, "grad_norm": 3.4582721452078253, "kl": 0.0582275390625, "learning_rate": 9.999963748053354e-07, "loss": 0.0023281178437173367, "memory(GiB)": 66.07, "reward": 1.4583333730697632, "reward_std": 0.33427897095680237, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.5416666865348816, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 168, "train_speed(iter/s)": 0.019931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/mean_length": 586.4166870117188, "completions/min_length": 226.0, "epoch": 0.007002859155513198, "grad_norm": 4.010762268848291, "kl": 0.07611083984375, "learning_rate": 9.999959608176129e-07, "loss": 0.003046751022338867, "memory(GiB)": 66.07, "reward": 1.2916667461395264, "reward_std": 0.5418123602867126, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 169, "train_speed(iter/s)": 0.019962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1687.0, "completions/mean_length": 713.0833740234375, "completions/min_length": 469.0, "epoch": 0.0070442961919363525, "grad_norm": 2.6650884607708476, "kl": 0.062255859375, "learning_rate": 9.999955244522997e-07, "loss": 0.0024838647805154324, "memory(GiB)": 66.07, "reward": 1.5, "reward_std": 0.36927446722984314, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.5833333134651184, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 170, "train_speed(iter/s)": 0.019967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/mean_length": 501.8333435058594, "completions/min_length": 182.0, "epoch": 0.007085733228359508, "grad_norm": 5.775121526685233, "kl": 0.0712890625, "learning_rate": 9.99995065709415e-07, "loss": 0.002840315457433462, "memory(GiB)": 66.07, "reward": 0.5416666865348816, "reward_std": 0.33427897095680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.5416666865348816, "rewards/FormatCorrectnessReward/std": 0.33427897095680237, "step": 171, "train_speed(iter/s)": 0.020003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/mean_length": 557.75, "completions/min_length": 398.0, "epoch": 0.007127170264782662, "grad_norm": 3.3131463893504707, "kl": 0.06097412109375, "learning_rate": 9.999945845889793e-07, "loss": 0.0024370155297219753, "memory(GiB)": 66.07, "reward": 1.5833333730697632, "reward_std": 0.4174235463142395, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.6666666865348816, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 172, "train_speed(iter/s)": 0.02004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 769.0, "completions/mean_length": 587.3333740234375, "completions/min_length": 372.0, "epoch": 0.007168607301205818, "grad_norm": 3.110284736683695, "kl": 0.0631103515625, "learning_rate": 9.99994081091014e-07, "loss": 0.0025216341018676758, "memory(GiB)": 66.07, "reward": 1.75, "reward_std": 0.26111647486686707, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.26111647486686707, "step": 173, "train_speed(iter/s)": 0.020076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/mean_length": 616.5, "completions/min_length": 429.0, "epoch": 0.007210044337628973, "grad_norm": 283.0645330155246, "kl": 0.98321533203125, "learning_rate": 9.999935552155421e-07, "loss": 0.03933355212211609, "memory(GiB)": 66.07, "reward": 0.9583333730697632, "reward_std": 0.5418123602867126, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 174, "train_speed(iter/s)": 0.020107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/mean_length": 602.0833740234375, "completions/min_length": 440.0, "epoch": 0.0072514813740521275, "grad_norm": 2.9626787994516794, "kl": 0.066650390625, "learning_rate": 9.999930069625869e-07, "loss": 0.0026644866447895765, "memory(GiB)": 66.07, "reward": 1.4166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.0, "step": 175, "train_speed(iter/s)": 0.020129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/mean_length": 558.5, "completions/min_length": 154.0, "epoch": 0.007292918410475283, "grad_norm": 5.233694457306199, "kl": 0.05926513671875, "learning_rate": 9.999924363321725e-07, "loss": 0.002368450164794922, "memory(GiB)": 66.07, "reward": 1.3333333730697632, "reward_std": 0.5365433692932129, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.21320071816444397, "step": 176, "train_speed(iter/s)": 0.020158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/mean_length": 637.3333740234375, "completions/min_length": 399.0, "epoch": 0.007334355446898437, "grad_norm": 2.687215156654027, "kl": 0.060791015625, "learning_rate": 9.99991843324325e-07, "loss": 0.002433618064969778, "memory(GiB)": 66.07, "reward": 0.5416666865348816, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.5416666865348816, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 177, "train_speed(iter/s)": 0.020188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/mean_length": 638.1666870117188, "completions/min_length": 418.0, "epoch": 0.007375792483321593, "grad_norm": 3.2101212266441093, "kl": 0.05816650390625, "learning_rate": 9.999912279390707e-07, "loss": 0.0023260614834725857, "memory(GiB)": 66.07, "reward": 1.5833333730697632, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.5833333134651184, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 178, "train_speed(iter/s)": 0.020198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/mean_length": 655.4166870117188, "completions/min_length": 556.0, "epoch": 0.007417229519744748, "grad_norm": 2.7415577653002674, "kl": 0.05218505859375, "learning_rate": 9.999905901764373e-07, "loss": 0.0020854275207966566, "memory(GiB)": 66.07, "reward": 1.2083333730697632, "reward_std": 0.5418123602867126, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.4583333432674408, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 179, "train_speed(iter/s)": 0.02023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 926.0, "completions/mean_length": 714.25, "completions/min_length": 450.0, "epoch": 0.007458666556167903, "grad_norm": 2.8419480407449247, "kl": 0.0556640625, "learning_rate": 9.999899300364532e-07, "loss": 0.0022243461571633816, "memory(GiB)": 66.07, "reward": 1.6666667461395264, "reward_std": 0.24618299305438995, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.6666666865348816, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 180, "train_speed(iter/s)": 0.020255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 879.0, "completions/mean_length": 739.75, "completions/min_length": 594.0, "epoch": 0.007500103592591058, "grad_norm": 2.991416122081783, "kl": 0.04913330078125, "learning_rate": 9.99989247519148e-07, "loss": 0.0019610426388680935, "memory(GiB)": 66.07, "reward": 1.0, "reward_std": 0.42640143632888794, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.6666666865348816, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 181, "train_speed(iter/s)": 0.02028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/mean_length": 621.25, "completions/min_length": 424.0, "epoch": 0.007541540629014213, "grad_norm": 2.959455194186519, "kl": 0.05316162109375, "learning_rate": 9.999885426245522e-07, "loss": 0.002126365900039673, "memory(GiB)": 66.07, "reward": 1.125, "reward_std": 0.4330126941204071, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.625, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 182, "train_speed(iter/s)": 0.020311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1082.0, "completions/mean_length": 658.75, "completions/min_length": 450.0, "epoch": 0.007582977665437368, "grad_norm": 2.949831578085604, "kl": 0.0535888671875, "learning_rate": 9.999878153526972e-07, "loss": 0.0021429858170449734, "memory(GiB)": 66.07, "reward": 0.6666666865348816, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.0, "step": 183, "train_speed(iter/s)": 0.020334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/mean_length": 738.6666870117188, "completions/min_length": 635.0, "epoch": 0.007624414701860523, "grad_norm": 2.896459623195631, "kl": 0.0526123046875, "learning_rate": 9.99987065703616e-07, "loss": 0.0021095573902130127, "memory(GiB)": 66.07, "reward": 1.25, "reward_std": 0.5838742256164551, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.26111647486686707, "step": 184, "train_speed(iter/s)": 0.020355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1038.0, "completions/mean_length": 662.9166870117188, "completions/min_length": 388.0, "epoch": 0.007665851738283678, "grad_norm": 3.254110488489087, "kl": 0.0784912109375, "learning_rate": 9.999862936773419e-07, "loss": 0.0031400572042912245, "memory(GiB)": 66.07, "reward": 1.7083333730697632, "reward_std": 0.25746431946754456, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.7083333134651184, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 185, "train_speed(iter/s)": 0.020376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/mean_length": 725.5833740234375, "completions/min_length": 506.0, "epoch": 0.007707288774706833, "grad_norm": 2.5205602956789313, "kl": 0.04974365234375, "learning_rate": 9.999854992739093e-07, "loss": 0.0019872388802468777, "memory(GiB)": 66.07, "reward": 0.8333333730697632, "reward_std": 0.4923659861087799, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.6666666865348816, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 186, "train_speed(iter/s)": 0.020401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/mean_length": 551.75, "completions/min_length": 428.0, "epoch": 0.007748725811129988, "grad_norm": 3.5884273779348983, "kl": 0.06280517578125, "learning_rate": 9.999846824933538e-07, "loss": 0.0025152366142719984, "memory(GiB)": 66.07, "reward": 1.7916667461395264, "reward_std": 0.25746431946754456, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 187, "train_speed(iter/s)": 0.020434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/mean_length": 663.3333740234375, "completions/min_length": 569.0, "epoch": 0.007790162847553143, "grad_norm": 3.0697840989857403, "kl": 0.071044921875, "learning_rate": 9.99983843335712e-07, "loss": 0.002841780660673976, "memory(GiB)": 66.07, "reward": 0.9166666865348816, "reward_std": 0.5967081785202026, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.6666666865348816, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 188, "train_speed(iter/s)": 0.020464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 957.0, "completions/mean_length": 678.75, "completions/min_length": 449.0, "epoch": 0.007831599883976298, "grad_norm": 3.2566420488653782, "kl": 0.04034423828125, "learning_rate": 9.999829818010219e-07, "loss": 0.0016138553619384766, "memory(GiB)": 66.07, "reward": 0.875, "reward_std": 0.4826536476612091, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.5416666865348816, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 189, "train_speed(iter/s)": 0.020491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 790.0, "completions/mean_length": 612.25, "completions/min_length": 475.0, "epoch": 0.007873036920399453, "grad_norm": 3.000852777612432, "kl": 0.05877685546875, "learning_rate": 9.999820978893214e-07, "loss": 0.0023524067364633083, "memory(GiB)": 66.07, "reward": 1.5833333730697632, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.5833333134651184, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 190, "train_speed(iter/s)": 0.020519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/mean_length": 574.75, "completions/min_length": 417.0, "epoch": 0.007914473956822607, "grad_norm": 4.382578617196156, "kl": 0.059814453125, "learning_rate": 9.999811916006504e-07, "loss": 0.0023967623710632324, "memory(GiB)": 66.07, "reward": 1.9166667461395264, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 191, "train_speed(iter/s)": 0.020547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1062.0, "completions/mean_length": 637.0, "completions/min_length": 433.0, "epoch": 0.007955910993245763, "grad_norm": 3.0260733015928603, "kl": 0.05572509765625, "learning_rate": 9.999802629350491e-07, "loss": 0.00222855806350708, "memory(GiB)": 66.07, "reward": 1.375, "reward_std": 0.5276448726654053, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.625, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 192, "train_speed(iter/s)": 0.020569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/mean_length": 586.8333740234375, "completions/min_length": 412.0, "epoch": 0.007997348029668918, "grad_norm": 4.292931549724434, "kl": 0.0560302734375, "learning_rate": 9.999793118925597e-07, "loss": 0.0022392570972442627, "memory(GiB)": 66.07, "reward": 1.7083333730697632, "reward_std": 0.5822500586509705, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.33427897095680237, "step": 193, "train_speed(iter/s)": 0.020597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/mean_length": 604.6666870117188, "completions/min_length": 445.0, "epoch": 0.008038785066092072, "grad_norm": 36.402505268693425, "kl": 0.111083984375, "learning_rate": 9.999783384732241e-07, "loss": 0.0044366889633238316, "memory(GiB)": 66.07, "reward": 1.5, "reward_std": 0.36927446722984314, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.5833333134651184, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 194, "train_speed(iter/s)": 0.020625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/mean_length": 656.0, "completions/min_length": 506.0, "epoch": 0.008080222102515229, "grad_norm": 4.455930255123107, "kl": 0.0523681640625, "learning_rate": 9.999773426770863e-07, "loss": 0.0020991265773773193, "memory(GiB)": 66.07, "reward": 1.2083333730697632, "reward_std": 0.6200562119483948, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.7083333134651184, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 195, "train_speed(iter/s)": 0.020651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 978.0, "completions/mean_length": 731.25, "completions/min_length": 538.0, "epoch": 0.008121659138938383, "grad_norm": 3.6626341402079636, "kl": 0.05548095703125, "learning_rate": 9.999763245041907e-07, "loss": 0.0022167563438415527, "memory(GiB)": 66.07, "reward": 1.625, "reward_std": 0.22613351047039032, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.625, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 196, "train_speed(iter/s)": 0.020675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/mean_length": 568.8333740234375, "completions/min_length": 387.0, "epoch": 0.008163096175361538, "grad_norm": 3.2070085923194016, "kl": 0.05426025390625, "learning_rate": 9.999752839545832e-07, "loss": 0.002167080994695425, "memory(GiB)": 66.07, "reward": 1.7916667461395264, "reward_std": 0.25746431946754456, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 197, "train_speed(iter/s)": 0.020706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/mean_length": 559.0833740234375, "completions/min_length": 454.0, "epoch": 0.008204533211784694, "grad_norm": 5.020088302548566, "kl": 0.061767578125, "learning_rate": 9.999742210283097e-07, "loss": 0.0024712681770324707, "memory(GiB)": 66.07, "reward": 0.5416666865348816, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.5416666865348816, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 198, "train_speed(iter/s)": 0.020733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1034.0, "completions/mean_length": 682.9166870117188, "completions/min_length": 421.0, "epoch": 0.008245970248207848, "grad_norm": 3.102095977332481, "kl": 0.04632568359375, "learning_rate": 9.999731357254185e-07, "loss": 0.001848201034590602, "memory(GiB)": 66.07, "reward": 1.7916667461395264, "reward_std": 0.25746431946754456, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 199, "train_speed(iter/s)": 0.02075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/mean_length": 611.75, "completions/min_length": 449.0, "epoch": 0.008287407284631003, "grad_norm": 2.952313752574326, "kl": 0.05218505859375, "learning_rate": 9.999720280459574e-07, "loss": 0.0020858347415924072, "memory(GiB)": 66.07, "reward": 1.25, "reward_std": 0.6215815544128418, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.6666666865348816, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 200, "train_speed(iter/s)": 0.020775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 882.0, "completions/mean_length": 629.6666870117188, "completions/min_length": 435.0, "epoch": 0.008328844321054157, "grad_norm": 2.868250652563551, "kl": 0.05279541015625, "learning_rate": 9.999708979899767e-07, "loss": 0.0021128058433532715, "memory(GiB)": 66.07, "reward": 0.7916666865348816, "reward_std": 0.3964807391166687, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.625, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 201, "train_speed(iter/s)": 0.020799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/mean_length": 704.5833740234375, "completions/min_length": 513.0, "epoch": 0.008370281357477314, "grad_norm": 2.4917125074216115, "kl": 0.0496826171875, "learning_rate": 9.999697455575266e-07, "loss": 0.0019886395893990993, "memory(GiB)": 66.07, "reward": 1.7083333730697632, "reward_std": 0.25746431946754456, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.7083333134651184, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 202, "train_speed(iter/s)": 0.020821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1152.0, "completions/mean_length": 648.9166870117188, "completions/min_length": 488.0, "epoch": 0.008411718393900468, "grad_norm": 3.9822199922475097, "kl": 0.05548095703125, "learning_rate": 9.999685707486586e-07, "loss": 0.002224961994215846, "memory(GiB)": 66.07, "reward": 1.4583333730697632, "reward_std": 0.5822500586509705, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 203, "train_speed(iter/s)": 0.020839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/mean_length": 577.25, "completions/min_length": 404.0, "epoch": 0.008453155430323623, "grad_norm": 2.9002007314608065, "kl": 0.0521240234375, "learning_rate": 9.999673735634259e-07, "loss": 0.0020801625214517117, "memory(GiB)": 66.07, "reward": 1.5, "reward_std": 0.5222329497337341, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.6666666865348816, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 204, "train_speed(iter/s)": 0.020864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/mean_length": 676.8333740234375, "completions/min_length": 417.0, "epoch": 0.008494592466746779, "grad_norm": 3.0331199542325633, "kl": 0.0587158203125, "learning_rate": 9.99966154001881e-07, "loss": 0.00234435498714447, "memory(GiB)": 66.07, "reward": 0.7083333730697632, "reward_std": 0.25746431946754456, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.7083333134651184, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 205, "train_speed(iter/s)": 0.020886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1037.0, "completions/mean_length": 676.25, "completions/min_length": 486.0, "epoch": 0.008536029503169933, "grad_norm": 2.997031185458577, "kl": 0.0513916015625, "learning_rate": 9.999649120640795e-07, "loss": 0.0020557641983032227, "memory(GiB)": 66.07, "reward": 0.9583333730697632, "reward_std": 0.5418123602867126, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.625, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 206, "train_speed(iter/s)": 0.020902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/mean_length": 663.1666870117188, "completions/min_length": 406.0, "epoch": 0.008577466539593088, "grad_norm": 0.30298050130768894, "kl": 0.054931640625, "learning_rate": 9.999636477500764e-07, "loss": 0.002196083776652813, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 207, "train_speed(iter/s)": 0.020923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/mean_length": 689.75, "completions/min_length": 415.0, "epoch": 0.008618903576016244, "grad_norm": 3.995002850590541, "kl": 0.058837890625, "learning_rate": 9.999623610599285e-07, "loss": 0.0023568670731037855, "memory(GiB)": 66.07, "reward": 1.75, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 208, "train_speed(iter/s)": 0.020942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1067.0, "completions/mean_length": 762.25, "completions/min_length": 585.0, "epoch": 0.008660340612439399, "grad_norm": 2.6242806631980056, "kl": 0.0511474609375, "learning_rate": 9.999610519936933e-07, "loss": 0.002051542280241847, "memory(GiB)": 66.07, "reward": 1.5, "reward_std": 0.36927446722984314, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.26111647486686707, "step": 209, "train_speed(iter/s)": 0.020956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 947.0, "completions/mean_length": 722.8333740234375, "completions/min_length": 462.0, "epoch": 0.008701777648862553, "grad_norm": 4.834407519955498, "kl": 0.06591796875, "learning_rate": 9.999597205514296e-07, "loss": 0.0026440322399139404, "memory(GiB)": 66.07, "reward": 1.375, "reward_std": 0.6077155470848083, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.625, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 210, "train_speed(iter/s)": 0.020974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/mean_length": 710.75, "completions/min_length": 499.0, "epoch": 0.008743214685285708, "grad_norm": 11.006496106260913, "kl": 0.06219482421875, "learning_rate": 9.999583667331967e-07, "loss": 0.002485990524291992, "memory(GiB)": 66.07, "reward": 0.7083333730697632, "reward_std": 0.5418123602867126, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.625, "rewards/FormatCorrectnessReward/std": 0.376889169216156, "step": 211, "train_speed(iter/s)": 0.020986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/mean_length": 531.75, "completions/min_length": 354.0, "epoch": 0.008784651721708864, "grad_norm": 0.2837972869009336, "kl": 0.0618896484375, "learning_rate": 9.999569905390554e-07, "loss": 0.0024735350161790848, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 212, "train_speed(iter/s)": 0.021015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/mean_length": 558.3333740234375, "completions/min_length": 405.0, "epoch": 0.008826088758132018, "grad_norm": 3.1106781405361223, "kl": 0.06005859375, "learning_rate": 9.999555919690672e-07, "loss": 0.0023975372314453125, "memory(GiB)": 66.07, "reward": 1.2916667461395264, "reward_std": 0.5418123602867126, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 213, "train_speed(iter/s)": 0.021044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1292.25, "completions/min_length": 224.0, "epoch": 0.008867525794555173, "grad_norm": 3.8543163889815752, "kl": 0.06622314453125, "learning_rate": 9.999541710232945e-07, "loss": -0.03809094429016113, "memory(GiB)": 66.07, "reward": 1.3333333730697632, "reward_std": 0.685344398021698, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.36927446722984314, "step": 214, "train_speed(iter/s)": 0.020873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 854.0, "completions/mean_length": 591.0833740234375, "completions/min_length": 383.0, "epoch": 0.008908962830978329, "grad_norm": 3.8075710244501084, "kl": 0.05596923828125, "learning_rate": 9.999527277018014e-07, "loss": 0.0022429227828979492, "memory(GiB)": 66.07, "reward": 1.2083333730697632, "reward_std": 0.5418123602867126, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.7083333134651184, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 215, "train_speed(iter/s)": 0.020896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1081.0, "completions/mean_length": 770.75, "completions/min_length": 481.0, "epoch": 0.008950399867401483, "grad_norm": 2.8035191986369985, "kl": 0.0498046875, "learning_rate": 9.99951262004652e-07, "loss": 0.001993169542402029, "memory(GiB)": 66.07, "reward": 0.9166666865348816, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 216, "train_speed(iter/s)": 0.02091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/mean_length": 616.75, "completions/min_length": 459.0, "epoch": 0.008991836903824638, "grad_norm": 3.0434437942258414, "kl": 0.07501220703125, "learning_rate": 9.999497739319122e-07, "loss": 0.0030018589459359646, "memory(GiB)": 66.07, "reward": 0.7916666865348816, "reward_std": 0.25746431946754456, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 217, "train_speed(iter/s)": 0.020932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 927.0, "completions/mean_length": 703.75, "completions/min_length": 467.0, "epoch": 0.009033273940247794, "grad_norm": 2.8436107747147963, "kl": 0.0546875, "learning_rate": 9.999482634836484e-07, "loss": 0.002193450927734375, "memory(GiB)": 66.07, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 218, "train_speed(iter/s)": 0.020949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1080.0, "completions/mean_length": 814.25, "completions/min_length": 523.0, "epoch": 0.009074710976670949, "grad_norm": 5.751022616029888, "kl": 0.062255859375, "learning_rate": 9.999467306599285e-07, "loss": 0.0024858564138412476, "memory(GiB)": 66.07, "reward": 1.5, "reward_std": 0.6030226945877075, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.33709993958473206, "step": 219, "train_speed(iter/s)": 0.020962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1164.0, "completions/mean_length": 789.75, "completions/min_length": 528.0, "epoch": 0.009116148013094103, "grad_norm": 3.644582757415833, "kl": 0.06842041015625, "learning_rate": 9.999451754608207e-07, "loss": 0.0027417342644184828, "memory(GiB)": 66.07, "reward": 1.125, "reward_std": 0.7423856258392334, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.7083333134651184, "rewards/FormatCorrectnessReward/std": 0.33427897095680237, "step": 220, "train_speed(iter/s)": 0.020973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1075.0, "completions/mean_length": 670.75, "completions/min_length": 408.0, "epoch": 0.009157585049517258, "grad_norm": 5.444295741101184, "kl": 0.0809326171875, "learning_rate": 9.999435978863948e-07, "loss": 0.003231366630643606, "memory(GiB)": 66.07, "reward": 0.875, "reward_std": 0.22613351047039032, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 221, "train_speed(iter/s)": 0.02099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1176.0, "completions/mean_length": 656.1666870117188, "completions/min_length": 541.0, "epoch": 0.009199022085940414, "grad_norm": 3.4203340497035852, "kl": 0.072265625, "learning_rate": 9.999419979367214e-07, "loss": 0.0028855502605438232, "memory(GiB)": 66.07, "reward": 1.7916667461395264, "reward_std": 0.33427897095680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.33427897095680237, "step": 222, "train_speed(iter/s)": 0.021006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 907.0, "completions/mean_length": 696.0833740234375, "completions/min_length": 582.0, "epoch": 0.009240459122363568, "grad_norm": 3.248509546784264, "kl": 0.0533447265625, "learning_rate": 9.999403756118722e-07, "loss": 0.002133747097104788, "memory(GiB)": 66.07, "reward": 1.2083333730697632, "reward_std": 0.49810245633125305, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 223, "train_speed(iter/s)": 0.021027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/mean_length": 629.5, "completions/min_length": 447.0, "epoch": 0.009281896158786723, "grad_norm": 19.874564199777495, "kl": 0.1119384765625, "learning_rate": 9.999387309119195e-07, "loss": 0.0044897994957864285, "memory(GiB)": 66.07, "reward": 0.875, "reward_std": 0.4330126941204071, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 224, "train_speed(iter/s)": 0.021045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/mean_length": 696.8333740234375, "completions/min_length": 527.0, "epoch": 0.009323333195209879, "grad_norm": 0.3902213211896919, "kl": 0.06317138671875, "learning_rate": 9.999370638369376e-07, "loss": 0.0025332835502922535, "memory(GiB)": 66.07, "reward": 0.5, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.0, "step": 225, "train_speed(iter/s)": 0.021066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 868.0, "completions/mean_length": 651.0833740234375, "completions/min_length": 450.0, "epoch": 0.009364770231633034, "grad_norm": 0.6895501693906639, "kl": 0.062255859375, "learning_rate": 9.999353743870002e-07, "loss": 0.0024912075605243444, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 226, "train_speed(iter/s)": 0.021084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/mean_length": 532.5, "completions/min_length": 348.0, "epoch": 0.009406207268056188, "grad_norm": 104.92408085312178, "kl": 0.20013427734375, "learning_rate": 9.999336625621835e-07, "loss": 0.00801318883895874, "memory(GiB)": 66.07, "reward": 1.2916667461395264, "reward_std": 0.49810245633125305, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.5416666865348816, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 227, "train_speed(iter/s)": 0.021111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 7999.0, "completions/mean_length": 1745.416748046875, "completions/min_length": 435.0, "epoch": 0.009447644304479344, "grad_norm": 8.387504762475642, "kl": 0.082275390625, "learning_rate": 9.99931928362564e-07, "loss": -0.15779229998588562, "memory(GiB)": 66.07, "reward": 1.4166667461395264, "reward_std": 0.7334021925926208, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.5833333134651184, "rewards/FormatCorrectnessReward/std": 0.4174235761165619, "step": 228, "train_speed(iter/s)": 0.02094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/mean_length": 517.5, "completions/min_length": 350.0, "epoch": 0.009489081340902499, "grad_norm": 4.958817448238141, "kl": 0.0850830078125, "learning_rate": 9.999301717882192e-07, "loss": 0.0033977627754211426, "memory(GiB)": 66.07, "reward": 0.875, "reward_std": 0.22613351047039032, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 229, "train_speed(iter/s)": 0.020962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 8001.0, "completions/mean_length": 1851.666748046875, "completions/min_length": 222.0, "epoch": 0.009530518377325653, "grad_norm": 8.857038728853224, "kl": 0.162353515625, "learning_rate": 9.99928392839228e-07, "loss": -0.25362735986709595, "memory(GiB)": 66.07, "reward": 1.1666667461395264, "reward_std": 0.7487363219261169, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.4166666567325592, "rewards/FormatCorrectnessReward/std": 0.3588702976703644, "step": 230, "train_speed(iter/s)": 0.020792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 8001.0, "completions/mean_length": 2151.916748046875, "completions/min_length": 549.0, "epoch": 0.009571955413748808, "grad_norm": 2.2699121465303373, "kl": 0.05828857421875, "learning_rate": 9.999265915156696e-07, "loss": -0.28296715021133423, "memory(GiB)": 66.07, "reward": 1.5833333730697632, "reward_std": 0.6336522102355957, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.6666666865348816, "rewards/FormatCorrectnessReward/std": 0.44381269812583923, "step": 231, "train_speed(iter/s)": 0.020627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2148.0, "completions/mean_length": 765.75, "completions/min_length": 404.0, "epoch": 0.009613392450171964, "grad_norm": 3.8461719345423138, "kl": 0.0654296875, "learning_rate": 9.999247678176247e-07, "loss": 0.002620140789076686, "memory(GiB)": 66.07, "reward": 1.7916667461395264, "reward_std": 0.33427897095680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.33427897095680237, "step": 232, "train_speed(iter/s)": 0.020615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/mean_length": 681.75, "completions/min_length": 395.0, "epoch": 0.009654829486595119, "grad_norm": 3.3129866805694017, "kl": 0.05279541015625, "learning_rate": 9.999229217451755e-07, "loss": 0.0021167248487472534, "memory(GiB)": 66.07, "reward": 1.7916667461395264, "reward_std": 0.25746431946754456, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 233, "train_speed(iter/s)": 0.020637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1232.5, "completions/min_length": 471.0, "epoch": 0.009696266523018273, "grad_norm": 2.861599424421964, "kl": 0.083251953125, "learning_rate": 9.999210532984038e-07, "loss": -0.14434245228767395, "memory(GiB)": 66.07, "reward": 0.9583333730697632, "reward_std": 0.5418123602867126, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.7083333134651184, "rewards/FormatCorrectnessReward/std": 0.33427897095680237, "step": 234, "train_speed(iter/s)": 0.02049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1068.0, "completions/mean_length": 638.0833740234375, "completions/min_length": 386.0, "epoch": 0.00973770355944143, "grad_norm": 3.7051750333109497, "kl": 0.0986328125, "learning_rate": 9.999191624773937e-07, "loss": 0.003945480100810528, "memory(GiB)": 66.07, "reward": 1.0416667461395264, "reward_std": 0.6200562119483948, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.5416666865348816, "rewards/FormatCorrectnessReward/std": 0.3964807391166687, "step": 235, "train_speed(iter/s)": 0.020506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3775.0, "completions/mean_length": 915.0, "completions/min_length": 252.0, "epoch": 0.009779140595864584, "grad_norm": 2.883768266304682, "kl": 0.0703125, "learning_rate": 9.999172492822298e-07, "loss": 0.0028102300129830837, "memory(GiB)": 66.07, "reward": 1.4166667461395264, "reward_std": 0.7334021925926208, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.5833333134651184, "rewards/FormatCorrectnessReward/std": 0.4174235761165619, "step": 236, "train_speed(iter/s)": 0.020458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1003.0, "completions/mean_length": 648.0833740234375, "completions/min_length": 384.0, "epoch": 0.009820577632287738, "grad_norm": 3.5666719697792124, "kl": 0.07379150390625, "learning_rate": 9.999153137129977e-07, "loss": 0.0029551635961979628, "memory(GiB)": 66.07, "reward": 0.75, "reward_std": 0.26111647486686707, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.26111647486686707, "step": 237, "train_speed(iter/s)": 0.020473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/mean_length": 617.8333740234375, "completions/min_length": 381.0, "epoch": 0.009862014668710894, "grad_norm": 2.70809937947441, "kl": 0.0804443359375, "learning_rate": 9.999133557697838e-07, "loss": 0.003221015213057399, "memory(GiB)": 66.07, "reward": 1.8333333730697632, "reward_std": 0.24618299305438995, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 238, "train_speed(iter/s)": 0.020493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1078.0, "completions/mean_length": 655.0, "completions/min_length": 469.0, "epoch": 0.009903451705134049, "grad_norm": 3.247006800717422, "kl": 0.0924072265625, "learning_rate": 9.99911375452676e-07, "loss": 0.0036928951740264893, "memory(GiB)": 66.07, "reward": 1.8333333730697632, "reward_std": 0.24618299305438995, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 239, "train_speed(iter/s)": 0.02051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/mean_length": 731.5, "completions/min_length": 447.0, "epoch": 0.009944888741557203, "grad_norm": 2.4168198010710014, "kl": 0.0670166015625, "learning_rate": 9.999093727617628e-07, "loss": 0.002683242317289114, "memory(GiB)": 66.07, "reward": 1.125, "reward_std": 0.4826536476612091, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 240, "train_speed(iter/s)": 0.020527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/mean_length": 545.5833740234375, "completions/min_length": 402.0, "epoch": 0.00998632577798036, "grad_norm": 3.3306249101398384, "kl": 0.0958251953125, "learning_rate": 9.99907347697134e-07, "loss": 0.0038439035415649414, "memory(GiB)": 66.07, "reward": 1.2916667461395264, "reward_std": 0.6200562119483948, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.7083333134651184, "rewards/FormatCorrectnessReward/std": 0.33427897095680237, "step": 241, "train_speed(iter/s)": 0.020553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 850.0, "completions/mean_length": 630.6666870117188, "completions/min_length": 384.0, "epoch": 0.010027762814403514, "grad_norm": 3.2279535060911186, "kl": 0.0968017578125, "learning_rate": 9.9990530025888e-07, "loss": 0.0038701496087014675, "memory(GiB)": 66.07, "reward": 0.875, "reward_std": 0.3107907772064209, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 242, "train_speed(iter/s)": 0.020572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/mean_length": 538.1666870117188, "completions/min_length": 407.0, "epoch": 0.010069199850826669, "grad_norm": 3.3227556177263584, "kl": 0.105712890625, "learning_rate": 9.999032304470924e-07, "loss": 0.004231264349073172, "memory(GiB)": 66.07, "reward": 1.7916667461395264, "reward_std": 0.5822500586509705, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 243, "train_speed(iter/s)": 0.020598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/mean_length": 618.25, "completions/min_length": 446.0, "epoch": 0.010110636887249823, "grad_norm": 3.018176879549886, "kl": 0.1162109375, "learning_rate": 9.999011382618643e-07, "loss": 0.004645109176635742, "memory(GiB)": 66.07, "reward": 1.375, "reward_std": 0.4826536476612091, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.625, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 244, "train_speed(iter/s)": 0.02062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1076.0, "completions/mean_length": 666.9166870117188, "completions/min_length": 435.0, "epoch": 0.01015207392367298, "grad_norm": 2.8941032161245372, "kl": 0.1103515625, "learning_rate": 9.998990237032888e-07, "loss": 0.004410773515701294, "memory(GiB)": 66.07, "reward": 1.0833333730697632, "reward_std": 0.5967081785202026, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 245, "train_speed(iter/s)": 0.020633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/mean_length": 538.6666870117188, "completions/min_length": 446.0, "epoch": 0.010193510960096134, "grad_norm": 3.124389455299957, "kl": 0.1248779296875, "learning_rate": 9.998968867714608e-07, "loss": 0.004994511604309082, "memory(GiB)": 66.07, "reward": 1.7083333730697632, "reward_std": 0.25746431946754456, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.7083333134651184, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 246, "train_speed(iter/s)": 0.020654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1342.916748046875, "completions/min_length": 544.0, "epoch": 0.010234947996519288, "grad_norm": 34.57410644690216, "kl": 0.4422607421875, "learning_rate": 9.998947274664756e-07, "loss": -0.14984726905822754, "memory(GiB)": 66.07, "reward": 1.5, "reward_std": 0.7687060832977295, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.39886200428009033, "step": 247, "train_speed(iter/s)": 0.020511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/mean_length": 589.8333740234375, "completions/min_length": 412.0, "epoch": 0.010276385032942445, "grad_norm": 3.325324477866853, "kl": 0.1279296875, "learning_rate": 9.998925457884305e-07, "loss": 0.0051179928705096245, "memory(GiB)": 66.07, "reward": 1.7916667461395264, "reward_std": 0.25746431946754456, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 248, "train_speed(iter/s)": 0.020532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 877.0, "completions/mean_length": 695.9166870117188, "completions/min_length": 542.0, "epoch": 0.0103178220693656, "grad_norm": 2.568026862781445, "kl": 0.0904541015625, "learning_rate": 9.998903417374226e-07, "loss": 0.0036126128397881985, "memory(GiB)": 66.07, "reward": 1.6666667461395264, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.33709993958473206, "step": 249, "train_speed(iter/s)": 0.020547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 957.0, "completions/mean_length": 744.9166870117188, "completions/min_length": 593.0, "epoch": 0.010359259105788754, "grad_norm": 2.6395037787454707, "kl": 0.090087890625, "learning_rate": 9.99888115313551e-07, "loss": 0.0036038707476109266, "memory(GiB)": 66.07, "reward": 1.375, "reward_std": 0.4826536476612091, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 250, "train_speed(iter/s)": 0.020561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 930.0, "completions/mean_length": 651.0, "completions/min_length": 151.0, "epoch": 0.01040069614221191, "grad_norm": 10.19911879954258, "kl": 0.099609375, "learning_rate": 9.998858665169147e-07, "loss": 0.003980954643338919, "memory(GiB)": 66.07, "reward": 0.7083333730697632, "reward_std": 0.33427897095680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.7083333134651184, "rewards/FormatCorrectnessReward/std": 0.33427897095680237, "step": 251, "train_speed(iter/s)": 0.020572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 949.0, "completions/mean_length": 688.5, "completions/min_length": 533.0, "epoch": 0.010442133178635064, "grad_norm": 2.718349009884117, "kl": 0.08740234375, "learning_rate": 9.998835953476147e-07, "loss": 0.003495335578918457, "memory(GiB)": 66.07, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 252, "train_speed(iter/s)": 0.020587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 735.0, "completions/mean_length": 574.9166870117188, "completions/min_length": 434.0, "epoch": 0.010483570215058219, "grad_norm": 3.0522870408587797, "kl": 0.1094970703125, "learning_rate": 9.99881301805753e-07, "loss": 0.00437192153185606, "memory(GiB)": 66.07, "reward": 1.5833333730697632, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 253, "train_speed(iter/s)": 0.020609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/mean_length": 555.1666870117188, "completions/min_length": 416.0, "epoch": 0.010525007251481373, "grad_norm": 3.5779867046799336, "kl": 0.1329345703125, "learning_rate": 9.998789858914315e-07, "loss": 0.005320767872035503, "memory(GiB)": 66.07, "reward": 1.6666667461395264, "reward_std": 0.5365433692932129, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 254, "train_speed(iter/s)": 0.02063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1247.3333740234375, "completions/min_length": 413.0, "epoch": 0.01056644428790453, "grad_norm": 1.9608613544465534, "kl": 0.1177978515625, "learning_rate": 9.998766476047545e-07, "loss": -0.2301919162273407, "memory(GiB)": 66.07, "reward": 1.75, "reward_std": 0.6215815544128418, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3892494738101959, "step": 255, "train_speed(iter/s)": 0.020494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/mean_length": 525.5833740234375, "completions/min_length": 33.0, "epoch": 0.010607881324327684, "grad_norm": 7.431908230375418, "kl": 0.117431640625, "learning_rate": 9.998742869458263e-07, "loss": 0.004697948694229126, "memory(GiB)": 66.07, "reward": 1.2083333730697632, "reward_std": 0.7216877937316895, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.7083333134651184, "rewards/FormatCorrectnessReward/std": 0.33427897095680237, "step": 256, "train_speed(iter/s)": 0.020501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8002.0, "completions/mean_length": 1131.666748046875, "completions/min_length": 312.0, "epoch": 0.010649318360750839, "grad_norm": 3.613243291669363, "kl": 0.114013671875, "learning_rate": 9.998719039147528e-07, "loss": -0.023488661274313927, "memory(GiB)": 66.07, "reward": 1.25, "reward_std": 0.753778338432312, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.6666666865348816, "rewards/FormatCorrectnessReward/std": 0.38924944400787354, "step": 257, "train_speed(iter/s)": 0.020372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/mean_length": 635.3333740234375, "completions/min_length": 127.0, "epoch": 0.010690755397173995, "grad_norm": 6.057277557888356, "kl": 0.1041259765625, "learning_rate": 9.998694985116404e-07, "loss": 0.004164626356214285, "memory(GiB)": 66.07, "reward": 1.5833333730697632, "reward_std": 0.6336522102355957, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3256694972515106, "step": 258, "train_speed(iter/s)": 0.020388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1070.0, "completions/mean_length": 675.4166870117188, "completions/min_length": 456.0, "epoch": 0.01073219243359715, "grad_norm": 2.774493947596225, "kl": 0.1297607421875, "learning_rate": 9.998670707365967e-07, "loss": 0.005185435526072979, "memory(GiB)": 66.07, "reward": 1.7916667461395264, "reward_std": 0.33427897095680237, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 259, "train_speed(iter/s)": 0.020403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4036.0, "completions/mean_length": 992.4166870117188, "completions/min_length": 542.0, "epoch": 0.010773629470020304, "grad_norm": 4.474821207424958, "kl": 0.107666015625, "learning_rate": 9.998646205897307e-07, "loss": 0.004305521957576275, "memory(GiB)": 66.07, "reward": 1.875, "reward_std": 0.22613351047039032, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 260, "train_speed(iter/s)": 0.020355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 7999.0, "completions/mean_length": 1230.166748046875, "completions/min_length": 378.0, "epoch": 0.01081506650644346, "grad_norm": 2.7373741250321832, "kl": 0.13720703125, "learning_rate": 9.99862148071152e-07, "loss": -0.11132073402404785, "memory(GiB)": 66.07, "reward": 1.5833333730697632, "reward_std": 0.4174235463142395, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.33709993958473206, "step": 261, "train_speed(iter/s)": 0.020227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/mean_length": 540.1666870117188, "completions/min_length": 80.0, "epoch": 0.010856503542866614, "grad_norm": 17.165325517803144, "kl": 0.169921875, "learning_rate": 9.99859653180971e-07, "loss": 0.006795207969844341, "memory(GiB)": 66.07, "reward": 0.8333333730697632, "reward_std": 0.32566946744918823, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3256694972515106, "step": 262, "train_speed(iter/s)": 0.020248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/mean_length": 604.4166870117188, "completions/min_length": 427.0, "epoch": 0.010897940579289769, "grad_norm": 2.940725729142261, "kl": 0.1177978515625, "learning_rate": 9.998571359192995e-07, "loss": 0.004729559179395437, "memory(GiB)": 66.07, "reward": 0.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 263, "train_speed(iter/s)": 0.020265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/mean_length": 559.25, "completions/min_length": 423.0, "epoch": 0.010939377615712923, "grad_norm": 3.138134285117118, "kl": 0.1397705078125, "learning_rate": 9.998545962862501e-07, "loss": 0.005594549234956503, "memory(GiB)": 66.07, "reward": 1.1666667461395264, "reward_std": 0.44381269812583923, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 264, "train_speed(iter/s)": 0.020254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/mean_length": 502.5, "completions/min_length": 45.0, "epoch": 0.01098081465213608, "grad_norm": 60.98579103778109, "kl": 0.185302734375, "learning_rate": 9.998520342819366e-07, "loss": 0.007427672855556011, "memory(GiB)": 66.07, "reward": 0.9166666865348816, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 265, "train_speed(iter/s)": 0.020276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/mean_length": 508.25, "completions/min_length": 364.0, "epoch": 0.011022251688559234, "grad_norm": 6.256086108449647, "kl": 0.158447265625, "learning_rate": 9.998494499064735e-07, "loss": 0.006331603042781353, "memory(GiB)": 66.07, "reward": 1.7916667461395264, "reward_std": 0.5822500586509705, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 266, "train_speed(iter/s)": 0.020301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 940.0, "completions/mean_length": 572.5833740234375, "completions/min_length": 346.0, "epoch": 0.011063688724982389, "grad_norm": 4.041542397594511, "kl": 0.137939453125, "learning_rate": 9.998468431599767e-07, "loss": 0.005518198013305664, "memory(GiB)": 66.07, "reward": 1.3333333730697632, "reward_std": 0.5365433692932129, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 267, "train_speed(iter/s)": 0.020317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1323.0, "completions/mean_length": 645.6666870117188, "completions/min_length": 55.0, "epoch": 0.011105125761405545, "grad_norm": 133.1005319910557, "kl": 1.3145751953125, "learning_rate": 9.998442140425624e-07, "loss": 0.052513957023620605, "memory(GiB)": 66.07, "reward": 0.8333333730697632, "reward_std": 0.32566946744918823, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3256694972515106, "step": 268, "train_speed(iter/s)": 0.02031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/mean_length": 626.5, "completions/min_length": 391.0, "epoch": 0.0111465627978287, "grad_norm": 3.2553762155727592, "kl": 0.133544921875, "learning_rate": 9.99841562554349e-07, "loss": 0.0053420765325427055, "memory(GiB)": 66.07, "reward": 1.5416667461395264, "reward_std": 0.5822500586509705, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 269, "train_speed(iter/s)": 0.020326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/mean_length": 563.0833740234375, "completions/min_length": 416.0, "epoch": 0.011187999834251854, "grad_norm": 0.3398627467149081, "kl": 0.174560546875, "learning_rate": 9.998388886954545e-07, "loss": 0.0069891465827822685, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 270, "train_speed(iter/s)": 0.020345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/mean_length": 546.6666870117188, "completions/min_length": 429.0, "epoch": 0.01122943687067501, "grad_norm": 3.3997225720923625, "kl": 0.13427734375, "learning_rate": 9.99836192465999e-07, "loss": 0.0053717344999313354, "memory(GiB)": 66.07, "reward": 1.8333333730697632, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 271, "train_speed(iter/s)": 0.020368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/mean_length": 639.3333740234375, "completions/min_length": 300.0, "epoch": 0.011270873907098165, "grad_norm": 2.487721150400784, "kl": 0.1259765625, "learning_rate": 9.998334738661028e-07, "loss": 0.005035032983869314, "memory(GiB)": 66.07, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 272, "train_speed(iter/s)": 0.020387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1393.0, "completions/mean_length": 710.5, "completions/min_length": 301.0, "epoch": 0.01131231094352132, "grad_norm": 386.480776004065, "kl": 4.135498046875, "learning_rate": 9.998307328958877e-07, "loss": 0.1656467318534851, "memory(GiB)": 66.07, "reward": 0.7916666865348816, "reward_std": 0.3964807391166687, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.3964807391166687, "step": 273, "train_speed(iter/s)": 0.020394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/mean_length": 598.0833740234375, "completions/min_length": 452.0, "epoch": 0.011353747979944474, "grad_norm": 15.16390883885331, "kl": 0.157470703125, "learning_rate": 9.998279695554767e-07, "loss": 0.006285454146564007, "memory(GiB)": 66.07, "reward": 1.75, "reward_std": 0.39886200428009033, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 274, "train_speed(iter/s)": 0.020416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 956.0, "completions/mean_length": 700.5, "completions/min_length": 504.0, "epoch": 0.01139518501636763, "grad_norm": 2.9823512913315366, "kl": 0.1357421875, "learning_rate": 9.99825183844993e-07, "loss": 0.005416383501142263, "memory(GiB)": 66.07, "reward": 0.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 275, "train_speed(iter/s)": 0.020431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/mean_length": 528.0833740234375, "completions/min_length": 392.0, "epoch": 0.011436622052790784, "grad_norm": 3.4057325806104486, "kl": 0.1396484375, "learning_rate": 9.998223757645617e-07, "loss": 0.005592375993728638, "memory(GiB)": 66.07, "reward": 1.2916667461395264, "reward_std": 0.45016831159591675, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 276, "train_speed(iter/s)": 0.020453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/mean_length": 624.9166870117188, "completions/min_length": 449.0, "epoch": 0.011478059089213939, "grad_norm": 4.095459144690608, "kl": 0.1241455078125, "learning_rate": 9.99819545314308e-07, "loss": 0.004964093677699566, "memory(GiB)": 66.07, "reward": 1.1666667461395264, "reward_std": 0.3892494738101959, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.38924944400787354, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 277, "train_speed(iter/s)": 0.020472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/mean_length": 616.8333740234375, "completions/min_length": 380.0, "epoch": 0.011519496125637095, "grad_norm": 3.8645549657270544, "kl": 0.1387939453125, "learning_rate": 9.99816692494359e-07, "loss": 0.005559136625379324, "memory(GiB)": 66.07, "reward": 1.0833333730697632, "reward_std": 0.46871843934059143, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 278, "train_speed(iter/s)": 0.02049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1001.0, "completions/mean_length": 486.8333435058594, "completions/min_length": 263.0, "epoch": 0.01156093316206025, "grad_norm": 8.59628793051338, "kl": 0.172607421875, "learning_rate": 9.998138173048423e-07, "loss": 0.00690855598077178, "memory(GiB)": 66.07, "reward": 1.75, "reward_std": 0.6215815544128418, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 279, "train_speed(iter/s)": 0.020508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/mean_length": 663.0833740234375, "completions/min_length": 434.0, "epoch": 0.011602370198483404, "grad_norm": 2.602006223196279, "kl": 0.1370849609375, "learning_rate": 9.998109197458865e-07, "loss": 0.005483200307935476, "memory(GiB)": 66.07, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 280, "train_speed(iter/s)": 0.020525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/mean_length": 528.5, "completions/min_length": 389.0, "epoch": 0.01164380723490656, "grad_norm": 4.165979770486904, "kl": 0.18505859375, "learning_rate": 9.998079998176213e-07, "loss": 0.007386287208646536, "memory(GiB)": 66.07, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 281, "train_speed(iter/s)": 0.020545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/mean_length": 520.5833740234375, "completions/min_length": 422.0, "epoch": 0.011685244271329715, "grad_norm": 3.5167574110503743, "kl": 0.173828125, "learning_rate": 9.99805057520177e-07, "loss": 0.0069505623541772366, "memory(GiB)": 66.07, "reward": 1.9166667461395264, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 282, "train_speed(iter/s)": 0.020555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1047.0, "completions/mean_length": 655.6666870117188, "completions/min_length": 425.0, "epoch": 0.01172668130775287, "grad_norm": 3.439423192985027, "kl": 0.1248779296875, "learning_rate": 9.99802092853686e-07, "loss": 0.0049915313720703125, "memory(GiB)": 66.07, "reward": 1.25, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 283, "train_speed(iter/s)": 0.020568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/mean_length": 501.0, "completions/min_length": 386.0, "epoch": 0.011768118344176024, "grad_norm": 3.5323464444195434, "kl": 0.16552734375, "learning_rate": 9.997991058182806e-07, "loss": 0.006627972237765789, "memory(GiB)": 66.07, "reward": 1.8333333730697632, "reward_std": 0.32566946744918823, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 284, "train_speed(iter/s)": 0.020592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1508.0, "completions/mean_length": 664.0, "completions/min_length": 432.0, "epoch": 0.01180955538059918, "grad_norm": 0.23346696559004557, "kl": 0.12158203125, "learning_rate": 9.997960964140945e-07, "loss": 0.004865213297307491, "memory(GiB)": 66.07, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 285, "train_speed(iter/s)": 0.020594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/mean_length": 533.6666870117188, "completions/min_length": 421.0, "epoch": 0.011850992417022334, "grad_norm": 0.27120622630094215, "kl": 0.15625, "learning_rate": 9.997930646412624e-07, "loss": 0.006265889387577772, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 286, "train_speed(iter/s)": 0.020618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 902.0, "completions/mean_length": 600.5833740234375, "completions/min_length": 426.0, "epoch": 0.011892429453445489, "grad_norm": 0.2032594912395712, "kl": 0.1295166015625, "learning_rate": 9.997900104999202e-07, "loss": 0.005179170519113541, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 287, "train_speed(iter/s)": 0.020634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1262.416748046875, "completions/min_length": 554.0, "epoch": 0.011933866489868645, "grad_norm": 1.4374816958317291, "kl": 0.137939453125, "learning_rate": 9.99786933990204e-07, "loss": -0.2594202160835266, "memory(GiB)": 66.07, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 288, "train_speed(iter/s)": 0.020511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/mean_length": 611.6666870117188, "completions/min_length": 497.0, "epoch": 0.0119753035262918, "grad_norm": 0.19421033050250716, "kl": 0.1341552734375, "learning_rate": 9.997838351122523e-07, "loss": 0.005360499955713749, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 289, "train_speed(iter/s)": 0.020532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/mean_length": 618.0, "completions/min_length": 478.0, "epoch": 0.012016740562714954, "grad_norm": 0.25134991332196116, "kl": 0.142333984375, "learning_rate": 9.997807138662032e-07, "loss": 0.005692530423402786, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 290, "train_speed(iter/s)": 0.020547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/mean_length": 613.5, "completions/min_length": 494.0, "epoch": 0.01205817759913811, "grad_norm": 7.75777767651102, "kl": 0.1614990234375, "learning_rate": 9.997775702521965e-07, "loss": 0.0064702630043029785, "memory(GiB)": 66.07, "reward": 1.1666667461395264, "reward_std": 0.5365433692932129, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3256694972515106, "step": 291, "train_speed(iter/s)": 0.02056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/mean_length": 593.9166870117188, "completions/min_length": 442.0, "epoch": 0.012099614635561265, "grad_norm": 3.561960719730361, "kl": 0.1248779296875, "learning_rate": 9.99774404270373e-07, "loss": 0.005005916114896536, "memory(GiB)": 66.07, "reward": 0.9166666865348816, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 292, "train_speed(iter/s)": 0.020576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 895.0, "completions/mean_length": 648.1666870117188, "completions/min_length": 544.0, "epoch": 0.01214105167198442, "grad_norm": 2.8254314884837983, "kl": 0.0950927734375, "learning_rate": 9.997712159208743e-07, "loss": 0.0038094520568847656, "memory(GiB)": 66.07, "reward": 1.8333333730697632, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 293, "train_speed(iter/s)": 0.020591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/mean_length": 636.75, "completions/min_length": 214.0, "epoch": 0.012182488708407574, "grad_norm": 12.767249399151131, "kl": 0.1339111328125, "learning_rate": 9.997680052038434e-07, "loss": 0.005357315298169851, "memory(GiB)": 66.07, "reward": 1.8333333730697632, "reward_std": 0.5773502588272095, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 294, "train_speed(iter/s)": 0.020608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/mean_length": 626.6666870117188, "completions/min_length": 487.0, "epoch": 0.01222392574483073, "grad_norm": 2.593545307320754, "kl": 0.126220703125, "learning_rate": 9.997647721194234e-07, "loss": 0.005046626087278128, "memory(GiB)": 66.07, "reward": 1.875, "reward_std": 0.3107907772064209, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 295, "train_speed(iter/s)": 0.020626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2327.0, "completions/mean_length": 885.5833740234375, "completions/min_length": 575.0, "epoch": 0.012265362781253885, "grad_norm": 2.604057150675933, "kl": 0.092529296875, "learning_rate": 9.997615166677596e-07, "loss": 0.0036995112895965576, "memory(GiB)": 66.07, "reward": 0.875, "reward_std": 0.3107907772064209, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 296, "train_speed(iter/s)": 0.020615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/mean_length": 591.8333740234375, "completions/min_length": 499.0, "epoch": 0.01230679981767704, "grad_norm": 30.66267465182808, "kl": 0.2421875, "learning_rate": 9.997582388489973e-07, "loss": 0.009695231914520264, "memory(GiB)": 66.07, "reward": 1.6666667461395264, "reward_std": 0.4923659861087799, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 297, "train_speed(iter/s)": 0.020635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/mean_length": 489.66668701171875, "completions/min_length": 351.0, "epoch": 0.012348236854100195, "grad_norm": 3.214772918429275, "kl": 0.14208984375, "learning_rate": 9.997549386632835e-07, "loss": 0.005680471658706665, "memory(GiB)": 66.07, "reward": 1.875, "reward_std": 0.3107907772064209, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 298, "train_speed(iter/s)": 0.020656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/mean_length": 726.5, "completions/min_length": 555.0, "epoch": 0.01238967389052335, "grad_norm": 2.900714149806617, "kl": 0.0950927734375, "learning_rate": 9.997516161107656e-07, "loss": 0.0037895888090133667, "memory(GiB)": 66.07, "reward": 0.7916666865348816, "reward_std": 0.33427897095680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.33427897095680237, "step": 299, "train_speed(iter/s)": 0.020671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/mean_length": 714.8333740234375, "completions/min_length": 606.0, "epoch": 0.012431110926946504, "grad_norm": 14.736806351666651, "kl": 0.10498046875, "learning_rate": 9.997482711915925e-07, "loss": 0.004194488283246756, "memory(GiB)": 66.07, "reward": 1.75, "reward_std": 0.5, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 300, "train_speed(iter/s)": 0.020686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/mean_length": 644.3333740234375, "completions/min_length": 433.0, "epoch": 0.01247254796336966, "grad_norm": 2.9510157847621357, "kl": 0.1329345703125, "learning_rate": 9.997449039059139e-07, "loss": 0.00531776761636138, "memory(GiB)": 66.07, "reward": 1.8333333730697632, "reward_std": 0.24618299305438995, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 301, "train_speed(iter/s)": 0.020702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/mean_length": 666.3333740234375, "completions/min_length": 513.0, "epoch": 0.012513984999792815, "grad_norm": 3.0754691226789137, "kl": 0.1099853515625, "learning_rate": 9.997415142538805e-07, "loss": 0.004392802715301514, "memory(GiB)": 66.07, "reward": 1.8333333730697632, "reward_std": 0.32566946744918823, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 302, "train_speed(iter/s)": 0.020719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1099.0, "completions/mean_length": 836.1666870117188, "completions/min_length": 271.0, "epoch": 0.01255542203621597, "grad_norm": 26895.22486153068, "kl": 30.44873046875, "learning_rate": 9.99738102235644e-07, "loss": 1.2191181182861328, "memory(GiB)": 66.07, "reward": 1.625, "reward_std": 0.6440284848213196, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 303, "train_speed(iter/s)": 0.020729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/mean_length": 528.0, "completions/min_length": 452.0, "epoch": 0.012596859072639124, "grad_norm": 3.265579981324094, "kl": 0.172119140625, "learning_rate": 9.997346678513568e-07, "loss": 0.006885151378810406, "memory(GiB)": 66.07, "reward": 1.8333333730697632, "reward_std": 0.24618299305438995, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 304, "train_speed(iter/s)": 0.02075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/mean_length": 718.6666870117188, "completions/min_length": 605.0, "epoch": 0.01263829610906228, "grad_norm": 3.540650289035553, "kl": 0.1326904296875, "learning_rate": 9.997312111011732e-07, "loss": 0.005310282111167908, "memory(GiB)": 66.07, "reward": 1.75, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 305, "train_speed(iter/s)": 0.020765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/mean_length": 678.6666870117188, "completions/min_length": 560.0, "epoch": 0.012679733145485435, "grad_norm": 85.76486792846123, "kl": 0.2535400390625, "learning_rate": 9.997277319852474e-07, "loss": 0.010173222050070763, "memory(GiB)": 66.07, "reward": 1.8333333730697632, "reward_std": 0.32566946744918823, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 306, "train_speed(iter/s)": 0.02078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/mean_length": 672.6666870117188, "completions/min_length": 525.0, "epoch": 0.01272117018190859, "grad_norm": 2.938209465223338, "kl": 0.1204833984375, "learning_rate": 9.997242305037353e-07, "loss": 0.004825115203857422, "memory(GiB)": 66.07, "reward": 1.3333333730697632, "reward_std": 0.4923659861087799, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 307, "train_speed(iter/s)": 0.020793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 924.0, "completions/mean_length": 744.0833740234375, "completions/min_length": 597.0, "epoch": 0.012762607218331746, "grad_norm": 4.598131124072292, "kl": 0.122802734375, "learning_rate": 9.997207066567937e-07, "loss": 0.0049080997705459595, "memory(GiB)": 66.07, "reward": 1.75, "reward_std": 0.5, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 308, "train_speed(iter/s)": 0.020809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8000.0, "completions/mean_length": 1297.8333740234375, "completions/min_length": 555.0, "epoch": 0.0128040442547549, "grad_norm": 2.5072261970850835, "kl": 0.132568359375, "learning_rate": 9.997171604445802e-07, "loss": -0.16145741939544678, "memory(GiB)": 66.07, "reward": 1.7916667461395264, "reward_std": 0.3964807391166687, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 309, "train_speed(iter/s)": 0.020682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1045.0, "completions/mean_length": 807.75, "completions/min_length": 664.0, "epoch": 0.012845481291178054, "grad_norm": 3.0009882113608923, "kl": 0.110595703125, "learning_rate": 9.997135918672535e-07, "loss": 0.004423121921718121, "memory(GiB)": 66.07, "reward": 1.5, "reward_std": 0.5222329497337341, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 310, "train_speed(iter/s)": 0.020692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 915.0, "completions/mean_length": 736.5, "completions/min_length": 601.0, "epoch": 0.01288691832760121, "grad_norm": 3.02676591940881, "kl": 0.1146240234375, "learning_rate": 9.997100009249735e-07, "loss": 0.004575421568006277, "memory(GiB)": 66.07, "reward": 1.125, "reward_std": 0.3107907772064209, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 311, "train_speed(iter/s)": 0.020706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/mean_length": 635.75, "completions/min_length": 493.0, "epoch": 0.012928355364024365, "grad_norm": 0.4882092820791339, "kl": 0.1591796875, "learning_rate": 9.997063876179007e-07, "loss": 0.006356876343488693, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 312, "train_speed(iter/s)": 0.020722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1037.0, "completions/mean_length": 777.0833740234375, "completions/min_length": 542.0, "epoch": 0.01296979240044752, "grad_norm": 3.186072466159794, "kl": 0.1251220703125, "learning_rate": 9.997027519461966e-07, "loss": 0.00500206183642149, "memory(GiB)": 66.07, "reward": 1.75, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 313, "train_speed(iter/s)": 0.020734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 854.0, "completions/mean_length": 690.75, "completions/min_length": 602.0, "epoch": 0.013011229436870674, "grad_norm": 0.21111566098758347, "kl": 0.1234130859375, "learning_rate": 9.996990939100246e-07, "loss": 0.004938122816383839, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 314, "train_speed(iter/s)": 0.020749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/mean_length": 677.1666870117188, "completions/min_length": 527.0, "epoch": 0.01305266647329383, "grad_norm": 0.325215962162683, "kl": 0.1300048828125, "learning_rate": 9.996954135095478e-07, "loss": 0.005194446537643671, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 315, "train_speed(iter/s)": 0.020766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/mean_length": 584.3333740234375, "completions/min_length": 442.0, "epoch": 0.013094103509716985, "grad_norm": 0.2701512904830647, "kl": 0.158935546875, "learning_rate": 9.99691710744931e-07, "loss": 0.006354997865855694, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 316, "train_speed(iter/s)": 0.020781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/mean_length": 650.25, "completions/min_length": 538.0, "epoch": 0.01313554054614014, "grad_norm": 0.21793999845929643, "kl": 0.122802734375, "learning_rate": 9.996879856163404e-07, "loss": 0.004916059784591198, "memory(GiB)": 66.07, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 317, "train_speed(iter/s)": 0.020799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1280.666748046875, "completions/min_length": 517.0, "epoch": 0.013176977582563296, "grad_norm": 2.7337251329557377, "kl": 0.11572265625, "learning_rate": 9.996842381239422e-07, "loss": -0.12760195136070251, "memory(GiB)": 66.07, "reward": 1.125, "reward_std": 0.7111130952835083, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.33427897095680237, "step": 318, "train_speed(iter/s)": 0.020687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.0, "completions/mean_length": 646.3333740234375, "completions/min_length": 530.0, "epoch": 0.01321841461898645, "grad_norm": 2.6983403546985545, "kl": 0.1309814453125, "learning_rate": 9.996804682679043e-07, "loss": 0.005233307834714651, "memory(GiB)": 66.07, "reward": 1.9166667461395264, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 319, "train_speed(iter/s)": 0.020703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/mean_length": 702.4166870117188, "completions/min_length": 599.0, "epoch": 0.013259851655409605, "grad_norm": 3.048664692250525, "kl": 0.1220703125, "learning_rate": 9.996766760483955e-07, "loss": 0.004884372465312481, "memory(GiB)": 66.07, "reward": 1.5833333730697632, "reward_std": 0.5967081785202026, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 320, "train_speed(iter/s)": 0.020661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 883.0, "completions/mean_length": 688.5833740234375, "completions/min_length": 571.0, "epoch": 0.013301288691832761, "grad_norm": 3.006825603889375, "kl": 0.1392822265625, "learning_rate": 9.996728614655853e-07, "loss": 0.005574067588895559, "memory(GiB)": 66.07, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 321, "train_speed(iter/s)": 0.020677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 903.0, "completions/mean_length": 656.5, "completions/min_length": 441.0, "epoch": 0.013342725728255915, "grad_norm": 2.897295789565799, "kl": 0.1259765625, "learning_rate": 9.996690245196446e-07, "loss": 0.005034427158534527, "memory(GiB)": 66.07, "reward": 0.9166666865348816, "reward_std": 0.19462473690509796, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 322, "train_speed(iter/s)": 0.020626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 902.0, "completions/mean_length": 723.75, "completions/min_length": 579.0, "epoch": 0.01338416276467907, "grad_norm": 0.29103949694738496, "kl": 0.10107421875, "learning_rate": 9.99665165210745e-07, "loss": 0.004040519706904888, "memory(GiB)": 66.07, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 323, "train_speed(iter/s)": 0.02064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1423.0, "completions/mean_length": 793.5833740234375, "completions/min_length": 540.0, "epoch": 0.013425599801102224, "grad_norm": 1517.366151683939, "kl": 6.439453125, "learning_rate": 9.996612835390594e-07, "loss": 0.2589181661605835, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 324, "train_speed(iter/s)": 0.020645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1058.0, "completions/mean_length": 799.0833740234375, "completions/min_length": 705.0, "epoch": 0.01346703683752538, "grad_norm": 30.069651245250412, "kl": 0.242919921875, "learning_rate": 9.996573795047616e-07, "loss": 0.009708861820399761, "memory(GiB)": 66.07, "reward": 1.5, "reward_std": 0.5640760660171509, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 325, "train_speed(iter/s)": 0.020656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/mean_length": 651.0, "completions/min_length": 519.0, "epoch": 0.013508473873948535, "grad_norm": 4.12662950194344, "kl": 0.1148681640625, "learning_rate": 9.996534531080259e-07, "loss": 0.004599382635205984, "memory(GiB)": 66.07, "reward": 1.875, "reward_std": 0.3107907772064209, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 326, "train_speed(iter/s)": 0.020668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1032.0, "completions/mean_length": 819.9166870117188, "completions/min_length": 132.0, "epoch": 0.01354991091037169, "grad_norm": 5.093078024989478, "kl": 0.08935546875, "learning_rate": 9.996495043490283e-07, "loss": 0.0035733382683247328, "memory(GiB)": 66.07, "reward": 0.8333333730697632, "reward_std": 0.32566946744918823, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3256694972515106, "step": 327, "train_speed(iter/s)": 0.020676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1203.416748046875, "completions/min_length": 180.0, "epoch": 0.013591347946794846, "grad_norm": 128.2663527513692, "kl": 0.7698974609375, "learning_rate": 9.996455332279457e-07, "loss": -0.05087852478027344, "memory(GiB)": 66.07, "reward": 1.5833333730697632, "reward_std": 0.5967081785202026, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.6666666865348816, "rewards/FormatCorrectnessReward/std": 0.38924944400787354, "step": 328, "train_speed(iter/s)": 0.020571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/mean_length": 665.8333740234375, "completions/min_length": 544.0, "epoch": 0.013632784983218, "grad_norm": 0.20754321179324062, "kl": 0.09716796875, "learning_rate": 9.996415397449557e-07, "loss": 0.0038863676600158215, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 329, "train_speed(iter/s)": 0.020586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 954.0, "completions/mean_length": 733.25, "completions/min_length": 511.0, "epoch": 0.013674222019641155, "grad_norm": 5.757930836533083, "kl": 0.1044921875, "learning_rate": 9.996375239002368e-07, "loss": 0.004176884889602661, "memory(GiB)": 66.07, "reward": 1.75, "reward_std": 0.26111647486686707, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.26111647486686707, "step": 330, "train_speed(iter/s)": 0.020596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/mean_length": 719.0833740234375, "completions/min_length": 617.0, "epoch": 0.013715659056064311, "grad_norm": 2.5215759037532863, "kl": 0.0894775390625, "learning_rate": 9.99633485693969e-07, "loss": 0.0035776100121438503, "memory(GiB)": 66.07, "reward": 1.7916667461395264, "reward_std": 0.25746431946754456, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 331, "train_speed(iter/s)": 0.02061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 926.0, "completions/mean_length": 659.1666870117188, "completions/min_length": 241.0, "epoch": 0.013757096092487466, "grad_norm": 3.746306248838124, "kl": 0.0955810546875, "learning_rate": 9.996294251263329e-07, "loss": 0.003819401143118739, "memory(GiB)": 66.07, "reward": 1.6666667461395264, "reward_std": 0.7784988880157471, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3892494738101959, "step": 332, "train_speed(iter/s)": 0.020623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/mean_length": 733.6666870117188, "completions/min_length": 570.0, "epoch": 0.01379853312891062, "grad_norm": 3.076282397185723, "kl": 0.0841064453125, "learning_rate": 9.996253421975102e-07, "loss": 0.003360653994604945, "memory(GiB)": 66.07, "reward": 1.4583333730697632, "reward_std": 0.49810245633125305, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 333, "train_speed(iter/s)": 0.020637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/mean_length": 745.8333740234375, "completions/min_length": 614.0, "epoch": 0.013839970165333774, "grad_norm": 0.2572467461459486, "kl": 0.089111328125, "learning_rate": 9.996212369076838e-07, "loss": 0.003564195241779089, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 334, "train_speed(iter/s)": 0.02065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/mean_length": 636.6666870117188, "completions/min_length": 164.0, "epoch": 0.01388140720175693, "grad_norm": 4.852584988072591, "kl": 0.11865234375, "learning_rate": 9.996171092570373e-07, "loss": 0.00473379110917449, "memory(GiB)": 66.07, "reward": 0.7916666865348816, "reward_std": 0.33427897095680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.33427897095680237, "step": 335, "train_speed(iter/s)": 0.020665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 833.0, "completions/mean_length": 727.1666870117188, "completions/min_length": 550.0, "epoch": 0.013922844238180085, "grad_norm": 2.5468238995783703, "kl": 0.0924072265625, "learning_rate": 9.996129592457556e-07, "loss": 0.0036972365342080593, "memory(GiB)": 66.07, "reward": 1.8333333730697632, "reward_std": 0.3892494738101959, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 336, "train_speed(iter/s)": 0.020679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1121.0, "completions/mean_length": 768.9166870117188, "completions/min_length": 495.0, "epoch": 0.01396428127460324, "grad_norm": 2.6113912544577396, "kl": 0.0843505859375, "learning_rate": 9.996087868740241e-07, "loss": 0.0033719539642333984, "memory(GiB)": 66.07, "reward": 0.9166666865348816, "reward_std": 0.19462473690509796, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 337, "train_speed(iter/s)": 0.020686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/mean_length": 687.8333740234375, "completions/min_length": 463.0, "epoch": 0.014005718311026396, "grad_norm": 2.568648720262315, "kl": 0.103515625, "learning_rate": 9.9960459214203e-07, "loss": 0.004138152115046978, "memory(GiB)": 66.07, "reward": 1.75, "reward_std": 0.39886200428009033, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 338, "train_speed(iter/s)": 0.020699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 920.0, "completions/mean_length": 713.8333740234375, "completions/min_length": 588.0, "epoch": 0.01404715534744955, "grad_norm": 2.830796414223034, "kl": 0.093994140625, "learning_rate": 9.996003750499607e-07, "loss": 0.003773212432861328, "memory(GiB)": 66.07, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 339, "train_speed(iter/s)": 0.020711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/mean_length": 735.0833740234375, "completions/min_length": 617.0, "epoch": 0.014088592383872705, "grad_norm": 2.39673767287172, "kl": 0.095458984375, "learning_rate": 9.99596135598005e-07, "loss": 0.0038341484032571316, "memory(GiB)": 66.07, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 340, "train_speed(iter/s)": 0.020725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/mean_length": 736.4166870117188, "completions/min_length": 649.0, "epoch": 0.014130029420295861, "grad_norm": 2.9128832816997483, "kl": 0.0880126953125, "learning_rate": 9.995918737863528e-07, "loss": 0.003517111297696829, "memory(GiB)": 66.07, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 341, "train_speed(iter/s)": 0.020738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 902.0, "completions/mean_length": 693.1666870117188, "completions/min_length": 538.0, "epoch": 0.014171466456719016, "grad_norm": 2.9397926015099203, "kl": 0.1002197265625, "learning_rate": 9.995875896151944e-07, "loss": 0.004002253524959087, "memory(GiB)": 66.07, "reward": 1.7916667461395264, "reward_std": 0.45016831159591675, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 342, "train_speed(iter/s)": 0.02075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1003.0, "completions/mean_length": 808.3333740234375, "completions/min_length": 253.0, "epoch": 0.01421290349314217, "grad_norm": 6.287613906572899, "kl": 0.0869140625, "learning_rate": 9.99583283084722e-07, "loss": 0.00348179554566741, "memory(GiB)": 66.07, "reward": 1.5833333730697632, "reward_std": 0.7017294764518738, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3256694972515106, "step": 343, "train_speed(iter/s)": 0.020761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.0, "completions/mean_length": 678.0, "completions/min_length": 422.0, "epoch": 0.014254340529565325, "grad_norm": 142.65824821291895, "kl": 1.1771240234375, "learning_rate": 9.995789541951285e-07, "loss": 0.046893130987882614, "memory(GiB)": 66.07, "reward": 1.9166667461395264, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 344, "train_speed(iter/s)": 0.020776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/mean_length": 671.5833740234375, "completions/min_length": 453.0, "epoch": 0.014295777565988481, "grad_norm": 3.488875864424361, "kl": 0.0980224609375, "learning_rate": 9.99574602946607e-07, "loss": 0.003915782086551189, "memory(GiB)": 66.07, "reward": 0.8333333730697632, "reward_std": 0.24618299305438995, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 345, "train_speed(iter/s)": 0.02079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/mean_length": 653.6666870117188, "completions/min_length": 547.0, "epoch": 0.014337214602411635, "grad_norm": 2.900550347825707, "kl": 0.0872802734375, "learning_rate": 9.995702293393526e-07, "loss": 0.003489126916974783, "memory(GiB)": 66.07, "reward": 1.875, "reward_std": 0.3107907772064209, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 346, "train_speed(iter/s)": 0.020804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 895.0, "completions/mean_length": 684.9166870117188, "completions/min_length": 603.0, "epoch": 0.01437865163883479, "grad_norm": 2.868092260971426, "kl": 0.125, "learning_rate": 9.99565833373561e-07, "loss": 0.005003035068511963, "memory(GiB)": 66.07, "reward": 1.7916667461395264, "reward_std": 0.5822500586509705, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 347, "train_speed(iter/s)": 0.020818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1142.0, "completions/mean_length": 770.0833740234375, "completions/min_length": 603.0, "epoch": 0.014420088675257946, "grad_norm": 3.0307826173901793, "kl": 0.092529296875, "learning_rate": 9.99561415049429e-07, "loss": 0.0037030677776783705, "memory(GiB)": 66.07, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 348, "train_speed(iter/s)": 0.020827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/mean_length": 728.3333740234375, "completions/min_length": 580.0, "epoch": 0.0144615257116811, "grad_norm": 0.3855794072506539, "kl": 0.1156005859375, "learning_rate": 9.995569743671544e-07, "loss": 0.004614721052348614, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 349, "train_speed(iter/s)": 0.020836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/mean_length": 769.3333740234375, "completions/min_length": 686.0, "epoch": 0.014502962748104255, "grad_norm": 0.20933879596618915, "kl": 0.10791015625, "learning_rate": 9.995525113269358e-07, "loss": 0.00431794673204422, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 350, "train_speed(iter/s)": 0.020847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/mean_length": 609.5, "completions/min_length": 436.0, "epoch": 0.014544399784527411, "grad_norm": 0.4103529137077096, "kl": 0.1123046875, "learning_rate": 9.99548025928973e-07, "loss": 0.004493508487939835, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 351, "train_speed(iter/s)": 0.020864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1286.3333740234375, "completions/min_length": 537.0, "epoch": 0.014585836820950566, "grad_norm": 3.173178987960116, "kl": 0.1240234375, "learning_rate": 9.995435181734669e-07, "loss": -0.22986316680908203, "memory(GiB)": 66.07, "reward": 0.875, "reward_std": 0.3107907772064209, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 352, "train_speed(iter/s)": 0.020758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1216.0, "completions/mean_length": 889.4166870117188, "completions/min_length": 729.0, "epoch": 0.01462727385737372, "grad_norm": 2.42777150392111, "kl": 0.08984375, "learning_rate": 9.995389880606189e-07, "loss": 0.00359170651063323, "memory(GiB)": 66.07, "reward": 1.75, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 353, "train_speed(iter/s)": 0.020764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 874.0, "completions/mean_length": 643.1666870117188, "completions/min_length": 507.0, "epoch": 0.014668710893796875, "grad_norm": 2.339422260477366, "kl": 0.1053466796875, "learning_rate": 9.995344355906318e-07, "loss": 0.00421460485085845, "memory(GiB)": 66.07, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 354, "train_speed(iter/s)": 0.020778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/mean_length": 827.8333740234375, "completions/min_length": 686.0, "epoch": 0.014710147930220031, "grad_norm": 3.096194640761597, "kl": 0.0836181640625, "learning_rate": 9.995298607637097e-07, "loss": 0.0033470194321125746, "memory(GiB)": 66.07, "reward": 1.4166667461395264, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 355, "train_speed(iter/s)": 0.020788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/mean_length": 662.9166870117188, "completions/min_length": 544.0, "epoch": 0.014751584966643186, "grad_norm": 2.7053920137791034, "kl": 0.1160888671875, "learning_rate": 9.99525263580057e-07, "loss": 0.004644279833883047, "memory(GiB)": 66.07, "reward": 1.1666667461395264, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 356, "train_speed(iter/s)": 0.020801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 833.0, "completions/mean_length": 716.3333740234375, "completions/min_length": 537.0, "epoch": 0.01479302200306634, "grad_norm": 3.514281131411394, "kl": 0.0986328125, "learning_rate": 9.995206440398796e-07, "loss": 0.003948589321225882, "memory(GiB)": 66.07, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 357, "train_speed(iter/s)": 0.020813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/mean_length": 684.5, "completions/min_length": 551.0, "epoch": 0.014834459039489496, "grad_norm": 2.720523807512374, "kl": 0.1123046875, "learning_rate": 9.995160021433844e-07, "loss": 0.004487047903239727, "memory(GiB)": 66.07, "reward": 1.3333333730697632, "reward_std": 0.4923659861087799, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 358, "train_speed(iter/s)": 0.020826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1048.0, "completions/mean_length": 790.75, "completions/min_length": 659.0, "epoch": 0.01487589607591265, "grad_norm": 2.3722377856000207, "kl": 0.088623046875, "learning_rate": 9.995113378907789e-07, "loss": 0.0035442709922790527, "memory(GiB)": 66.07, "reward": 1.5, "reward_std": 0.5222329497337341, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 359, "train_speed(iter/s)": 0.020834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/mean_length": 640.0, "completions/min_length": 477.0, "epoch": 0.014917333112335805, "grad_norm": 2.727032408555185, "kl": 0.1005859375, "learning_rate": 9.995066512822718e-07, "loss": 0.004013687372207642, "memory(GiB)": 66.07, "reward": 1.875, "reward_std": 0.3107907772064209, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 360, "train_speed(iter/s)": 0.020847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/mean_length": 631.9166870117188, "completions/min_length": 512.0, "epoch": 0.014958770148758961, "grad_norm": 0.19603082884824163, "kl": 0.1021728515625, "learning_rate": 9.995019423180733e-07, "loss": 0.0040922933258116245, "memory(GiB)": 66.07, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 361, "train_speed(iter/s)": 0.020862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/mean_length": 790.0833740234375, "completions/min_length": 637.0, "epoch": 0.015000207185182116, "grad_norm": 18.634770972995973, "kl": 0.099365234375, "learning_rate": 9.994972109983937e-07, "loss": 0.003971874713897705, "memory(GiB)": 66.07, "reward": 1.2083333730697632, "reward_std": 0.49810245633125305, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 362, "train_speed(iter/s)": 0.020872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/mean_length": 676.75, "completions/min_length": 563.0, "epoch": 0.01504164422160527, "grad_norm": 0.2273267201728106, "kl": 0.107421875, "learning_rate": 9.994924573234446e-07, "loss": 0.004292338155210018, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 363, "train_speed(iter/s)": 0.020885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/mean_length": 719.75, "completions/min_length": 576.0, "epoch": 0.015083081258028427, "grad_norm": 2.5057878018384345, "kl": 0.082275390625, "learning_rate": 9.994876812934393e-07, "loss": 0.0032923321705311537, "memory(GiB)": 66.07, "reward": 1.625, "reward_std": 0.4826536476612091, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 364, "train_speed(iter/s)": 0.020896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1022.0, "completions/mean_length": 813.8333740234375, "completions/min_length": 601.0, "epoch": 0.015124518294451581, "grad_norm": 2.0605745370972945, "kl": 0.0916748046875, "learning_rate": 9.994828829085914e-07, "loss": 0.0036655764561146498, "memory(GiB)": 66.07, "reward": 1.0833333730697632, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 365, "train_speed(iter/s)": 0.020907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/mean_length": 679.5833740234375, "completions/min_length": 566.0, "epoch": 0.015165955330874736, "grad_norm": 3.9166097967559974, "kl": 0.0877685546875, "learning_rate": 9.994780621691154e-07, "loss": 0.0035188994370400906, "memory(GiB)": 66.07, "reward": 0.875, "reward_std": 0.22613351047039032, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 366, "train_speed(iter/s)": 0.020921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1001.0, "completions/mean_length": 750.75, "completions/min_length": 618.0, "epoch": 0.01520739236729789, "grad_norm": 2.859404301706248, "kl": 0.08984375, "learning_rate": 9.994732190752274e-07, "loss": 0.0035896748304367065, "memory(GiB)": 66.07, "reward": 1.5416667461395264, "reward_std": 0.6200562119483948, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 367, "train_speed(iter/s)": 0.02093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1435.416748046875, "completions/min_length": 658.0, "epoch": 0.015248829403721046, "grad_norm": 1.4048376106558407, "kl": 0.0870361328125, "learning_rate": 9.994683536271436e-07, "loss": -0.26134610176086426, "memory(GiB)": 66.07, "reward": 0.9166666865348816, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 368, "train_speed(iter/s)": 0.020832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 985.0, "completions/mean_length": 725.5833740234375, "completions/min_length": 523.0, "epoch": 0.015290266440144201, "grad_norm": 2.270702736119456, "kl": 0.094970703125, "learning_rate": 9.994634658250824e-07, "loss": 0.0038032731972634792, "memory(GiB)": 66.07, "reward": 1.875, "reward_std": 0.3107907772064209, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 369, "train_speed(iter/s)": 0.020844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/mean_length": 645.3333740234375, "completions/min_length": 525.0, "epoch": 0.015331703476567355, "grad_norm": 3.04069730395272, "kl": 0.1109619140625, "learning_rate": 9.994585556692624e-07, "loss": 0.004441410303115845, "memory(GiB)": 66.07, "reward": 0.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 370, "train_speed(iter/s)": 0.020858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 916.0, "completions/mean_length": 786.6666870117188, "completions/min_length": 673.0, "epoch": 0.015373140512990512, "grad_norm": 2.162895294617575, "kl": 0.08984375, "learning_rate": 9.994536231599028e-07, "loss": 0.0035959682427346706, "memory(GiB)": 66.07, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 371, "train_speed(iter/s)": 0.020868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/mean_length": 766.6666870117188, "completions/min_length": 600.0, "epoch": 0.015414577549413666, "grad_norm": 3.1847613058953965, "kl": 0.0970458984375, "learning_rate": 9.994486682972252e-07, "loss": 0.0038722506724298, "memory(GiB)": 66.07, "reward": 1.8333333730697632, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 372, "train_speed(iter/s)": 0.02088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1038.0, "completions/mean_length": 764.0833740234375, "completions/min_length": 576.0, "epoch": 0.01545601458583682, "grad_norm": 2.5400840243057687, "kl": 0.09765625, "learning_rate": 9.994436910814508e-07, "loss": 0.0039041440468281507, "memory(GiB)": 66.07, "reward": 1.4166667461395264, "reward_std": 0.6685579419136047, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 373, "train_speed(iter/s)": 0.020889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1044.0, "completions/mean_length": 855.5833740234375, "completions/min_length": 694.0, "epoch": 0.015497451622259977, "grad_norm": 5.010199777456857, "kl": 0.08544921875, "learning_rate": 9.994386915128024e-07, "loss": 0.0034130464773625135, "memory(GiB)": 66.07, "reward": 1.1666667461395264, "reward_std": 0.5773502588272095, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 374, "train_speed(iter/s)": 0.020893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/mean_length": 704.3333740234375, "completions/min_length": 591.0, "epoch": 0.015538888658683131, "grad_norm": 2.5255558590322744, "kl": 0.109375, "learning_rate": 9.99433669591504e-07, "loss": 0.004372388124465942, "memory(GiB)": 66.07, "reward": 1.75, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 375, "train_speed(iter/s)": 0.020906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/mean_length": 658.5, "completions/min_length": 547.0, "epoch": 0.015580325695106286, "grad_norm": 0.18401605754434522, "kl": 0.1207275390625, "learning_rate": 9.994286253177802e-07, "loss": 0.004836343228816986, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 376, "train_speed(iter/s)": 0.020917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 890.0, "completions/mean_length": 715.1666870117188, "completions/min_length": 568.0, "epoch": 0.01562176273152944, "grad_norm": 14.555907481072326, "kl": 0.251220703125, "learning_rate": 9.994235586918568e-07, "loss": 0.01007920503616333, "memory(GiB)": 66.07, "reward": 1.5416667461395264, "reward_std": 0.49810245633125305, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 377, "train_speed(iter/s)": 0.020917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/mean_length": 626.5, "completions/min_length": 475.0, "epoch": 0.015663199767952597, "grad_norm": 8.348117456699201, "kl": 0.2816162109375, "learning_rate": 9.994184697139604e-07, "loss": 0.011263927444815636, "memory(GiB)": 66.07, "reward": 1.7916667461395264, "reward_std": 0.3964807391166687, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 378, "train_speed(iter/s)": 0.020932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/mean_length": 599.5833740234375, "completions/min_length": 436.0, "epoch": 0.01570463680437575, "grad_norm": 2.7729268903777524, "kl": 0.1094970703125, "learning_rate": 9.99413358384319e-07, "loss": 0.00437172269448638, "memory(GiB)": 66.07, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 379, "train_speed(iter/s)": 0.020945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1015.0, "completions/mean_length": 783.0, "completions/min_length": 539.0, "epoch": 0.015746073840798906, "grad_norm": 3.328047816007854, "kl": 0.090576171875, "learning_rate": 9.994082247031613e-07, "loss": 0.0036292970180511475, "memory(GiB)": 66.07, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 380, "train_speed(iter/s)": 0.02095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/mean_length": 569.5, "completions/min_length": 438.0, "epoch": 0.01578751087722206, "grad_norm": 3.535932764701129, "kl": 0.1380615234375, "learning_rate": 9.99403068670717e-07, "loss": 0.0055164797231554985, "memory(GiB)": 66.07, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 381, "train_speed(iter/s)": 0.020965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/mean_length": 685.8333740234375, "completions/min_length": 569.0, "epoch": 0.015828947913645215, "grad_norm": 2.342697334228568, "kl": 0.128173828125, "learning_rate": 9.993978902872169e-07, "loss": 0.005117287393659353, "memory(GiB)": 66.07, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 382, "train_speed(iter/s)": 0.020977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/mean_length": 715.75, "completions/min_length": 579.0, "epoch": 0.015870384950068372, "grad_norm": 2.4875320874410867, "kl": 0.1009521484375, "learning_rate": 9.993926895528927e-07, "loss": 0.004030893556773663, "memory(GiB)": 66.07, "reward": 1.7083333730697632, "reward_std": 0.5418123602867126, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 383, "train_speed(iter/s)": 0.020988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/mean_length": 705.8333740234375, "completions/min_length": 542.0, "epoch": 0.015911821986491527, "grad_norm": 2.6633818805802867, "kl": 0.0989990234375, "learning_rate": 9.993874664679772e-07, "loss": 0.0039532482624053955, "memory(GiB)": 66.07, "reward": 1.875, "reward_std": 0.3107907772064209, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 384, "train_speed(iter/s)": 0.020995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/mean_length": 643.0, "completions/min_length": 473.0, "epoch": 0.01595325902291468, "grad_norm": 2.490445918728492, "kl": 0.1488037109375, "learning_rate": 9.993822210327043e-07, "loss": 0.005955298896878958, "memory(GiB)": 66.07, "reward": 1.9166667461395264, "reward_std": 0.19462473690509796, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 385, "train_speed(iter/s)": 0.021006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1102.0, "completions/mean_length": 776.0833740234375, "completions/min_length": 605.0, "epoch": 0.015994696059337836, "grad_norm": 0.13092962065226577, "kl": 0.09423828125, "learning_rate": 9.993769532473085e-07, "loss": 0.0037607955746352673, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 386, "train_speed(iter/s)": 0.021014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/mean_length": 616.3333740234375, "completions/min_length": 490.0, "epoch": 0.01603613309576099, "grad_norm": 2.614679834952277, "kl": 0.1226806640625, "learning_rate": 9.993716631120258e-07, "loss": 0.004908710718154907, "memory(GiB)": 66.07, "reward": 0.9166666865348816, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 387, "train_speed(iter/s)": 0.021029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 880.0, "completions/mean_length": 749.9166870117188, "completions/min_length": 636.0, "epoch": 0.016077570132184145, "grad_norm": 2.3488832079368884, "kl": 0.1031494140625, "learning_rate": 9.993663506270926e-07, "loss": 0.004124184604734182, "memory(GiB)": 66.07, "reward": 1.1666667461395264, "reward_std": 0.5773502588272095, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.6666666865348816, "rewards/FormatCorrectnessReward/std": 0.3892494738101959, "step": 388, "train_speed(iter/s)": 0.02104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 892.0, "completions/mean_length": 623.1666870117188, "completions/min_length": 452.0, "epoch": 0.016119007168607303, "grad_norm": 3.0504462258771534, "kl": 0.103515625, "learning_rate": 9.993610157927473e-07, "loss": 0.004145205020904541, "memory(GiB)": 66.07, "reward": 1.375, "reward_std": 0.4826536476612091, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.625, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 389, "train_speed(iter/s)": 0.021051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1166.5833740234375, "completions/min_length": 111.0, "epoch": 0.016160444205030457, "grad_norm": 4.971127068700265, "kl": 0.1253662109375, "learning_rate": 9.99355658609228e-07, "loss": -0.023034464567899704, "memory(GiB)": 66.07, "reward": 1.25, "reward_std": 0.753778338432312, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.45226702094078064, "step": 390, "train_speed(iter/s)": 0.020958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 781.0, "completions/mean_length": 629.1666870117188, "completions/min_length": 499.0, "epoch": 0.016201881241453612, "grad_norm": 2.8667620451055167, "kl": 0.123291015625, "learning_rate": 9.993502790767747e-07, "loss": 0.004919072147458792, "memory(GiB)": 66.07, "reward": 1.0416667461395264, "reward_std": 0.33427897095680237, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 391, "train_speed(iter/s)": 0.020971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 905.0, "completions/mean_length": 749.25, "completions/min_length": 561.0, "epoch": 0.016243318277876766, "grad_norm": 0.16329225849606355, "kl": 0.0931396484375, "learning_rate": 9.993448771956284e-07, "loss": 0.0037325944285839796, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 392, "train_speed(iter/s)": 0.020978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 894.0, "completions/mean_length": 721.5, "completions/min_length": 513.0, "epoch": 0.01628475531429992, "grad_norm": 2.49789373761381, "kl": 0.1007080078125, "learning_rate": 9.993394529660306e-07, "loss": 0.004010965581983328, "memory(GiB)": 66.07, "reward": 1.875, "reward_std": 0.3107907772064209, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 393, "train_speed(iter/s)": 0.020989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/mean_length": 731.8333740234375, "completions/min_length": 619.0, "epoch": 0.016326192350723075, "grad_norm": 2.8556680887955546, "kl": 0.0931396484375, "learning_rate": 9.993340063882242e-07, "loss": 0.0037191114388406277, "memory(GiB)": 66.07, "reward": 1.3333333730697632, "reward_std": 0.6154574751853943, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 394, "train_speed(iter/s)": 0.020998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/mean_length": 665.8333740234375, "completions/min_length": 353.0, "epoch": 0.01636762938714623, "grad_norm": 3.6473927835653175, "kl": 0.121337890625, "learning_rate": 9.993285374624529e-07, "loss": 0.004851480480283499, "memory(GiB)": 66.07, "reward": 1.8333333730697632, "reward_std": 0.32566946744918823, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3256694972515106, "step": 395, "train_speed(iter/s)": 0.02101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 812.0, "completions/mean_length": 598.1666870117188, "completions/min_length": 512.0, "epoch": 0.016409066423569388, "grad_norm": 2.496506115714827, "kl": 0.1300048828125, "learning_rate": 9.993230461889615e-07, "loss": 0.005207499023526907, "memory(GiB)": 66.07, "reward": 0.75, "reward_std": 0.33709993958473206, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.33709993958473206, "step": 396, "train_speed(iter/s)": 0.021024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 741.0, "completions/mean_length": 595.0833740234375, "completions/min_length": 493.0, "epoch": 0.016450503459992542, "grad_norm": 2.5053285855518657, "kl": 0.0830078125, "learning_rate": 9.993175325679957e-07, "loss": 0.003318717237561941, "memory(GiB)": 66.07, "reward": 1.8333333730697632, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 397, "train_speed(iter/s)": 0.021039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/mean_length": 707.6666870117188, "completions/min_length": 526.0, "epoch": 0.016491940496415697, "grad_norm": 2.420639966393638, "kl": 0.097412109375, "learning_rate": 9.993119965998022e-07, "loss": 0.003897647140547633, "memory(GiB)": 66.07, "reward": 0.7916666865348816, "reward_std": 0.33427897095680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.33427897095680237, "step": 398, "train_speed(iter/s)": 0.021049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1118.0, "completions/mean_length": 727.0, "completions/min_length": 546.0, "epoch": 0.01653337753283885, "grad_norm": 2.7155309308624753, "kl": 0.099609375, "learning_rate": 9.993064382846289e-07, "loss": 0.003977338783442974, "memory(GiB)": 66.07, "reward": 1.125, "reward_std": 0.5690901875495911, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 399, "train_speed(iter/s)": 0.021056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/mean_length": 704.0, "completions/min_length": 503.0, "epoch": 0.016574814569262006, "grad_norm": 2.583785801034086, "kl": 0.0933837890625, "learning_rate": 9.993008576227246e-07, "loss": 0.003733138320967555, "memory(GiB)": 66.07, "reward": 1.2083333730697632, "reward_std": 0.5418123602867126, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 400, "train_speed(iter/s)": 0.021065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/mean_length": 616.75, "completions/min_length": 495.0, "epoch": 0.01661625160568516, "grad_norm": 2.821197184997131, "kl": 0.1072998046875, "learning_rate": 9.992952546143389e-07, "loss": 0.004294236656278372, "memory(GiB)": 66.07, "reward": 1.625, "reward_std": 0.4330126941204071, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 401, "train_speed(iter/s)": 0.021077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 898.0, "completions/mean_length": 741.6666870117188, "completions/min_length": 545.0, "epoch": 0.016657688642108315, "grad_norm": 359.54855283392556, "kl": 5.7850341796875, "learning_rate": 9.992896292597228e-07, "loss": 0.2319704294204712, "memory(GiB)": 66.07, "reward": 0.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 402, "train_speed(iter/s)": 0.021088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 874.0, "completions/mean_length": 730.25, "completions/min_length": 516.0, "epoch": 0.016699125678531473, "grad_norm": 12.129186739847636, "kl": 0.134765625, "learning_rate": 9.992839815591279e-07, "loss": 0.005401179194450378, "memory(GiB)": 66.07, "reward": 1.7916667461395264, "reward_std": 0.3964807391166687, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 403, "train_speed(iter/s)": 0.021098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 972.0, "completions/mean_length": 737.75, "completions/min_length": 553.0, "epoch": 0.016740562714954627, "grad_norm": 2.728442384765451, "kl": 0.1041259765625, "learning_rate": 9.99278311512807e-07, "loss": 0.004164238926023245, "memory(GiB)": 66.07, "reward": 0.8333333730697632, "reward_std": 0.24618299305438995, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 404, "train_speed(iter/s)": 0.021107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/mean_length": 655.9166870117188, "completions/min_length": 518.0, "epoch": 0.016781999751377782, "grad_norm": 2.611062030836967, "kl": 0.1123046875, "learning_rate": 9.992726191210137e-07, "loss": 0.004504412412643433, "memory(GiB)": 66.07, "reward": 1.9166667461395264, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 405, "train_speed(iter/s)": 0.02112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/mean_length": 646.5, "completions/min_length": 536.0, "epoch": 0.016823436787800936, "grad_norm": 2.6967754749226147, "kl": 0.1099853515625, "learning_rate": 9.99266904384003e-07, "loss": 0.004389822483062744, "memory(GiB)": 66.07, "reward": 1.7916667461395264, "reward_std": 0.25746431946754456, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 406, "train_speed(iter/s)": 0.021134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1237.0, "completions/mean_length": 824.3333740234375, "completions/min_length": 654.0, "epoch": 0.01686487382422409, "grad_norm": 2.9518863597504503, "kl": 0.1060791015625, "learning_rate": 9.992611673020305e-07, "loss": 0.004244387149810791, "memory(GiB)": 66.07, "reward": 1.1666667461395264, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 407, "train_speed(iter/s)": 0.021139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 804.0, "completions/mean_length": 581.0833740234375, "completions/min_length": 437.0, "epoch": 0.016906310860647245, "grad_norm": 0.22884319702862171, "kl": 0.14111328125, "learning_rate": 9.992554078753533e-07, "loss": 0.005643174983561039, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 408, "train_speed(iter/s)": 0.021145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1288.75, "completions/min_length": 506.0, "epoch": 0.016947747897070403, "grad_norm": 2.348151225511257, "kl": 0.1085205078125, "learning_rate": 9.992496261042288e-07, "loss": -0.1635902374982834, "memory(GiB)": 66.07, "reward": 1.25, "reward_std": 0.6215815544128418, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 409, "train_speed(iter/s)": 0.021054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/mean_length": 656.8333740234375, "completions/min_length": 451.0, "epoch": 0.016989184933493558, "grad_norm": 6.335508840509681, "kl": 0.127685546875, "learning_rate": 9.99243821988916e-07, "loss": 0.005111913196742535, "memory(GiB)": 66.07, "reward": 1.8333333730697632, "reward_std": 0.5773502588272095, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 410, "train_speed(iter/s)": 0.021066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/mean_length": 636.5, "completions/min_length": 535.0, "epoch": 0.017030621969916712, "grad_norm": 2.5242946289903356, "kl": 0.1357421875, "learning_rate": 9.992379955296745e-07, "loss": 0.005439341068267822, "memory(GiB)": 66.07, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 411, "train_speed(iter/s)": 0.021078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/mean_length": 580.6666870117188, "completions/min_length": 480.0, "epoch": 0.017072059006339867, "grad_norm": 2.811902978510295, "kl": 0.15673828125, "learning_rate": 9.992321467267649e-07, "loss": 0.006268124096095562, "memory(GiB)": 66.07, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 412, "train_speed(iter/s)": 0.021091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/mean_length": 672.25, "completions/min_length": 450.0, "epoch": 0.01711349604276302, "grad_norm": 0.2580947254474904, "kl": 0.1156005859375, "learning_rate": 9.992262755804495e-07, "loss": 0.004632305353879929, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 413, "train_speed(iter/s)": 0.021101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/mean_length": 675.9166870117188, "completions/min_length": 572.0, "epoch": 0.017154933079186176, "grad_norm": 2.6006158400483725, "kl": 0.1077880859375, "learning_rate": 9.992203820909905e-07, "loss": 0.00431843614205718, "memory(GiB)": 66.07, "reward": 1.0833333730697632, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 414, "train_speed(iter/s)": 0.021112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/mean_length": 561.6666870117188, "completions/min_length": 454.0, "epoch": 0.01719637011560933, "grad_norm": 2.691007245386592, "kl": 0.137451171875, "learning_rate": 9.99214466258652e-07, "loss": 0.005484084598720074, "memory(GiB)": 66.07, "reward": 1.8333333730697632, "reward_std": 0.32566946744918823, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 415, "train_speed(iter/s)": 0.021125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/mean_length": 612.0833740234375, "completions/min_length": 474.0, "epoch": 0.017237807152032488, "grad_norm": 0.39078507984396177, "kl": 0.1240234375, "learning_rate": 9.992085280836986e-07, "loss": 0.0049666776321828365, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 416, "train_speed(iter/s)": 0.021134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 421.66668701171875, "completions/min_length": 364.0, "epoch": 0.017279244188455643, "grad_norm": 0.28372973104658483, "kl": 0.1455078125, "learning_rate": 9.992025675663965e-07, "loss": 0.005820628255605698, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 417, "train_speed(iter/s)": 0.021152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1196.0833740234375, "completions/min_length": 410.0, "epoch": 0.017320681224878797, "grad_norm": 2.481390530606022, "kl": 0.1148681640625, "learning_rate": 9.991965847070118e-07, "loss": -0.18115019798278809, "memory(GiB)": 66.07, "reward": 1.5, "reward_std": 0.6741998791694641, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 418, "train_speed(iter/s)": 0.021064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/mean_length": 649.8333740234375, "completions/min_length": 495.0, "epoch": 0.01736211826130195, "grad_norm": 2.9611866172107884, "kl": 0.1124267578125, "learning_rate": 9.991905795058126e-07, "loss": 0.004510641098022461, "memory(GiB)": 66.07, "reward": 1.4583333730697632, "reward_std": 0.5822500586509705, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 419, "train_speed(iter/s)": 0.021077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/mean_length": 638.8333740234375, "completions/min_length": 522.0, "epoch": 0.017403555297725106, "grad_norm": 4.923658646905657, "kl": 0.1468505859375, "learning_rate": 9.991845519630676e-07, "loss": 0.005868395324796438, "memory(GiB)": 66.07, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 420, "train_speed(iter/s)": 0.021088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/mean_length": 652.75, "completions/min_length": 422.0, "epoch": 0.01744499233414826, "grad_norm": 2.3504144322377325, "kl": 0.0997314453125, "learning_rate": 9.99178502079047e-07, "loss": 0.003987908363342285, "memory(GiB)": 66.07, "reward": 0.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 421, "train_speed(iter/s)": 0.021098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/mean_length": 565.4166870117188, "completions/min_length": 486.0, "epoch": 0.017486429370571415, "grad_norm": 2.55925163249021, "kl": 0.1405029296875, "learning_rate": 9.99172429854021e-07, "loss": 0.005611380096524954, "memory(GiB)": 66.07, "reward": 0.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 422, "train_speed(iter/s)": 0.021111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/mean_length": 608.0, "completions/min_length": 389.0, "epoch": 0.017527866406994573, "grad_norm": 2.6976740635641683, "kl": 0.1209716796875, "learning_rate": 9.991663352882613e-07, "loss": 0.004839897155761719, "memory(GiB)": 66.07, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 423, "train_speed(iter/s)": 0.021122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/mean_length": 630.5833740234375, "completions/min_length": 461.0, "epoch": 0.017569303443417728, "grad_norm": 0.21221635566554492, "kl": 0.130126953125, "learning_rate": 9.99160218382041e-07, "loss": 0.005206838250160217, "memory(GiB)": 66.07, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 424, "train_speed(iter/s)": 0.021132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/mean_length": 682.4166870117188, "completions/min_length": 537.0, "epoch": 0.017610740479840882, "grad_norm": 2.64644004145584, "kl": 0.10888671875, "learning_rate": 9.991540791356342e-07, "loss": 0.004370838403701782, "memory(GiB)": 66.07, "reward": 1.5833333730697632, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 425, "train_speed(iter/s)": 0.021143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/mean_length": 592.9166870117188, "completions/min_length": 335.0, "epoch": 0.017652177516264037, "grad_norm": 0.1888531038471669, "kl": 0.1273193359375, "learning_rate": 9.991479175493148e-07, "loss": 0.005094898398965597, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 426, "train_speed(iter/s)": 0.021156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/mean_length": 547.4166870117188, "completions/min_length": 376.0, "epoch": 0.01769361455268719, "grad_norm": 3.802881862076727, "kl": 0.131103515625, "learning_rate": 9.991417336233593e-07, "loss": 0.005255669355392456, "memory(GiB)": 66.07, "reward": 1.5833333730697632, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 427, "train_speed(iter/s)": 0.02117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/mean_length": 649.4166870117188, "completions/min_length": 456.0, "epoch": 0.017735051589110346, "grad_norm": 10.561724589670028, "kl": 0.185791015625, "learning_rate": 9.99135527358044e-07, "loss": 0.007443592883646488, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 428, "train_speed(iter/s)": 0.021182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.0, "completions/mean_length": 643.6666870117188, "completions/min_length": 516.0, "epoch": 0.017776488625533503, "grad_norm": 0.1508374835029139, "kl": 0.1123046875, "learning_rate": 9.991292987536468e-07, "loss": 0.004501977004110813, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 429, "train_speed(iter/s)": 0.021194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/mean_length": 637.5833740234375, "completions/min_length": 497.0, "epoch": 0.017817925661956658, "grad_norm": 2.8112017376600167, "kl": 0.1016845703125, "learning_rate": 9.991230478104464e-07, "loss": 0.00406983494758606, "memory(GiB)": 66.07, "reward": 0.8333333730697632, "reward_std": 0.24618299305438995, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 430, "train_speed(iter/s)": 0.021205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/mean_length": 611.8333740234375, "completions/min_length": 436.0, "epoch": 0.017859362698379812, "grad_norm": 124907.64178325814, "kl": 900.087646484375, "learning_rate": 9.991167745287228e-07, "loss": 36.04395294189453, "memory(GiB)": 66.07, "reward": 1.3333333730697632, "reward_std": 0.4923659861087799, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 431, "train_speed(iter/s)": 0.021215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/mean_length": 623.3333740234375, "completions/min_length": 408.0, "epoch": 0.017900799734802967, "grad_norm": 3.0480099537376577, "kl": 0.1085205078125, "learning_rate": 9.991104789087569e-07, "loss": 0.004341244697570801, "memory(GiB)": 66.07, "reward": 1.5833333730697632, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 432, "train_speed(iter/s)": 0.021214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/mean_length": 612.3333740234375, "completions/min_length": 508.0, "epoch": 0.01794223677122612, "grad_norm": 0.1718639666195516, "kl": 0.1123046875, "learning_rate": 9.991041609508298e-07, "loss": 0.00448406208306551, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 433, "train_speed(iter/s)": 0.021226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/mean_length": 660.5833740234375, "completions/min_length": 535.0, "epoch": 0.017983673807649276, "grad_norm": 0.21654813976347337, "kl": 0.1009521484375, "learning_rate": 9.99097820655225e-07, "loss": 0.004044694826006889, "memory(GiB)": 66.07, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 434, "train_speed(iter/s)": 0.021236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/mean_length": 694.0, "completions/min_length": 579.0, "epoch": 0.01802511084407243, "grad_norm": 0.8204832952620631, "kl": 0.0936279296875, "learning_rate": 9.990914580222255e-07, "loss": 0.0037442073225975037, "memory(GiB)": 66.07, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 435, "train_speed(iter/s)": 0.021246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1279.0, "completions/mean_length": 698.75, "completions/min_length": 486.0, "epoch": 0.01806654788049559, "grad_norm": 0.25521816371814987, "kl": 0.112548828125, "learning_rate": 9.99085073052117e-07, "loss": 0.004491235129535198, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 436, "train_speed(iter/s)": 0.021249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1214.0, "completions/mean_length": 741.4166870117188, "completions/min_length": 447.0, "epoch": 0.018107984916918743, "grad_norm": 15.638818791228443, "kl": 0.1729736328125, "learning_rate": 9.990786657451843e-07, "loss": 0.006932884454727173, "memory(GiB)": 66.07, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 437, "train_speed(iter/s)": 0.021253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/mean_length": 583.9166870117188, "completions/min_length": 471.0, "epoch": 0.018149421953341897, "grad_norm": 0.1969111366773464, "kl": 0.112060546875, "learning_rate": 9.990722361017149e-07, "loss": 0.004481704439967871, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 438, "train_speed(iter/s)": 0.021266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/mean_length": 672.5, "completions/min_length": 578.0, "epoch": 0.018190858989765052, "grad_norm": 26.747294896431818, "kl": 0.2039794921875, "learning_rate": 9.990657841219961e-07, "loss": 0.00817113183438778, "memory(GiB)": 66.07, "reward": 1.5833333730697632, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 439, "train_speed(iter/s)": 0.021276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 833.0, "completions/mean_length": 679.5833740234375, "completions/min_length": 483.0, "epoch": 0.018232296026188206, "grad_norm": 2.5951060515423356, "kl": 0.094970703125, "learning_rate": 9.99059309806317e-07, "loss": 0.003785173175856471, "memory(GiB)": 66.07, "reward": 1.6666667461395264, "reward_std": 0.4923659861087799, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 440, "train_speed(iter/s)": 0.021286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 792.0, "completions/mean_length": 579.5833740234375, "completions/min_length": 421.0, "epoch": 0.01827373306261136, "grad_norm": 7.487381990924135, "kl": 0.33251953125, "learning_rate": 9.990528131549671e-07, "loss": 0.013341426849365234, "memory(GiB)": 66.07, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 441, "train_speed(iter/s)": 0.021297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 983.0, "completions/mean_length": 688.3333740234375, "completions/min_length": 495.0, "epoch": 0.018315170099034515, "grad_norm": 96.64355246253638, "kl": 0.417236328125, "learning_rate": 9.990462941682374e-07, "loss": 0.016720285639166832, "memory(GiB)": 66.07, "reward": 0.8333333730697632, "reward_std": 0.24618299305438995, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 442, "train_speed(iter/s)": 0.021306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 957.0, "completions/mean_length": 724.0833740234375, "completions/min_length": 515.0, "epoch": 0.018356607135457673, "grad_norm": 2.7741030648402325, "kl": 0.0931396484375, "learning_rate": 9.990397528464194e-07, "loss": 0.003719508647918701, "memory(GiB)": 66.07, "reward": 1.75, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 443, "train_speed(iter/s)": 0.021314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1273.0, "completions/mean_length": 971.5833740234375, "completions/min_length": 610.0, "epoch": 0.018398044171880828, "grad_norm": 2.0406901069443766, "kl": 0.06585693359375, "learning_rate": 9.990331891898058e-07, "loss": 0.0026318232994526625, "memory(GiB)": 66.07, "reward": 1.5, "reward_std": 0.6396021246910095, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 444, "train_speed(iter/s)": 0.021317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.0, "completions/mean_length": 593.8333740234375, "completions/min_length": 383.0, "epoch": 0.018439481208303982, "grad_norm": 3.8834687968676973, "kl": 0.1202392578125, "learning_rate": 9.990266031986908e-07, "loss": 0.004819005727767944, "memory(GiB)": 66.07, "reward": 0.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 445, "train_speed(iter/s)": 0.021325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 877.0, "completions/mean_length": 642.9166870117188, "completions/min_length": 437.0, "epoch": 0.018480918244727137, "grad_norm": 2.9936824425340847, "kl": 0.091552734375, "learning_rate": 9.990199948733689e-07, "loss": 0.0036612204276025295, "memory(GiB)": 66.07, "reward": 1.1666667461395264, "reward_std": 0.5365433692932129, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 446, "train_speed(iter/s)": 0.021294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 508.75, "completions/min_length": 408.0, "epoch": 0.01852235528115029, "grad_norm": 3.480422158454513, "kl": 0.119873046875, "learning_rate": 9.990133642141357e-07, "loss": 0.004789099097251892, "memory(GiB)": 66.07, "reward": 1.875, "reward_std": 0.22613351047039032, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 447, "train_speed(iter/s)": 0.021303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 880.0, "completions/mean_length": 657.0, "completions/min_length": 499.0, "epoch": 0.018563792317573446, "grad_norm": 2.6517816884297996, "kl": 0.0802001953125, "learning_rate": 9.990067112212884e-07, "loss": 0.0032153825741261244, "memory(GiB)": 66.07, "reward": 1.6666667461395264, "reward_std": 0.4923659861087799, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 448, "train_speed(iter/s)": 0.021311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 8001.0, "completions/mean_length": 1861.75, "completions/min_length": 481.0, "epoch": 0.018605229353996604, "grad_norm": 2.882345747390627, "kl": 0.13232421875, "learning_rate": 9.99000035895124e-07, "loss": -0.341072678565979, "memory(GiB)": 66.07, "reward": 1.5833333730697632, "reward_std": 0.7637625932693481, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.39886200428009033, "step": 449, "train_speed(iter/s)": 0.02122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1120.0, "completions/mean_length": 779.0, "completions/min_length": 576.0, "epoch": 0.018646666390419758, "grad_norm": 2.4626907539777747, "kl": 0.0970458984375, "learning_rate": 9.989933382359422e-07, "loss": 0.0038856863975524902, "memory(GiB)": 66.07, "reward": 1.625, "reward_std": 0.376889169216156, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 450, "train_speed(iter/s)": 0.021225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/mean_length": 581.6666870117188, "completions/min_length": 498.0, "epoch": 0.018688103426842913, "grad_norm": 2.6147958910792535, "kl": 0.1119384765625, "learning_rate": 9.98986618244042e-07, "loss": 0.004471282474696636, "memory(GiB)": 66.07, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 451, "train_speed(iter/s)": 0.021238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/mean_length": 649.5833740234375, "completions/min_length": 454.0, "epoch": 0.018729540463266067, "grad_norm": 2.3719899731826932, "kl": 0.088134765625, "learning_rate": 9.989798759197245e-07, "loss": 0.003533328650519252, "memory(GiB)": 66.07, "reward": 0.875, "reward_std": 0.22613351047039032, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 452, "train_speed(iter/s)": 0.021248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/mean_length": 601.5, "completions/min_length": 462.0, "epoch": 0.018770977499689222, "grad_norm": 3.230909200799686, "kl": 0.1043701171875, "learning_rate": 9.989731112632916e-07, "loss": 0.0041681332513689995, "memory(GiB)": 66.07, "reward": 1.875, "reward_std": 0.22613351047039032, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 453, "train_speed(iter/s)": 0.021259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/mean_length": 645.1666870117188, "completions/min_length": 433.0, "epoch": 0.018812414536112376, "grad_norm": 2.574201505421981, "kl": 0.097412109375, "learning_rate": 9.989663242750457e-07, "loss": 0.0039041440468281507, "memory(GiB)": 66.07, "reward": 1.7083333730697632, "reward_std": 0.45016831159591675, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 454, "train_speed(iter/s)": 0.021269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/mean_length": 660.9166870117188, "completions/min_length": 461.0, "epoch": 0.01885385157253553, "grad_norm": 2.7562480057517713, "kl": 0.1182861328125, "learning_rate": 9.989595149552907e-07, "loss": 0.004742562770843506, "memory(GiB)": 66.07, "reward": 1.5833333730697632, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 455, "train_speed(iter/s)": 0.02128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1195.0, "completions/mean_length": 685.25, "completions/min_length": 516.0, "epoch": 0.01889528860895869, "grad_norm": 3.2172625566407844, "kl": 0.1109619140625, "learning_rate": 9.989526833043316e-07, "loss": 0.0044273934327065945, "memory(GiB)": 66.07, "reward": 1.0833333730697632, "reward_std": 0.46871843934059143, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 456, "train_speed(iter/s)": 0.021283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 926.0, "completions/mean_length": 714.0833740234375, "completions/min_length": 559.0, "epoch": 0.018936725645381843, "grad_norm": 2.6626338494866184, "kl": 0.0882568359375, "learning_rate": 9.989458293224737e-07, "loss": 0.003525992389768362, "memory(GiB)": 66.07, "reward": 1.625, "reward_std": 0.4330126941204071, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 457, "train_speed(iter/s)": 0.021292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/mean_length": 608.1666870117188, "completions/min_length": 445.0, "epoch": 0.018978162681804998, "grad_norm": 3.714117492966326, "kl": 0.127197265625, "learning_rate": 9.98938953010024e-07, "loss": 0.005097578279674053, "memory(GiB)": 66.07, "reward": 1.75, "reward_std": 0.39886200428009033, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 458, "train_speed(iter/s)": 0.021305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/mean_length": 497.91668701171875, "completions/min_length": 373.0, "epoch": 0.019019599718228152, "grad_norm": 3.212513056297756, "kl": 0.1239013671875, "learning_rate": 9.989320543672903e-07, "loss": 0.004950881004333496, "memory(GiB)": 66.07, "reward": 1.2083333730697632, "reward_std": 0.33427897095680237, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 459, "train_speed(iter/s)": 0.021317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1341.0, "completions/mean_length": 683.3333740234375, "completions/min_length": 373.0, "epoch": 0.019061036754651307, "grad_norm": 4.927760341425751, "kl": 0.09765625, "learning_rate": 9.98925133394581e-07, "loss": 0.003913789987564087, "memory(GiB)": 66.07, "reward": 1.8333333730697632, "reward_std": 0.32566946744918823, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 460, "train_speed(iter/s)": 0.02132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/mean_length": 608.8333740234375, "completions/min_length": 441.0, "epoch": 0.01910247379107446, "grad_norm": 2.540866213604757, "kl": 0.114990234375, "learning_rate": 9.989181900922065e-07, "loss": 0.00460072373971343, "memory(GiB)": 66.07, "reward": 1.7916667461395264, "reward_std": 0.3964807391166687, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 461, "train_speed(iter/s)": 0.021331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/mean_length": 614.4166870117188, "completions/min_length": 462.0, "epoch": 0.019143910827497616, "grad_norm": 2.792386867474769, "kl": 0.134765625, "learning_rate": 9.989112244604771e-07, "loss": 0.005392571445554495, "memory(GiB)": 66.07, "reward": 1.7916667461395264, "reward_std": 0.3964807391166687, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 462, "train_speed(iter/s)": 0.021343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/mean_length": 612.9166870117188, "completions/min_length": 430.0, "epoch": 0.019185347863920774, "grad_norm": 66.83638347959229, "kl": 0.216064453125, "learning_rate": 9.989042364997047e-07, "loss": 0.008646870963275433, "memory(GiB)": 66.07, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 463, "train_speed(iter/s)": 0.021353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 863.0, "completions/mean_length": 604.5833740234375, "completions/min_length": 407.0, "epoch": 0.019226784900343928, "grad_norm": 2.3281898679990327, "kl": 0.115966796875, "learning_rate": 9.988972262102018e-07, "loss": 0.004634251352399588, "memory(GiB)": 66.07, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 464, "train_speed(iter/s)": 0.021363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/mean_length": 668.1666870117188, "completions/min_length": 540.0, "epoch": 0.019268221936767083, "grad_norm": 3.7875754202154766, "kl": 0.1494140625, "learning_rate": 9.988901935922825e-07, "loss": 0.0059813461266458035, "memory(GiB)": 66.07, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 465, "train_speed(iter/s)": 0.021372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/mean_length": 571.9166870117188, "completions/min_length": 420.0, "epoch": 0.019309658973190237, "grad_norm": 2.499667503067823, "kl": 0.103759765625, "learning_rate": 9.988831386462613e-07, "loss": 0.004143367521464825, "memory(GiB)": 66.07, "reward": 1.125, "reward_std": 0.3107907772064209, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 466, "train_speed(iter/s)": 0.021379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1273.166748046875, "completions/min_length": 438.0, "epoch": 0.01935109600961339, "grad_norm": 1.985765144611581, "kl": 0.1090087890625, "learning_rate": 9.988760613724541e-07, "loss": -0.16873125731945038, "memory(GiB)": 66.07, "reward": 1.5833333730697632, "reward_std": 0.7637625932693481, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.39886200428009033, "step": 467, "train_speed(iter/s)": 0.021298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/mean_length": 552.1666870117188, "completions/min_length": 405.0, "epoch": 0.019392533046036546, "grad_norm": 2.9815017799499097, "kl": 0.114990234375, "learning_rate": 9.988689617711776e-07, "loss": 0.004591668955981731, "memory(GiB)": 66.07, "reward": 0.9166666865348816, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 468, "train_speed(iter/s)": 0.021308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 987.0, "completions/mean_length": 657.75, "completions/min_length": 384.0, "epoch": 0.019433970082459704, "grad_norm": 2.038667923956404, "kl": 0.103271484375, "learning_rate": 9.988618398427493e-07, "loss": 0.004130254499614239, "memory(GiB)": 66.07, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 469, "train_speed(iter/s)": 0.021315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 493.5833435058594, "completions/min_length": 329.0, "epoch": 0.01947540711888286, "grad_norm": 3.4543208299047325, "kl": 0.15673828125, "learning_rate": 9.988546955874885e-07, "loss": 0.006273667328059673, "memory(GiB)": 66.07, "reward": 1.25, "reward_std": 0.5, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 0.5833333134651184, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 470, "train_speed(iter/s)": 0.021328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1192.0, "completions/mean_length": 601.4166870117188, "completions/min_length": 386.0, "epoch": 0.019516844155306013, "grad_norm": 3.109444462310898, "kl": 0.1201171875, "learning_rate": 9.988475290057143e-07, "loss": 0.004796703811734915, "memory(GiB)": 66.07, "reward": 1.125, "reward_std": 0.4330126941204071, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 471, "train_speed(iter/s)": 0.021321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/mean_length": 600.9166870117188, "completions/min_length": 471.0, "epoch": 0.019558281191729168, "grad_norm": 2.711739190737496, "kl": 0.117919921875, "learning_rate": 9.988403400977481e-07, "loss": 0.004704773426055908, "memory(GiB)": 66.07, "reward": 1.8333333730697632, "reward_std": 0.24618299305438995, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 472, "train_speed(iter/s)": 0.021332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/mean_length": 688.3333740234375, "completions/min_length": 458.0, "epoch": 0.019599718228152322, "grad_norm": 0.1871252228911012, "kl": 0.112548828125, "learning_rate": 9.988331288639111e-07, "loss": 0.004499874077737331, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 473, "train_speed(iter/s)": 0.02134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1178.166748046875, "completions/min_length": 491.0, "epoch": 0.019641155264575477, "grad_norm": 1.6777157768753703, "kl": 0.179443359375, "learning_rate": 9.988258953045262e-07, "loss": -0.2379547357559204, "memory(GiB)": 66.07, "reward": 1.7083333730697632, "reward_std": 0.5822500586509705, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.33427897095680237, "step": 474, "train_speed(iter/s)": 0.021261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 877.0, "completions/mean_length": 701.0833740234375, "completions/min_length": 454.0, "epoch": 0.01968259230099863, "grad_norm": 2.296947687581704, "kl": 0.11572265625, "learning_rate": 9.988186394199175e-07, "loss": 0.0046246349811553955, "memory(GiB)": 66.07, "reward": 1.7083333730697632, "reward_std": 0.49810245633125305, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 475, "train_speed(iter/s)": 0.02127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/mean_length": 603.25, "completions/min_length": 363.0, "epoch": 0.01972402933742179, "grad_norm": 2.6301278909256824, "kl": 0.12646484375, "learning_rate": 9.98811361210409e-07, "loss": 0.005068530794233084, "memory(GiB)": 66.07, "reward": 0.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 476, "train_speed(iter/s)": 0.021281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1293.416748046875, "completions/min_length": 497.0, "epoch": 0.019765466373844943, "grad_norm": 2.181645000160249, "kl": 0.1158447265625, "learning_rate": 9.988040606763272e-07, "loss": -0.1397336721420288, "memory(GiB)": 66.07, "reward": 1.25, "reward_std": 0.7229987978935242, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.32566946744918823, "step": 477, "train_speed(iter/s)": 0.021202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/mean_length": 549.4166870117188, "completions/min_length": 378.0, "epoch": 0.019806903410268098, "grad_norm": 4.269782129806044, "kl": 0.189697265625, "learning_rate": 9.987967378179983e-07, "loss": 0.007597257848829031, "memory(GiB)": 66.07, "reward": 1.875, "reward_std": 0.22613351047039032, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 478, "train_speed(iter/s)": 0.021214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/mean_length": 500.3333435058594, "completions/min_length": 394.0, "epoch": 0.019848340446691252, "grad_norm": 3.798459648859271, "kl": 0.171142578125, "learning_rate": 9.987893926357505e-07, "loss": 0.006864488124847412, "memory(GiB)": 66.07, "reward": 1.875, "reward_std": 0.4330126941204071, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 479, "train_speed(iter/s)": 0.021223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1224.666748046875, "completions/min_length": 438.0, "epoch": 0.019889777483114407, "grad_norm": 1.396971043829234, "kl": 0.134033203125, "learning_rate": 9.98782025129912e-07, "loss": -0.25964128971099854, "memory(GiB)": 66.07, "reward": 0.9166666865348816, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 480, "train_speed(iter/s)": 0.021145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1237.25, "completions/min_length": 410.0, "epoch": 0.01993121451953756, "grad_norm": 2.3921150029395384, "kl": 0.159423828125, "learning_rate": 9.98774635300813e-07, "loss": -0.14242374897003174, "memory(GiB)": 66.07, "reward": 0.9166666865348816, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.38924944400787354, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.33709993958473206, "step": 481, "train_speed(iter/s)": 0.021067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/mean_length": 707.8333740234375, "completions/min_length": 432.0, "epoch": 0.01997265155596072, "grad_norm": 0.18339308952264588, "kl": 0.1375732421875, "learning_rate": 9.98767223148784e-07, "loss": 0.005496257916092873, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 482, "train_speed(iter/s)": 0.021074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1244.916748046875, "completions/min_length": 456.0, "epoch": 0.020014088592383874, "grad_norm": 2.1935265095368117, "kl": 0.1474609375, "learning_rate": 9.987597886741568e-07, "loss": -0.18045774102210999, "memory(GiB)": 66.07, "reward": 1.2083333730697632, "reward_std": 0.5418123602867126, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.625, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 483, "train_speed(iter/s)": 0.020999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3568.0, "completions/mean_length": 931.0, "completions/min_length": 519.0, "epoch": 0.02005552562880703, "grad_norm": 2.143996811887609, "kl": 0.1875, "learning_rate": 9.987523318772642e-07, "loss": 0.007491772528737783, "memory(GiB)": 66.07, "reward": 1.7916667461395264, "reward_std": 0.3964807391166687, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.3964807391166687, "step": 484, "train_speed(iter/s)": 0.020978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/mean_length": 684.3333740234375, "completions/min_length": 593.0, "epoch": 0.020096962665230183, "grad_norm": 2.429581119532514, "kl": 0.177978515625, "learning_rate": 9.987448527584398e-07, "loss": 0.007117033004760742, "memory(GiB)": 66.07, "reward": 1.75, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 485, "train_speed(iter/s)": 0.020987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 8001.0, "completions/mean_length": 1910.5833740234375, "completions/min_length": 493.0, "epoch": 0.020138399701653337, "grad_norm": 2.931248513608202, "kl": 0.200927734375, "learning_rate": 9.987373513180184e-07, "loss": -0.31856080889701843, "memory(GiB)": 66.07, "reward": 1.5, "reward_std": 0.7687060832977295, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.39886200428009033, "step": 486, "train_speed(iter/s)": 0.020899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 943.0, "completions/mean_length": 602.0833740234375, "completions/min_length": 442.0, "epoch": 0.020179836738076492, "grad_norm": 2.4794971150949667, "kl": 0.200927734375, "learning_rate": 9.987298275563359e-07, "loss": 0.008015553466975689, "memory(GiB)": 66.07, "reward": 1.5833333730697632, "reward_std": 0.5573204159736633, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.6666666865348816, "rewards/FormatCorrectnessReward/std": 0.3256694972515106, "step": 487, "train_speed(iter/s)": 0.020908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/mean_length": 686.75, "completions/min_length": 556.0, "epoch": 0.020221273774499646, "grad_norm": 2.6029263282405317, "kl": 0.186279296875, "learning_rate": 9.987222814737287e-07, "loss": 0.007461468689143658, "memory(GiB)": 66.07, "reward": 1.4166667461395264, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 488, "train_speed(iter/s)": 0.020916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/mean_length": 534.8333740234375, "completions/min_length": 428.0, "epoch": 0.020262710810922804, "grad_norm": 0.30379375507242645, "kl": 0.253662109375, "learning_rate": 9.987147130705347e-07, "loss": 0.010115528479218483, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 489, "train_speed(iter/s)": 0.02093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/mean_length": 630.8333740234375, "completions/min_length": 446.0, "epoch": 0.02030414784734596, "grad_norm": 2.6568172541234745, "kl": 0.1982421875, "learning_rate": 9.987071223470926e-07, "loss": 0.007932206615805626, "memory(GiB)": 66.07, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 490, "train_speed(iter/s)": 0.02094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/mean_length": 676.5833740234375, "completions/min_length": 462.0, "epoch": 0.020345584883769113, "grad_norm": 2.7551438770898806, "kl": 0.23974609375, "learning_rate": 9.98699509303742e-07, "loss": 0.009584784507751465, "memory(GiB)": 66.07, "reward": 1.7916667461395264, "reward_std": 0.3964807391166687, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 491, "train_speed(iter/s)": 0.020948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1298.0833740234375, "completions/min_length": 467.0, "epoch": 0.020387021920192268, "grad_norm": 2.10024811338842, "kl": 0.18310546875, "learning_rate": 9.98691873940824e-07, "loss": -0.16690443456172943, "memory(GiB)": 66.07, "reward": 1.2916667461395264, "reward_std": 0.6200562119483948, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 492, "train_speed(iter/s)": 0.020875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/mean_length": 726.5833740234375, "completions/min_length": 500.0, "epoch": 0.020428458956615422, "grad_norm": 0.25521329040014296, "kl": 0.199951171875, "learning_rate": 9.9868421625868e-07, "loss": 0.007980739697813988, "memory(GiB)": 66.07, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 493, "train_speed(iter/s)": 0.020883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1186.0, "completions/min_length": 396.0, "epoch": 0.020469895993038577, "grad_norm": 2.1047171400266085, "kl": 0.239990234375, "learning_rate": 9.986765362576529e-07, "loss": -0.20446878671646118, "memory(GiB)": 66.07, "reward": 0.8333333730697632, "reward_std": 0.32566946744918823, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3256694972515106, "step": 494, "train_speed(iter/s)": 0.020812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 8001.0, "completions/mean_length": 2498.75, "completions/min_length": 557.0, "epoch": 0.02051133302946173, "grad_norm": 2.2441973511601656, "kl": 0.21923828125, "learning_rate": 9.98668833938086e-07, "loss": -0.40811070799827576, "memory(GiB)": 66.07, "reward": 1.5, "reward_std": 0.9045340418815613, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.45226702094078064, "step": 495, "train_speed(iter/s)": 0.020731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1205.25, "completions/min_length": 449.0, "epoch": 0.02055277006588489, "grad_norm": 2.22951355171015, "kl": 0.239013671875, "learning_rate": 9.986611093003248e-07, "loss": -0.24740812182426453, "memory(GiB)": 66.07, "reward": 1.7916667461395264, "reward_std": 0.5822500586509705, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 496, "train_speed(iter/s)": 0.020649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 8001.0, "completions/mean_length": 1882.8333740234375, "completions/min_length": 215.0, "epoch": 0.020594207102308044, "grad_norm": 3.9187003861038208, "kl": 0.25830078125, "learning_rate": 9.986533623447143e-07, "loss": -0.24155810475349426, "memory(GiB)": 66.07, "reward": 1.3333333730697632, "reward_std": 0.8876253962516785, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.45226702094078064, "step": 497, "train_speed(iter/s)": 0.020574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/mean_length": 606.5833740234375, "completions/min_length": 553.0, "epoch": 0.0206356441387312, "grad_norm": 2.9287018935063696, "kl": 0.271484375, "learning_rate": 9.986455930716016e-07, "loss": 0.01086790394037962, "memory(GiB)": 66.07, "reward": 0.7916666865348816, "reward_std": 0.25746431946754456, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 498, "train_speed(iter/s)": 0.020585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 8001.0, "completions/mean_length": 2888.416748046875, "completions/min_length": 513.0, "epoch": 0.020677081175154353, "grad_norm": 2.823800024251497, "kl": 0.24072265625, "learning_rate": 9.986378014813344e-07, "loss": -0.2669476270675659, "memory(GiB)": 66.07, "reward": 0.7916666865348816, "reward_std": 0.7216877937316895, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.625, "rewards/FormatCorrectnessReward/std": 0.4826536774635315, "step": 499, "train_speed(iter/s)": 0.020508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 8001.0, "completions/mean_length": 2557.25, "completions/min_length": 421.0, "epoch": 0.020718518211577507, "grad_norm": 2.556316742876612, "kl": 0.2578125, "learning_rate": 9.986299875742611e-07, "loss": -0.3306913375854492, "memory(GiB)": 66.07, "reward": 1.3333333730697632, "reward_std": 0.9847319722175598, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 0.6666666865348816, "rewards/FormatCorrectnessReward/std": 0.4923659861087799, "step": 500, "train_speed(iter/s)": 0.020426 }, { "epoch": 0.020718518211577507, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.08230452674897117, "eval_completions/max_length": 5485.349794238683, "eval_completions/mean_length": 1254.5024406527295, "eval_completions/min_length": 448.724279835391, "eval_kl": 1.4197088798868314, "eval_loss": -0.10994589328765869, "eval_reward": 1.388374519814189, "eval_reward_std": 0.5218771062024827, "eval_rewards/AnswerAccuracyReward/mean": 0.5943072722465904, "eval_rewards/AnswerAccuracyReward/std": 0.2796342079776795, "eval_rewards/FormatCorrectnessReward/mean": 0.7940672146685329, "eval_rewards/FormatCorrectnessReward/std": 0.31152888917873917, "eval_runtime": 19390.3617, "eval_samples_per_second": 0.013, "eval_steps_per_second": 0.001, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 8001.0, "completions/mean_length": 2001.5833740234375, "completions/min_length": 583.0, "epoch": 0.020759955248000662, "grad_norm": 2.062101339490208, "kl": 0.2181396484375, "learning_rate": 9.986221513507318e-07, "loss": -0.28070589900016785, "memory(GiB)": 77.29, "reward": 1.25, "reward_std": 0.7229987978935242, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.39886200428009033, "step": 501, "train_speed(iter/s)": 0.011365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 3061.08349609375, "completions/min_length": 420.0, "epoch": 0.02080139228442382, "grad_norm": 2.816142038757833, "kl": 0.28662109375, "learning_rate": 9.986142928110972e-07, "loss": -0.3440731465816498, "memory(GiB)": 77.29, "reward": 0.875, "reward_std": 0.829156219959259, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.5416666865348816, "rewards/FormatCorrectnessReward/std": 0.45016834139823914, "step": 502, "train_speed(iter/s)": 0.011349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 7999.0, "completions/mean_length": 1128.5833740234375, "completions/min_length": 326.0, "epoch": 0.020842829320846974, "grad_norm": 2.9254460116659495, "kl": 0.35205078125, "learning_rate": 9.986064119557087e-07, "loss": -0.14555124938488007, "memory(GiB)": 77.29, "reward": 1.25, "reward_std": 0.6571287512779236, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 0.5833333134651184, "rewards/FormatCorrectnessReward/std": 0.358870267868042, "step": 503, "train_speed(iter/s)": 0.011339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/mean_length": 645.3333740234375, "completions/min_length": 565.0, "epoch": 0.02088426635727013, "grad_norm": 2.6877914146951745, "kl": 0.275146484375, "learning_rate": 9.985985087849191e-07, "loss": 0.010979801416397095, "memory(GiB)": 77.29, "reward": 1.0833333730697632, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 504, "train_speed(iter/s)": 0.011352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/mean_length": 406.91668701171875, "completions/min_length": 263.0, "epoch": 0.020925703393693283, "grad_norm": 4.13347840852966, "kl": 0.36181640625, "learning_rate": 9.985905832990824e-07, "loss": 0.014454404823482037, "memory(GiB)": 77.29, "reward": 0.875, "reward_std": 0.6784005165100098, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.5416666865348816, "rewards/FormatCorrectnessReward/std": 0.3964807391166687, "step": 505, "train_speed(iter/s)": 0.011366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 916.0, "completions/mean_length": 680.75, "completions/min_length": 406.0, "epoch": 0.020967140430116438, "grad_norm": 2.911658850070964, "kl": 0.30419921875, "learning_rate": 9.985826354985529e-07, "loss": 0.012123937718570232, "memory(GiB)": 77.29, "reward": 1.0833333730697632, "reward_std": 0.7017294764518738, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.39886200428009033, "step": 506, "train_speed(iter/s)": 0.011379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/mean_length": 592.3333740234375, "completions/min_length": 426.0, "epoch": 0.021008577466539592, "grad_norm": 3.23214481925066, "kl": 0.359375, "learning_rate": 9.985746653836866e-07, "loss": 0.014403795823454857, "memory(GiB)": 77.29, "reward": 0.7916666865348816, "reward_std": 0.25746431946754456, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 507, "train_speed(iter/s)": 0.011392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 8001.0, "completions/mean_length": 1770.3333740234375, "completions/min_length": 407.0, "epoch": 0.021050014502962747, "grad_norm": 2.4350037842289676, "kl": 0.37890625, "learning_rate": 9.985666729548404e-07, "loss": -0.2721875309944153, "memory(GiB)": 77.29, "reward": 1.5833333730697632, "reward_std": 0.6336522102355957, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.39886200428009033, "step": 508, "train_speed(iter/s)": 0.011378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/mean_length": 705.1666870117188, "completions/min_length": 447.0, "epoch": 0.021091451539385905, "grad_norm": 2.9591282017472262, "kl": 0.3173828125, "learning_rate": 9.985586582123713e-07, "loss": 0.012693166732788086, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.24618299305438995, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 509, "train_speed(iter/s)": 0.01139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/mean_length": 600.3333740234375, "completions/min_length": 368.0, "epoch": 0.02113288857580906, "grad_norm": 2.9868817252599174, "kl": 0.38427734375, "learning_rate": 9.985506211566386e-07, "loss": 0.015357911586761475, "memory(GiB)": 77.29, "reward": 0.9166666865348816, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 510, "train_speed(iter/s)": 0.011403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/mean_length": 524.8333740234375, "completions/min_length": 232.0, "epoch": 0.021174325612232214, "grad_norm": 4.460686470846206, "kl": 0.3857421875, "learning_rate": 9.98542561788002e-07, "loss": 0.015414953231811523, "memory(GiB)": 77.29, "reward": 1.75, "reward_std": 0.5838742256164551, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3256694972515106, "step": 511, "train_speed(iter/s)": 0.011412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/mean_length": 581.4166870117188, "completions/min_length": 418.0, "epoch": 0.021215762648655368, "grad_norm": 3.1862640867054766, "kl": 0.388671875, "learning_rate": 9.985344801068218e-07, "loss": 0.015521278604865074, "memory(GiB)": 77.29, "reward": 1.5, "reward_std": 0.6030226945877075, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 512, "train_speed(iter/s)": 0.011426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/mean_length": 494.5, "completions/min_length": 203.0, "epoch": 0.021257199685078523, "grad_norm": 4.333180266572967, "kl": 0.43359375, "learning_rate": 9.9852637611346e-07, "loss": 0.017352541908621788, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 513, "train_speed(iter/s)": 0.011432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1287.666748046875, "completions/min_length": 465.0, "epoch": 0.021298636721501677, "grad_norm": 2.14885909469344, "kl": 0.40380859375, "learning_rate": 9.985182498082794e-07, "loss": -0.18251296877861023, "memory(GiB)": 77.29, "reward": 0.7916666865348816, "reward_std": 0.33427897095680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.33427897095680237, "step": 514, "train_speed(iter/s)": 0.011421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8016.0, "completions/mean_length": 1299.0833740234375, "completions/min_length": 435.0, "epoch": 0.02134007375792483, "grad_norm": 2.4869966231563216, "kl": 0.35888671875, "learning_rate": 9.985101011916433e-07, "loss": -0.14549903571605682, "memory(GiB)": 77.29, "reward": 1.25, "reward_std": 0.6571287512779236, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3256694972515106, "step": 515, "train_speed(iter/s)": 0.01141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8000.0, "completions/mean_length": 1142.666748046875, "completions/min_length": 416.0, "epoch": 0.02138151079434799, "grad_norm": 1.4284762503142292, "kl": 0.45361328125, "learning_rate": 9.98501930263917e-07, "loss": -0.2399878203868866, "memory(GiB)": 77.29, "reward": 1.7916667461395264, "reward_std": 0.5822500586509705, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 516, "train_speed(iter/s)": 0.011399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1248.25, "completions/min_length": 538.0, "epoch": 0.021422947830771144, "grad_norm": 2.0350007883372103, "kl": 0.37158203125, "learning_rate": 9.984937370254653e-07, "loss": -0.16169682145118713, "memory(GiB)": 77.29, "reward": 1.0833333730697632, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 517, "train_speed(iter/s)": 0.011388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/mean_length": 463.5833435058594, "completions/min_length": 323.0, "epoch": 0.0214643848671943, "grad_norm": 3.5769913618460167, "kl": 0.48828125, "learning_rate": 9.984855214766557e-07, "loss": 0.01952938362956047, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 518, "train_speed(iter/s)": 0.011402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 8001.0, "completions/mean_length": 1776.916748046875, "completions/min_length": 329.0, "epoch": 0.021505821903617453, "grad_norm": 2.5693884994036957, "kl": 0.474609375, "learning_rate": 9.984772836178556e-07, "loss": -0.25515538454055786, "memory(GiB)": 77.29, "reward": 0.5833333730697632, "reward_std": 0.358870267868042, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.5833333134651184, "rewards/FormatCorrectnessReward/std": 0.3588702976703644, "step": 519, "train_speed(iter/s)": 0.01139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/mean_length": 367.0833435058594, "completions/min_length": 289.0, "epoch": 0.021547258940040608, "grad_norm": 4.238768749266195, "kl": 0.53076171875, "learning_rate": 9.984690234494338e-07, "loss": 0.021232780069112778, "memory(GiB)": 77.29, "reward": 1.875, "reward_std": 0.22613351047039032, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 520, "train_speed(iter/s)": 0.011404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1140.666748046875, "completions/min_length": 440.0, "epoch": 0.021588695976463762, "grad_norm": 1.596115877307644, "kl": 0.47021484375, "learning_rate": 9.984607409717598e-07, "loss": -0.2391325682401657, "memory(GiB)": 77.29, "reward": 1.7916667461395264, "reward_std": 0.5822500586509705, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 521, "train_speed(iter/s)": 0.011393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 7994.0, "completions/mean_length": 1214.416748046875, "completions/min_length": 405.0, "epoch": 0.02163013301288692, "grad_norm": 5.144354794912841, "kl": 0.44580078125, "learning_rate": 9.984524361852043e-07, "loss": -0.14642976224422455, "memory(GiB)": 77.29, "reward": 0.9166666865348816, "reward_std": 0.46871843934059143, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3256694972515106, "step": 522, "train_speed(iter/s)": 0.011382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 491.66668701171875, "completions/min_length": 401.0, "epoch": 0.021671570049310074, "grad_norm": 3.5392429621793595, "kl": 0.537109375, "learning_rate": 9.98444109090139e-07, "loss": 0.021457046270370483, "memory(GiB)": 77.29, "reward": 1.875, "reward_std": 0.22613351047039032, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 523, "train_speed(iter/s)": 0.011396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/mean_length": 517.8333740234375, "completions/min_length": 388.0, "epoch": 0.02171300708573323, "grad_norm": 3.3390760368993493, "kl": 0.4599609375, "learning_rate": 9.984357596869368e-07, "loss": 0.018393903970718384, "memory(GiB)": 77.29, "reward": 1.0416667461395264, "reward_std": 0.33427897095680237, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 524, "train_speed(iter/s)": 0.011408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/mean_length": 596.25, "completions/min_length": 347.0, "epoch": 0.021754444122156383, "grad_norm": 3.557461662675441, "kl": 0.46826171875, "learning_rate": 9.984273879759712e-07, "loss": 0.018721163272857666, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 525, "train_speed(iter/s)": 0.011421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/mean_length": 357.5833435058594, "completions/min_length": 231.0, "epoch": 0.021795881158579538, "grad_norm": 4.443113452263974, "kl": 0.564453125, "learning_rate": 9.98418993957617e-07, "loss": 0.02255682274699211, "memory(GiB)": 77.29, "reward": 1.7083333730697632, "reward_std": 0.45016831159591675, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 526, "train_speed(iter/s)": 0.011435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/mean_length": 466.8333435058594, "completions/min_length": 296.0, "epoch": 0.021837318195002692, "grad_norm": 0.39675455983354924, "kl": 0.5390625, "learning_rate": 9.984105776322495e-07, "loss": 0.021512776613235474, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 527, "train_speed(iter/s)": 0.011449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/mean_length": 586.6666870117188, "completions/min_length": 412.0, "epoch": 0.021878755231425847, "grad_norm": 3.3682827819867405, "kl": 0.47900390625, "learning_rate": 9.984021390002457e-07, "loss": 0.019184639677405357, "memory(GiB)": 77.29, "reward": 0.875, "reward_std": 0.22613351047039032, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 528, "train_speed(iter/s)": 0.011461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/mean_length": 421.25, "completions/min_length": 276.0, "epoch": 0.021920192267849005, "grad_norm": 4.170794933231768, "kl": 0.5380859375, "learning_rate": 9.983936780619835e-07, "loss": 0.021453097462654114, "memory(GiB)": 77.29, "reward": 0.6666666865348816, "reward_std": 0.24618299305438995, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.6666666865348816, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 529, "train_speed(iter/s)": 0.011475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/mean_length": 565.6666870117188, "completions/min_length": 427.0, "epoch": 0.02196162930427216, "grad_norm": 0.3598067656070582, "kl": 0.49365234375, "learning_rate": 9.98385194817841e-07, "loss": 0.01974741369485855, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 530, "train_speed(iter/s)": 0.011488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/mean_length": 570.5833740234375, "completions/min_length": 403.0, "epoch": 0.022003066340695314, "grad_norm": 7.545537286934143, "kl": 0.48828125, "learning_rate": 9.983766892681985e-07, "loss": 0.01953217014670372, "memory(GiB)": 77.29, "reward": 1.25, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 531, "train_speed(iter/s)": 0.0115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/mean_length": 529.8333740234375, "completions/min_length": 421.0, "epoch": 0.02204450337711847, "grad_norm": 5.362562297393144, "kl": 0.494140625, "learning_rate": 9.98368161413436e-07, "loss": 0.019815323874354362, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.32566946744918823, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.32566946744918823, "step": 532, "train_speed(iter/s)": 0.01151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/mean_length": 498.8333435058594, "completions/min_length": 348.0, "epoch": 0.022085940413541623, "grad_norm": 0.34107914945106727, "kl": 0.49169921875, "learning_rate": 9.983596112539358e-07, "loss": 0.019662199541926384, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 533, "train_speed(iter/s)": 0.011523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1174.666748046875, "completions/min_length": 429.0, "epoch": 0.022127377449964777, "grad_norm": 1.8539256408052396, "kl": 0.4853515625, "learning_rate": 9.983510387900802e-07, "loss": -0.22671648859977722, "memory(GiB)": 77.29, "reward": 1.7083333730697632, "reward_std": 0.5822500586509705, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.33427897095680237, "step": 534, "train_speed(iter/s)": 0.011513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1156.3333740234375, "completions/min_length": 460.0, "epoch": 0.022168814486387932, "grad_norm": 1.9755197172882164, "kl": 0.52587890625, "learning_rate": 9.983424440222529e-07, "loss": -0.21038004755973816, "memory(GiB)": 77.29, "reward": 1.7083333730697632, "reward_std": 0.6200562119483948, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 535, "train_speed(iter/s)": 0.011502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/mean_length": 626.3333740234375, "completions/min_length": 487.0, "epoch": 0.02221025152281109, "grad_norm": 0.34827855884526215, "kl": 0.45068359375, "learning_rate": 9.983338269508389e-07, "loss": 0.018021374940872192, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 536, "train_speed(iter/s)": 0.011515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1255.75, "completions/min_length": 523.0, "epoch": 0.022251688559234244, "grad_norm": 2.4258185625405946, "kl": 0.4482421875, "learning_rate": 9.983251875762232e-07, "loss": -0.16016894578933716, "memory(GiB)": 77.29, "reward": 1.4166667461395264, "reward_std": 0.6685579419136047, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 537, "train_speed(iter/s)": 0.011504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/mean_length": 518.1666870117188, "completions/min_length": 284.0, "epoch": 0.0222931255956574, "grad_norm": 0.3399707249733058, "kl": 0.47314453125, "learning_rate": 9.98316525898793e-07, "loss": 0.018965519964694977, "memory(GiB)": 77.29, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 538, "train_speed(iter/s)": 0.011517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/mean_length": 657.1666870117188, "completions/min_length": 488.0, "epoch": 0.022334562632080553, "grad_norm": 0.31918987495360374, "kl": 0.43408203125, "learning_rate": 9.983078419189359e-07, "loss": 0.017397698014974594, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 539, "train_speed(iter/s)": 0.011529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1228.0, "completions/mean_length": 845.8333740234375, "completions/min_length": 574.0, "epoch": 0.022375999668503708, "grad_norm": 2.5285322153033647, "kl": 0.38134765625, "learning_rate": 9.982991356370403e-07, "loss": 0.015197615139186382, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 540, "train_speed(iter/s)": 0.01154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1287.3333740234375, "completions/min_length": 550.0, "epoch": 0.022417436704926862, "grad_norm": 1.1994910877495393, "kl": 0.43017578125, "learning_rate": 9.98290407053496e-07, "loss": -0.24871324002742767, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.5773502588272095, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 541, "train_speed(iter/s)": 0.01153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1208.5, "completions/min_length": 401.0, "epoch": 0.02245887374135002, "grad_norm": 12.30931186313521, "kl": 0.61083984375, "learning_rate": 9.982816561686938e-07, "loss": -0.2416851967573166, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.5773502588272095, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 542, "train_speed(iter/s)": 0.011517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/mean_length": 633.4166870117188, "completions/min_length": 462.0, "epoch": 0.022500310777773175, "grad_norm": 3.152470167340273, "kl": 0.42529296875, "learning_rate": 9.98272882983025e-07, "loss": 0.01703852415084839, "memory(GiB)": 77.29, "reward": 1.75, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 543, "train_speed(iter/s)": 0.011529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/mean_length": 605.8333740234375, "completions/min_length": 499.0, "epoch": 0.02254174781419633, "grad_norm": 3.4477057164976563, "kl": 0.4765625, "learning_rate": 9.982640874968825e-07, "loss": 0.019091378897428513, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 544, "train_speed(iter/s)": 0.011542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/mean_length": 615.4166870117188, "completions/min_length": 461.0, "epoch": 0.022583184850619484, "grad_norm": 3.6481669876400176, "kl": 0.44091796875, "learning_rate": 9.9825526971066e-07, "loss": 0.017628729343414307, "memory(GiB)": 77.29, "reward": 1.7916667461395264, "reward_std": 0.33427897095680237, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 545, "train_speed(iter/s)": 0.011554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/mean_length": 737.25, "completions/min_length": 544.0, "epoch": 0.02262462188704264, "grad_norm": 2.756631569129876, "kl": 0.40283203125, "learning_rate": 9.982464296247522e-07, "loss": 0.01612071320414543, "memory(GiB)": 77.29, "reward": 1.1666667461395264, "reward_std": 0.3892494738101959, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.38924944400787354, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 546, "train_speed(iter/s)": 0.011566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/mean_length": 638.5833740234375, "completions/min_length": 566.0, "epoch": 0.022666058923465793, "grad_norm": 2.8203000308354493, "kl": 0.43115234375, "learning_rate": 9.982375672395545e-07, "loss": 0.017267465591430664, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 547, "train_speed(iter/s)": 0.011577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/mean_length": 625.8333740234375, "completions/min_length": 395.0, "epoch": 0.022707495959888947, "grad_norm": 0.3315598386021109, "kl": 0.4365234375, "learning_rate": 9.982286825554633e-07, "loss": 0.017459219321608543, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 548, "train_speed(iter/s)": 0.01159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 943.0, "completions/mean_length": 696.4166870117188, "completions/min_length": 508.0, "epoch": 0.022748932996312105, "grad_norm": 2.96067330302786, "kl": 0.41796875, "learning_rate": 9.98219775572877e-07, "loss": 0.01670881174504757, "memory(GiB)": 77.29, "reward": 0.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 549, "train_speed(iter/s)": 0.011601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/mean_length": 552.8333740234375, "completions/min_length": 456.0, "epoch": 0.02279037003273526, "grad_norm": 0.3427564953738471, "kl": 0.484375, "learning_rate": 9.982108462921937e-07, "loss": 0.01936502940952778, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 550, "train_speed(iter/s)": 0.011614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 909.0, "completions/mean_length": 647.9166870117188, "completions/min_length": 436.0, "epoch": 0.022831807069158414, "grad_norm": 0.32857061896393885, "kl": 0.44873046875, "learning_rate": 9.98201894713813e-07, "loss": 0.01794145442545414, "memory(GiB)": 77.29, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 551, "train_speed(iter/s)": 0.011625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1003.0, "completions/mean_length": 688.3333740234375, "completions/min_length": 525.0, "epoch": 0.02287324410558157, "grad_norm": 3.466526178219313, "kl": 0.390625, "learning_rate": 9.981929208381357e-07, "loss": 0.015644878149032593, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 552, "train_speed(iter/s)": 0.011637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/mean_length": 616.4166870117188, "completions/min_length": 497.0, "epoch": 0.022914681142004723, "grad_norm": 3.32115968239738, "kl": 0.46630859375, "learning_rate": 9.981839246655635e-07, "loss": 0.018634796142578125, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.32566946744918823, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 553, "train_speed(iter/s)": 0.011649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/mean_length": 682.5, "completions/min_length": 527.0, "epoch": 0.022956118178427878, "grad_norm": 0.363065277440343, "kl": 0.400390625, "learning_rate": 9.98174906196499e-07, "loss": 0.01603768765926361, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 554, "train_speed(iter/s)": 0.011661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/mean_length": 722.9166870117188, "completions/min_length": 586.0, "epoch": 0.022997555214851032, "grad_norm": 2.947190965568681, "kl": 0.38623046875, "learning_rate": 9.981658654313456e-07, "loss": 0.015409271232783794, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 555, "train_speed(iter/s)": 0.011672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/mean_length": 778.6666870117188, "completions/min_length": 679.0, "epoch": 0.02303899225127419, "grad_norm": 2.767919921070408, "kl": 0.3515625, "learning_rate": 9.981568023705083e-07, "loss": 0.014070352539420128, "memory(GiB)": 77.29, "reward": 1.875, "reward_std": 0.22613351047039032, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 556, "train_speed(iter/s)": 0.011684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 917.0, "completions/mean_length": 790.0, "completions/min_length": 702.0, "epoch": 0.023080429287697345, "grad_norm": 2.6715770559279073, "kl": 0.35009765625, "learning_rate": 9.981477170143924e-07, "loss": 0.014037743210792542, "memory(GiB)": 77.29, "reward": 1.5416667461395264, "reward_std": 0.49810245633125305, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 557, "train_speed(iter/s)": 0.011696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/mean_length": 724.8333740234375, "completions/min_length": 572.0, "epoch": 0.0231218663241205, "grad_norm": 0.2417115364723774, "kl": 0.337890625, "learning_rate": 9.981386093634045e-07, "loss": 0.01348874531686306, "memory(GiB)": 77.29, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 558, "train_speed(iter/s)": 0.011707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/mean_length": 808.4166870117188, "completions/min_length": 655.0, "epoch": 0.023163303360543654, "grad_norm": 2.724157070088737, "kl": 0.330078125, "learning_rate": 9.981294794179524e-07, "loss": 0.013204763643443584, "memory(GiB)": 77.29, "reward": 1.1666667461395264, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 559, "train_speed(iter/s)": 0.011718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1668.0, "completions/mean_length": 853.1666870117188, "completions/min_length": 615.0, "epoch": 0.023204740396966808, "grad_norm": 2.638315177640496, "kl": 0.330078125, "learning_rate": 9.981203271784448e-07, "loss": 0.01318446360528469, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 560, "train_speed(iter/s)": 0.011727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 998.0, "completions/mean_length": 779.6666870117188, "completions/min_length": 637.0, "epoch": 0.023246177433389963, "grad_norm": 0.2964114307868099, "kl": 0.29931640625, "learning_rate": 9.98111152645291e-07, "loss": 0.011994770728051662, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 561, "train_speed(iter/s)": 0.011739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8000.0, "completions/mean_length": 1287.416748046875, "completions/min_length": 314.0, "epoch": 0.02328761446981312, "grad_norm": 2.34478962243828, "kl": 0.40234375, "learning_rate": 9.98101955818902e-07, "loss": -0.17036661505699158, "memory(GiB)": 77.29, "reward": 1.4583333730697632, "reward_std": 0.65568608045578, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 562, "train_speed(iter/s)": 0.011728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/mean_length": 743.6666870117188, "completions/min_length": 589.0, "epoch": 0.023329051506236275, "grad_norm": 0.24012085277597098, "kl": 0.3349609375, "learning_rate": 9.980927366996892e-07, "loss": 0.013416143134236336, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 563, "train_speed(iter/s)": 0.011739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1008.0, "completions/mean_length": 757.6666870117188, "completions/min_length": 550.0, "epoch": 0.02337048854265943, "grad_norm": 2.552561952929514, "kl": 0.30908203125, "learning_rate": 9.98083495288065e-07, "loss": 0.012341777794063091, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 564, "train_speed(iter/s)": 0.01175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/mean_length": 651.5833740234375, "completions/min_length": 569.0, "epoch": 0.023411925579082584, "grad_norm": 2.9191686725224137, "kl": 0.37255859375, "learning_rate": 9.980742315844432e-07, "loss": 0.014903774484992027, "memory(GiB)": 77.29, "reward": 1.5833333730697632, "reward_std": 0.4174235463142395, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 565, "train_speed(iter/s)": 0.011762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1078.0, "completions/mean_length": 788.0833740234375, "completions/min_length": 607.0, "epoch": 0.02345336261550574, "grad_norm": 2.5293976939048086, "kl": 0.2919921875, "learning_rate": 9.980649455892385e-07, "loss": 0.01170327328145504, "memory(GiB)": 77.29, "reward": 1.625, "reward_std": 0.4826536476612091, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 566, "train_speed(iter/s)": 0.011773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8000.0, "completions/mean_length": 1356.0, "completions/min_length": 455.0, "epoch": 0.023494799651928893, "grad_norm": 1.41353731246707, "kl": 0.326171875, "learning_rate": 9.980556373028665e-07, "loss": -0.2443614900112152, "memory(GiB)": 77.29, "reward": 1.7916667461395264, "reward_std": 0.5822500586509705, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 567, "train_speed(iter/s)": 0.011762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/mean_length": 691.25, "completions/min_length": 607.0, "epoch": 0.023536236688352048, "grad_norm": 2.6661586184202637, "kl": 0.3271484375, "learning_rate": 9.980463067257436e-07, "loss": 0.013084104284644127, "memory(GiB)": 77.29, "reward": 0.9166666865348816, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 568, "train_speed(iter/s)": 0.011773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1039.0, "completions/mean_length": 827.5833740234375, "completions/min_length": 666.0, "epoch": 0.023577673724775206, "grad_norm": 2.801059341631489, "kl": 0.29150390625, "learning_rate": 9.980369538582876e-07, "loss": 0.011633197776973248, "memory(GiB)": 77.29, "reward": 1.7916667461395264, "reward_std": 0.3964807391166687, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 569, "train_speed(iter/s)": 0.011784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/mean_length": 773.5833740234375, "completions/min_length": 626.0, "epoch": 0.02361911076119836, "grad_norm": 0.20634302985984448, "kl": 0.269287109375, "learning_rate": 9.98027578700917e-07, "loss": 0.01076216995716095, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 570, "train_speed(iter/s)": 0.011795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1115.0, "completions/mean_length": 784.5833740234375, "completions/min_length": 568.0, "epoch": 0.023660547797621514, "grad_norm": 2.7768115587567817, "kl": 0.2900390625, "learning_rate": 9.980181812540512e-07, "loss": 0.011619716882705688, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 571, "train_speed(iter/s)": 0.011805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1122.0, "completions/mean_length": 870.8333740234375, "completions/min_length": 480.0, "epoch": 0.02370198483404467, "grad_norm": 2.469492860916119, "kl": 0.2685546875, "learning_rate": 9.98008761518111e-07, "loss": 0.010729452595114708, "memory(GiB)": 77.29, "reward": 1.7083333730697632, "reward_std": 0.49810245633125305, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 572, "train_speed(iter/s)": 0.011816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1256.0, "completions/mean_length": 855.25, "completions/min_length": 552.0, "epoch": 0.023743421870467823, "grad_norm": 2.4317493586209427, "kl": 0.246826171875, "learning_rate": 9.979993194935182e-07, "loss": 0.009885351173579693, "memory(GiB)": 77.29, "reward": 1.375, "reward_std": 0.4826536476612091, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 573, "train_speed(iter/s)": 0.011826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1078.0, "completions/mean_length": 806.0833740234375, "completions/min_length": 598.0, "epoch": 0.023784858906890978, "grad_norm": 2.4530215224321634, "kl": 0.266845703125, "learning_rate": 9.97989855180695e-07, "loss": 0.01068430207669735, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 574, "train_speed(iter/s)": 0.011836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1125.0, "completions/mean_length": 785.0833740234375, "completions/min_length": 637.0, "epoch": 0.023826295943314132, "grad_norm": 0.2293089398524738, "kl": 0.2685546875, "learning_rate": 9.97980368580065e-07, "loss": 0.010740968398749828, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 575, "train_speed(iter/s)": 0.011847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1245.0, "completions/mean_length": 889.25, "completions/min_length": 609.0, "epoch": 0.02386773297973729, "grad_norm": 2.3527638991337207, "kl": 0.263671875, "learning_rate": 9.979708596920529e-07, "loss": 0.010531991720199585, "memory(GiB)": 77.29, "reward": 1.4583333730697632, "reward_std": 0.49810245633125305, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 576, "train_speed(iter/s)": 0.011857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1033.0, "completions/mean_length": 711.0833740234375, "completions/min_length": 585.0, "epoch": 0.023909170016160445, "grad_norm": 54.10278133708424, "kl": 1.404052734375, "learning_rate": 9.979613285170845e-07, "loss": 0.05603259429335594, "memory(GiB)": 77.29, "reward": 1.6666667461395264, "reward_std": 0.4923659861087799, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 577, "train_speed(iter/s)": 0.011868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/mean_length": 777.6666870117188, "completions/min_length": 526.0, "epoch": 0.0239506070525836, "grad_norm": 1.1785756702731554, "kl": 0.266357421875, "learning_rate": 9.97951775055586e-07, "loss": 0.010669661685824394, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 578, "train_speed(iter/s)": 0.011879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1252.0, "completions/mean_length": 876.9166870117188, "completions/min_length": 652.0, "epoch": 0.023992044089006754, "grad_norm": 0.31817250483389026, "kl": 0.2451171875, "learning_rate": 9.97942199307985e-07, "loss": 0.009810155257582664, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 579, "train_speed(iter/s)": 0.011889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 938.0, "completions/mean_length": 781.75, "completions/min_length": 646.0, "epoch": 0.02403348112542991, "grad_norm": 2.5074703104530744, "kl": 0.32470703125, "learning_rate": 9.979326012747104e-07, "loss": 0.01299341581761837, "memory(GiB)": 77.29, "reward": 1.5, "reward_std": 0.5640760660171509, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 580, "train_speed(iter/s)": 0.0119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2387.0, "completions/mean_length": 1059.916748046875, "completions/min_length": 553.0, "epoch": 0.024074918161853063, "grad_norm": 88.38180320926988, "kl": 1.0390625, "learning_rate": 9.979229809561915e-07, "loss": 0.04155778884887695, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 581, "train_speed(iter/s)": 0.011906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1422.25, "completions/min_length": 624.0, "epoch": 0.02411635519827622, "grad_norm": 2.1084650043235436, "kl": 0.28466796875, "learning_rate": 9.97913338352859e-07, "loss": -0.18499521911144257, "memory(GiB)": 77.29, "reward": 1.0, "reward_std": 0.42640143632888794, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 582, "train_speed(iter/s)": 0.011892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1220.0, "completions/mean_length": 815.5833740234375, "completions/min_length": 561.0, "epoch": 0.024157792234699375, "grad_norm": 2.584771582933255, "kl": 0.2724609375, "learning_rate": 9.979036734651442e-07, "loss": 0.010887364856898785, "memory(GiB)": 77.29, "reward": 1.5833333730697632, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 583, "train_speed(iter/s)": 0.011902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/mean_length": 859.0833740234375, "completions/min_length": 676.0, "epoch": 0.02419922927112253, "grad_norm": 2.1876389848147264, "kl": 0.260009765625, "learning_rate": 9.9789398629348e-07, "loss": 0.01039926242083311, "memory(GiB)": 77.29, "reward": 1.0833333730697632, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 584, "train_speed(iter/s)": 0.011913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1405.5833740234375, "completions/min_length": 667.0, "epoch": 0.024240666307545684, "grad_norm": 2.018420264171812, "kl": 0.27490234375, "learning_rate": 9.978842768382998e-07, "loss": -0.1871877908706665, "memory(GiB)": 77.29, "reward": 1.5833333730697632, "reward_std": 0.6685579419136047, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 585, "train_speed(iter/s)": 0.011902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/mean_length": 736.0, "completions/min_length": 545.0, "epoch": 0.02428210334396884, "grad_norm": 2.697965475240929, "kl": 0.330078125, "learning_rate": 9.978745451000379e-07, "loss": 0.013194630853831768, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 586, "train_speed(iter/s)": 0.011898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1023.0, "completions/mean_length": 805.6666870117188, "completions/min_length": 541.0, "epoch": 0.024323540380391993, "grad_norm": 190.22979756257354, "kl": 1.341796875, "learning_rate": 9.978647910791303e-07, "loss": 0.053475212305784225, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 587, "train_speed(iter/s)": 0.011909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/mean_length": 678.4166870117188, "completions/min_length": 171.0, "epoch": 0.024364977416815148, "grad_norm": 3.5196502024164853, "kl": 0.32177734375, "learning_rate": 9.978550147760131e-07, "loss": 0.012875239364802837, "memory(GiB)": 77.29, "reward": 1.0833333730697632, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 588, "train_speed(iter/s)": 0.011919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1060.0, "completions/mean_length": 785.5, "completions/min_length": 651.0, "epoch": 0.024406414453238306, "grad_norm": 2.8927037533399793, "kl": 0.298828125, "learning_rate": 9.978452161911244e-07, "loss": 0.011974851600825787, "memory(GiB)": 77.29, "reward": 1.125, "reward_std": 0.4330126941204071, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 589, "train_speed(iter/s)": 0.011928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/mean_length": 766.8333740234375, "completions/min_length": 590.0, "epoch": 0.02444785148966146, "grad_norm": 3.126729493428234, "kl": 0.306640625, "learning_rate": 9.978353953249021e-07, "loss": 0.012271463871002197, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 590, "train_speed(iter/s)": 0.011939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 902.0, "completions/mean_length": 583.3333740234375, "completions/min_length": 427.0, "epoch": 0.024489288526084615, "grad_norm": 2.9913456801186986, "kl": 0.35693359375, "learning_rate": 9.978255521777862e-07, "loss": 0.014325520023703575, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 591, "train_speed(iter/s)": 0.011951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/mean_length": 756.4166870117188, "completions/min_length": 594.0, "epoch": 0.02453072556250777, "grad_norm": 2.4835803331437765, "kl": 0.30419921875, "learning_rate": 9.978156867502172e-07, "loss": 0.012142142280936241, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 592, "train_speed(iter/s)": 0.011962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/mean_length": 747.8333740234375, "completions/min_length": 560.0, "epoch": 0.024572162598930924, "grad_norm": 7.466775878627754, "kl": 0.54345703125, "learning_rate": 9.978057990426364e-07, "loss": 0.02181055210530758, "memory(GiB)": 77.29, "reward": 1.5833333730697632, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 593, "train_speed(iter/s)": 0.011972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 963.0, "completions/mean_length": 736.3333740234375, "completions/min_length": 513.0, "epoch": 0.02461359963535408, "grad_norm": 2.4636541291205156, "kl": 0.310791015625, "learning_rate": 9.977958890554866e-07, "loss": 0.012406846508383751, "memory(GiB)": 77.29, "reward": 1.0416667461395264, "reward_std": 0.33427897095680237, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 594, "train_speed(iter/s)": 0.011981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1006.0, "completions/mean_length": 779.4166870117188, "completions/min_length": 625.0, "epoch": 0.024655036671777233, "grad_norm": 17.16023442277917, "kl": 0.34375, "learning_rate": 9.97785956789211e-07, "loss": 0.013781843706965446, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 595, "train_speed(iter/s)": 0.011992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 985.0, "completions/mean_length": 725.25, "completions/min_length": 392.0, "epoch": 0.02469647370820039, "grad_norm": 2.6570693991558376, "kl": 0.32470703125, "learning_rate": 9.977760022442542e-07, "loss": 0.013006767258048058, "memory(GiB)": 77.29, "reward": 1.1666667461395264, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 596, "train_speed(iter/s)": 0.012003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1303.0, "completions/mean_length": 764.0833740234375, "completions/min_length": 481.0, "epoch": 0.024737910744623545, "grad_norm": 3.130442270919455, "kl": 0.352294921875, "learning_rate": 9.97766025421062e-07, "loss": 0.014108608476817608, "memory(GiB)": 77.29, "reward": 1.1666667461395264, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 597, "train_speed(iter/s)": 0.012012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/mean_length": 597.8333740234375, "completions/min_length": 482.0, "epoch": 0.0247793477810467, "grad_norm": 3.131410422944248, "kl": 0.3818359375, "learning_rate": 9.977560263200808e-07, "loss": 0.015271544456481934, "memory(GiB)": 77.29, "reward": 1.5, "reward_std": 0.5222329497337341, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 598, "train_speed(iter/s)": 0.012024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1062.0, "completions/mean_length": 885.9166870117188, "completions/min_length": 713.0, "epoch": 0.024820784817469854, "grad_norm": 2.2598421222290765, "kl": 0.2587890625, "learning_rate": 9.977460049417581e-07, "loss": 0.010353893041610718, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 599, "train_speed(iter/s)": 0.012034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/mean_length": 796.4166870117188, "completions/min_length": 595.0, "epoch": 0.02486222185389301, "grad_norm": 0.1956117658935894, "kl": 0.26904296875, "learning_rate": 9.977359612865422e-07, "loss": 0.010791193693876266, "memory(GiB)": 77.29, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 600, "train_speed(iter/s)": 0.012045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/mean_length": 641.4166870117188, "completions/min_length": 535.0, "epoch": 0.024903658890316163, "grad_norm": 5.133878346674269, "kl": 0.4775390625, "learning_rate": 9.977258953548829e-07, "loss": 0.0190579891204834, "memory(GiB)": 77.29, "reward": 1.2916667461395264, "reward_std": 0.5418123602867126, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 601, "train_speed(iter/s)": 0.012056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 868.0, "completions/mean_length": 667.5, "completions/min_length": 481.0, "epoch": 0.02494509592673932, "grad_norm": 0.22276170753798188, "kl": 0.27490234375, "learning_rate": 9.977158071472307e-07, "loss": 0.011004041880369186, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 602, "train_speed(iter/s)": 0.012067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 997.0, "completions/mean_length": 681.0, "completions/min_length": 477.0, "epoch": 0.024986532963162476, "grad_norm": 0.2482048066580613, "kl": 0.2861328125, "learning_rate": 9.977056966640367e-07, "loss": 0.011437810957431793, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 603, "train_speed(iter/s)": 0.012077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1110.0, "completions/mean_length": 729.25, "completions/min_length": 390.0, "epoch": 0.02502796999958563, "grad_norm": 2.983822088362991, "kl": 0.30078125, "learning_rate": 9.976955639057538e-07, "loss": 0.012037336826324463, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 604, "train_speed(iter/s)": 0.012087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/mean_length": 721.9166870117188, "completions/min_length": 603.0, "epoch": 0.025069407036008785, "grad_norm": 2.59893505928677, "kl": 0.263427734375, "learning_rate": 9.976854088728355e-07, "loss": 0.010543227195739746, "memory(GiB)": 77.29, "reward": 1.6666667461395264, "reward_std": 0.4923659861087799, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 605, "train_speed(iter/s)": 0.012098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1410.0, "completions/mean_length": 932.5, "completions/min_length": 692.0, "epoch": 0.02511084407243194, "grad_norm": 1.1767072366546043, "kl": 0.269775390625, "learning_rate": 9.976752315657359e-07, "loss": 0.010772299021482468, "memory(GiB)": 77.29, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 606, "train_speed(iter/s)": 0.012107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/mean_length": 762.0833740234375, "completions/min_length": 578.0, "epoch": 0.025152281108855094, "grad_norm": 0.20773846188193448, "kl": 0.287109375, "learning_rate": 9.976650319849112e-07, "loss": 0.011463899165391922, "memory(GiB)": 77.29, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 607, "train_speed(iter/s)": 0.012118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1352.8333740234375, "completions/min_length": 585.0, "epoch": 0.025193718145278248, "grad_norm": 1.7991783299250963, "kl": 0.250244140625, "learning_rate": 9.97654810130817e-07, "loss": -0.20400309562683105, "memory(GiB)": 77.29, "reward": 0.8333333730697632, "reward_std": 0.32566946744918823, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.32566946744918823, "step": 608, "train_speed(iter/s)": 0.012107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/mean_length": 755.1666870117188, "completions/min_length": 666.0, "epoch": 0.025235155181701406, "grad_norm": 2.640061234021588, "kl": 0.27001953125, "learning_rate": 9.976445660039117e-07, "loss": 0.010767428204417229, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 609, "train_speed(iter/s)": 0.012117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/mean_length": 735.1666870117188, "completions/min_length": 564.0, "epoch": 0.02527659221812456, "grad_norm": 0.21216616138009906, "kl": 0.2724609375, "learning_rate": 9.97634299604653e-07, "loss": 0.010905827395617962, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 610, "train_speed(iter/s)": 0.012128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/mean_length": 720.4166870117188, "completions/min_length": 607.0, "epoch": 0.025318029254547715, "grad_norm": 2.407378083133436, "kl": 0.28125, "learning_rate": 9.976240109335009e-07, "loss": 0.011245648376643658, "memory(GiB)": 77.29, "reward": 1.25, "reward_std": 0.6215815544128418, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 611, "train_speed(iter/s)": 0.012139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 982.0, "completions/mean_length": 741.6666870117188, "completions/min_length": 534.0, "epoch": 0.02535946629097087, "grad_norm": 2.634421677292828, "kl": 0.27099609375, "learning_rate": 9.976136999909155e-07, "loss": 0.010832737199962139, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 612, "train_speed(iter/s)": 0.01215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/mean_length": 786.9166870117188, "completions/min_length": 708.0, "epoch": 0.025400903327394024, "grad_norm": 0.263613860564361, "kl": 0.227783203125, "learning_rate": 9.976033667773585e-07, "loss": 0.009114071726799011, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 613, "train_speed(iter/s)": 0.012161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1005.0, "completions/mean_length": 730.25, "completions/min_length": 554.0, "epoch": 0.02544234036381718, "grad_norm": 2.758386628942559, "kl": 0.28271484375, "learning_rate": 9.975930112932925e-07, "loss": 0.011287709698081017, "memory(GiB)": 77.29, "reward": 1.25, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 614, "train_speed(iter/s)": 0.012171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 509.41668701171875, "completions/min_length": 397.0, "epoch": 0.025483777400240337, "grad_norm": 3.560924688257707, "kl": 0.35205078125, "learning_rate": 9.975826335391805e-07, "loss": 0.01405700109899044, "memory(GiB)": 77.29, "reward": 1.75, "reward_std": 0.26111647486686707, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.26111647486686707, "step": 615, "train_speed(iter/s)": 0.012183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/mean_length": 631.25, "completions/min_length": 428.0, "epoch": 0.02552521443666349, "grad_norm": 0.20366627400447246, "kl": 0.265869140625, "learning_rate": 9.975722335154875e-07, "loss": 0.010604707524180412, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 616, "train_speed(iter/s)": 0.012194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 932.0, "completions/mean_length": 780.5833740234375, "completions/min_length": 600.0, "epoch": 0.025566651473086646, "grad_norm": 2.511010453259748, "kl": 0.249755859375, "learning_rate": 9.975618112226787e-07, "loss": 0.009993473999202251, "memory(GiB)": 77.29, "reward": 1.75, "reward_std": 0.39886200428009033, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 617, "train_speed(iter/s)": 0.012204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1368.0, "completions/mean_length": 786.3333740234375, "completions/min_length": 571.0, "epoch": 0.0256080885095098, "grad_norm": 2.349901159798454, "kl": 0.234130859375, "learning_rate": 9.975513666612203e-07, "loss": 0.009360343217849731, "memory(GiB)": 77.29, "reward": 1.7083333730697632, "reward_std": 0.45016831159591675, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 618, "train_speed(iter/s)": 0.012213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1157.0, "completions/mean_length": 764.9166870117188, "completions/min_length": 578.0, "epoch": 0.025649525545932955, "grad_norm": 3.6438002882264713, "kl": 0.26318359375, "learning_rate": 9.975408998315803e-07, "loss": 0.010505358688533306, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 619, "train_speed(iter/s)": 0.012223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/mean_length": 684.5833740234375, "completions/min_length": 473.0, "epoch": 0.02569096258235611, "grad_norm": 2.3426302655988525, "kl": 0.267333984375, "learning_rate": 9.975304107342268e-07, "loss": 0.01069784164428711, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 620, "train_speed(iter/s)": 0.012234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/mean_length": 535.3333740234375, "completions/min_length": 439.0, "epoch": 0.025732399618779263, "grad_norm": 7.286845106565109, "kl": 0.30419921875, "learning_rate": 9.975198993696291e-07, "loss": 0.012172440998256207, "memory(GiB)": 77.29, "reward": 1.4166667461395264, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 621, "train_speed(iter/s)": 0.012246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1598.0, "completions/mean_length": 830.4166870117188, "completions/min_length": 592.0, "epoch": 0.02577383665520242, "grad_norm": 2.462940963755229, "kl": 0.2275390625, "learning_rate": 9.97509365738258e-07, "loss": 0.009099384769797325, "memory(GiB)": 77.29, "reward": 1.3333333730697632, "reward_std": 0.4923659861087799, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 622, "train_speed(iter/s)": 0.012254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/mean_length": 639.0, "completions/min_length": 532.0, "epoch": 0.025815273691625576, "grad_norm": 2.829411702945256, "kl": 0.234130859375, "learning_rate": 9.97498809840585e-07, "loss": 0.009372591972351074, "memory(GiB)": 77.29, "reward": 1.4166667461395264, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 623, "train_speed(iter/s)": 0.012265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 895.0, "completions/mean_length": 691.4166870117188, "completions/min_length": 568.0, "epoch": 0.02585671072804873, "grad_norm": 2.5070769481843076, "kl": 0.212890625, "learning_rate": 9.97488231677082e-07, "loss": 0.008512100204825401, "memory(GiB)": 77.29, "reward": 1.4166667461395264, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 624, "train_speed(iter/s)": 0.012276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/mean_length": 647.4166870117188, "completions/min_length": 488.0, "epoch": 0.025898147764471885, "grad_norm": 0.19849334346594016, "kl": 0.249755859375, "learning_rate": 9.974776312482228e-07, "loss": 0.009983190335333347, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 625, "train_speed(iter/s)": 0.012287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 929.0, "completions/mean_length": 763.75, "completions/min_length": 616.0, "epoch": 0.02593958480089504, "grad_norm": 2.294067022014051, "kl": 0.25341796875, "learning_rate": 9.97467008554482e-07, "loss": 0.010139008983969688, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 626, "train_speed(iter/s)": 0.012297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/mean_length": 550.75, "completions/min_length": 440.0, "epoch": 0.025981021837318194, "grad_norm": 3.282592035719811, "kl": 0.3115234375, "learning_rate": 9.974563635963347e-07, "loss": 0.01244497299194336, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 627, "train_speed(iter/s)": 0.012308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/mean_length": 667.75, "completions/min_length": 537.0, "epoch": 0.02602245887374135, "grad_norm": 2.325191154855278, "kl": 0.248046875, "learning_rate": 9.974456963742572e-07, "loss": 0.009904375299811363, "memory(GiB)": 77.29, "reward": 1.0833333730697632, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 628, "train_speed(iter/s)": 0.012319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/mean_length": 631.25, "completions/min_length": 448.0, "epoch": 0.026063895910164506, "grad_norm": 3.181564857506991, "kl": 0.233642578125, "learning_rate": 9.974350068887274e-07, "loss": 0.00936724804341793, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 629, "train_speed(iter/s)": 0.012329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1163.0, "completions/mean_length": 665.25, "completions/min_length": 445.0, "epoch": 0.02610533294658766, "grad_norm": 0.3196917505838232, "kl": 0.21435546875, "learning_rate": 9.974242951402235e-07, "loss": 0.00858390610665083, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 630, "train_speed(iter/s)": 0.012339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1217.0, "completions/mean_length": 773.4166870117188, "completions/min_length": 573.0, "epoch": 0.026146769983010815, "grad_norm": 0.1789082265565564, "kl": 0.1884765625, "learning_rate": 9.974135611292248e-07, "loss": 0.007547212764620781, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 631, "train_speed(iter/s)": 0.012349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 894.0, "completions/mean_length": 732.1666870117188, "completions/min_length": 556.0, "epoch": 0.02618820701943397, "grad_norm": 3.516645571247629, "kl": 0.18896484375, "learning_rate": 9.974028048562116e-07, "loss": 0.00760301947593689, "memory(GiB)": 77.29, "reward": 0.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 632, "train_speed(iter/s)": 0.012358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/mean_length": 722.0833740234375, "completions/min_length": 451.0, "epoch": 0.026229644055857124, "grad_norm": 0.19477464883242748, "kl": 0.21484375, "learning_rate": 9.973920263216657e-07, "loss": 0.008587772957980633, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 633, "train_speed(iter/s)": 0.012369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 983.0, "completions/mean_length": 658.0, "completions/min_length": 480.0, "epoch": 0.02627108109228028, "grad_norm": 0.19891778458306694, "kl": 0.2021484375, "learning_rate": 9.973812255260692e-07, "loss": 0.008059101179242134, "memory(GiB)": 77.29, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 634, "train_speed(iter/s)": 0.012379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.0, "completions/mean_length": 657.8333740234375, "completions/min_length": 501.0, "epoch": 0.026312518128703437, "grad_norm": 0.7955608858056532, "kl": 0.20849609375, "learning_rate": 9.973704024699055e-07, "loss": 0.008340905420482159, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 635, "train_speed(iter/s)": 0.01239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.0, "completions/mean_length": 614.25, "completions/min_length": 487.0, "epoch": 0.02635395516512659, "grad_norm": 0.17645321140107012, "kl": 0.1875, "learning_rate": 9.97359557153659e-07, "loss": 0.0074850223027169704, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 636, "train_speed(iter/s)": 0.012402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1055.0, "completions/mean_length": 775.25, "completions/min_length": 650.0, "epoch": 0.026395392201549746, "grad_norm": 2.7066187573440668, "kl": 0.178955078125, "learning_rate": 9.973486895778155e-07, "loss": 0.007151852361857891, "memory(GiB)": 77.29, "reward": 1.75, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 637, "train_speed(iter/s)": 0.012411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1060.0, "completions/mean_length": 720.5, "completions/min_length": 474.0, "epoch": 0.0264368292379729, "grad_norm": 3.4431837256303113, "kl": 0.17724609375, "learning_rate": 9.973377997428608e-07, "loss": 0.007090399973094463, "memory(GiB)": 77.29, "reward": 1.0416667461395264, "reward_std": 0.33427897095680237, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 638, "train_speed(iter/s)": 0.012418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 913.0, "completions/mean_length": 727.5, "completions/min_length": 573.0, "epoch": 0.026478266274396055, "grad_norm": 2.4038366487896012, "kl": 0.177734375, "learning_rate": 9.973268876492825e-07, "loss": 0.007100164890289307, "memory(GiB)": 77.29, "reward": 1.5, "reward_std": 0.5222329497337341, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 639, "train_speed(iter/s)": 0.012429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/mean_length": 587.3333740234375, "completions/min_length": 477.0, "epoch": 0.02651970331081921, "grad_norm": 0.1885514040803826, "kl": 0.20068359375, "learning_rate": 9.97315953297569e-07, "loss": 0.008019976317882538, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 640, "train_speed(iter/s)": 0.01244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/mean_length": 535.0, "completions/min_length": 385.0, "epoch": 0.026561140347242364, "grad_norm": 2.891542040241176, "kl": 0.21044921875, "learning_rate": 9.973049966882097e-07, "loss": 0.008416250348091125, "memory(GiB)": 77.29, "reward": 1.6666667461395264, "reward_std": 0.44381269812583923, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 641, "train_speed(iter/s)": 0.012451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/mean_length": 648.25, "completions/min_length": 521.0, "epoch": 0.026602577383665522, "grad_norm": 2.5446070930703266, "kl": 0.205322265625, "learning_rate": 9.972940178216952e-07, "loss": 0.00822361372411251, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 642, "train_speed(iter/s)": 0.012462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/mean_length": 531.9166870117188, "completions/min_length": 399.0, "epoch": 0.026644014420088676, "grad_norm": 2.8238818250219535, "kl": 0.189208984375, "learning_rate": 9.972830166985163e-07, "loss": 0.007584760896861553, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 643, "train_speed(iter/s)": 0.012473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 847.0, "completions/mean_length": 674.3333740234375, "completions/min_length": 477.0, "epoch": 0.02668545145651183, "grad_norm": 2.6515179169778156, "kl": 0.193603515625, "learning_rate": 9.972719933191655e-07, "loss": 0.007739256136119366, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 644, "train_speed(iter/s)": 0.012481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/mean_length": 632.4166870117188, "completions/min_length": 489.0, "epoch": 0.026726888492934985, "grad_norm": 2.383071984796377, "kl": 0.1923828125, "learning_rate": 9.972609476841365e-07, "loss": 0.007697622291743755, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 645, "train_speed(iter/s)": 0.012491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 920.0, "completions/mean_length": 737.6666870117188, "completions/min_length": 545.0, "epoch": 0.02676832552935814, "grad_norm": 0.5741435249049929, "kl": 0.162109375, "learning_rate": 9.972498797939237e-07, "loss": 0.0064905062317848206, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 646, "train_speed(iter/s)": 0.012501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/mean_length": 634.6666870117188, "completions/min_length": 562.0, "epoch": 0.026809762565781294, "grad_norm": 9.966535999412827, "kl": 0.333984375, "learning_rate": 9.97238789649022e-07, "loss": 0.013354520313441753, "memory(GiB)": 77.29, "reward": 1.25, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 647, "train_speed(iter/s)": 0.012512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/mean_length": 555.8333740234375, "completions/min_length": 452.0, "epoch": 0.02685119960220445, "grad_norm": 2.6075965151240825, "kl": 0.214599609375, "learning_rate": 9.97227677249928e-07, "loss": 0.008561819791793823, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 648, "train_speed(iter/s)": 0.012523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/mean_length": 653.0, "completions/min_length": 529.0, "epoch": 0.026892636638627607, "grad_norm": 0.17532265236364414, "kl": 0.17529296875, "learning_rate": 9.97216542597139e-07, "loss": 0.007003895007073879, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 649, "train_speed(iter/s)": 0.012533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1257.0, "completions/mean_length": 690.0, "completions/min_length": 450.0, "epoch": 0.02693407367505076, "grad_norm": 2.4380940552049504, "kl": 0.16552734375, "learning_rate": 9.972053856911534e-07, "loss": 0.00662278663367033, "memory(GiB)": 77.29, "reward": 1.25, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 650, "train_speed(iter/s)": 0.012542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 877.0, "completions/mean_length": 728.4166870117188, "completions/min_length": 583.0, "epoch": 0.026975510711473916, "grad_norm": 2.5007620545294897, "kl": 0.15576171875, "learning_rate": 9.971942065324702e-07, "loss": 0.006217335816472769, "memory(GiB)": 77.29, "reward": 1.5416667461395264, "reward_std": 0.5822500586509705, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 651, "train_speed(iter/s)": 0.012553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1543.0, "completions/mean_length": 821.8333740234375, "completions/min_length": 573.0, "epoch": 0.02701694774789707, "grad_norm": 2.298761186372879, "kl": 0.150390625, "learning_rate": 9.971830051215903e-07, "loss": 0.006007587071508169, "memory(GiB)": 77.29, "reward": 1.2916667461395264, "reward_std": 0.6894772052764893, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 652, "train_speed(iter/s)": 0.01256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 837.0, "completions/mean_length": 572.1666870117188, "completions/min_length": 361.0, "epoch": 0.027058384784320225, "grad_norm": 0.2213889852350506, "kl": 0.1865234375, "learning_rate": 9.971717814590148e-07, "loss": 0.007474973332136869, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 653, "train_speed(iter/s)": 0.01257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/mean_length": 516.1666870117188, "completions/min_length": 405.0, "epoch": 0.02709982182074338, "grad_norm": 0.22839221785284247, "kl": 0.223388671875, "learning_rate": 9.971605355452457e-07, "loss": 0.008928644470870495, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 654, "train_speed(iter/s)": 0.012581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/mean_length": 725.4166870117188, "completions/min_length": 563.0, "epoch": 0.027141258857166537, "grad_norm": 2.3268247731888843, "kl": 0.1474609375, "learning_rate": 9.971492673807869e-07, "loss": 0.005893051624298096, "memory(GiB)": 77.29, "reward": 1.75, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 655, "train_speed(iter/s)": 0.012591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.0, "completions/mean_length": 616.9166870117188, "completions/min_length": 456.0, "epoch": 0.02718269589358969, "grad_norm": 2.8076034824526297, "kl": 0.182861328125, "learning_rate": 9.971379769661421e-07, "loss": 0.007316311355680227, "memory(GiB)": 77.29, "reward": 1.0833333730697632, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 656, "train_speed(iter/s)": 0.012601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/mean_length": 640.5833740234375, "completions/min_length": 421.0, "epoch": 0.027224132930012846, "grad_norm": 0.170987304881827, "kl": 0.156982421875, "learning_rate": 9.97126664301817e-07, "loss": 0.0062702796421945095, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 657, "train_speed(iter/s)": 0.012611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/mean_length": 631.75, "completions/min_length": 467.0, "epoch": 0.027265569966436, "grad_norm": 0.6742701773960375, "kl": 0.1544189453125, "learning_rate": 9.971153293883178e-07, "loss": 0.006187140010297298, "memory(GiB)": 77.29, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 658, "train_speed(iter/s)": 0.012618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.0, "completions/mean_length": 651.6666870117188, "completions/min_length": 442.0, "epoch": 0.027307007002859155, "grad_norm": 2.3679955388909097, "kl": 0.157958984375, "learning_rate": 9.971039722261518e-07, "loss": 0.006319021340459585, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 659, "train_speed(iter/s)": 0.012627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 916.0, "completions/mean_length": 639.6666870117188, "completions/min_length": 400.0, "epoch": 0.02734844403928231, "grad_norm": 3.0103418292924538, "kl": 0.165283203125, "learning_rate": 9.970925928158272e-07, "loss": 0.006611799355596304, "memory(GiB)": 77.29, "reward": 1.1666667461395264, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 660, "train_speed(iter/s)": 0.012637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 781.0, "completions/mean_length": 593.4166870117188, "completions/min_length": 443.0, "epoch": 0.027389881075705464, "grad_norm": 3.0886737787701293, "kl": 0.230224609375, "learning_rate": 9.970811911578537e-07, "loss": 0.00920896790921688, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 661, "train_speed(iter/s)": 0.012648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/mean_length": 499.66668701171875, "completions/min_length": 367.0, "epoch": 0.027431318112128622, "grad_norm": 3.5624422104560995, "kl": 0.2119140625, "learning_rate": 9.970697672527407e-07, "loss": 0.008468846790492535, "memory(GiB)": 77.29, "reward": 1.875, "reward_std": 0.4330126941204071, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 662, "train_speed(iter/s)": 0.012659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/mean_length": 621.1666870117188, "completions/min_length": 406.0, "epoch": 0.027472755148551777, "grad_norm": 0.2689023304898727, "kl": 0.174560546875, "learning_rate": 9.970583211010007e-07, "loss": 0.0069673634134233, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 663, "train_speed(iter/s)": 0.012669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 792.0, "completions/mean_length": 673.5, "completions/min_length": 531.0, "epoch": 0.02751419218497493, "grad_norm": 0.22774905829885192, "kl": 0.15087890625, "learning_rate": 9.97046852703145e-07, "loss": 0.006043684668838978, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 664, "train_speed(iter/s)": 0.012679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/mean_length": 809.0833740234375, "completions/min_length": 485.0, "epoch": 0.027555629221398086, "grad_norm": 258.8359486875519, "kl": 6.6912841796875, "learning_rate": 9.970353620596871e-07, "loss": 0.26777055859565735, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 665, "train_speed(iter/s)": 0.012688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/mean_length": 545.6666870117188, "completions/min_length": 489.0, "epoch": 0.02759706625782124, "grad_norm": 0.1901555392156992, "kl": 0.23486328125, "learning_rate": 9.970238491711415e-07, "loss": 0.00936332531273365, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 666, "train_speed(iter/s)": 0.012699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1798.0, "completions/mean_length": 811.5, "completions/min_length": 136.0, "epoch": 0.027638503294244395, "grad_norm": 18.286804248832702, "kl": 0.1717529296875, "learning_rate": 9.970123140380234e-07, "loss": 0.006865342613309622, "memory(GiB)": 77.29, "reward": 1.7083333730697632, "reward_std": 0.6894772052764893, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 667, "train_speed(iter/s)": 0.012706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 989.0, "completions/mean_length": 877.1666870117188, "completions/min_length": 780.0, "epoch": 0.02767994033066755, "grad_norm": 2160.4558317323213, "kl": 8.4180908203125, "learning_rate": 9.970007566608491e-07, "loss": 0.3384190499782562, "memory(GiB)": 77.29, "reward": 1.0833333730697632, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 668, "train_speed(iter/s)": 0.012716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/mean_length": 878.75, "completions/min_length": 626.0, "epoch": 0.027721377367090707, "grad_norm": 2.0599479286433446, "kl": 0.1322021484375, "learning_rate": 9.969891770401356e-07, "loss": 0.005291914101690054, "memory(GiB)": 77.29, "reward": 1.7083333730697632, "reward_std": 0.45016831159591675, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 669, "train_speed(iter/s)": 0.012725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/mean_length": 598.5833740234375, "completions/min_length": 477.0, "epoch": 0.02776281440351386, "grad_norm": 0.24892899813114297, "kl": 0.19873046875, "learning_rate": 9.969775751764014e-07, "loss": 0.007959566079080105, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 670, "train_speed(iter/s)": 0.012735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1178.0, "completions/mean_length": 725.0833740234375, "completions/min_length": 443.0, "epoch": 0.027804251439937016, "grad_norm": 2.4740328998368795, "kl": 0.1304931640625, "learning_rate": 9.969659510701657e-07, "loss": 0.005225449800491333, "memory(GiB)": 77.29, "reward": 1.3333333730697632, "reward_std": 0.4923659861087799, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 671, "train_speed(iter/s)": 0.012744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/mean_length": 811.1666870117188, "completions/min_length": 701.0, "epoch": 0.02784568847636017, "grad_norm": 4114.520560533321, "kl": 11.339599609375, "learning_rate": 9.969543047219486e-07, "loss": 0.4546031653881073, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 672, "train_speed(iter/s)": 0.012753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/mean_length": 775.9166870117188, "completions/min_length": 620.0, "epoch": 0.027887125512783325, "grad_norm": 2.444314338749845, "kl": 0.141845703125, "learning_rate": 9.969426361322714e-07, "loss": 0.005679105408489704, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 673, "train_speed(iter/s)": 0.012763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 967.0, "completions/mean_length": 810.0833740234375, "completions/min_length": 606.0, "epoch": 0.02792856254920648, "grad_norm": 0.41599587817413436, "kl": 0.122314453125, "learning_rate": 9.969309453016567e-07, "loss": 0.004889309406280518, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 674, "train_speed(iter/s)": 0.012772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/mean_length": 791.3333740234375, "completions/min_length": 637.0, "epoch": 0.027969999585629637, "grad_norm": 2.3566352330726756, "kl": 0.1163330078125, "learning_rate": 9.96919232230627e-07, "loss": 0.004653652664273977, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 675, "train_speed(iter/s)": 0.012781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/mean_length": 577.1666870117188, "completions/min_length": 498.0, "epoch": 0.028011436622052792, "grad_norm": 2.7183634770054734, "kl": 0.219482421875, "learning_rate": 9.969074969197071e-07, "loss": 0.008770495653152466, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 676, "train_speed(iter/s)": 0.012791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1400.0833740234375, "completions/min_length": 604.0, "epoch": 0.028052873658475946, "grad_norm": 2.0534957659860735, "kl": 0.12109375, "learning_rate": 9.968957393694222e-07, "loss": -0.1432676613330841, "memory(GiB)": 77.29, "reward": 1.0, "reward_std": 0.5640760660171509, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3256694972515106, "step": 677, "train_speed(iter/s)": 0.012767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/mean_length": 725.0833740234375, "completions/min_length": 500.0, "epoch": 0.0280943106948991, "grad_norm": 0.2356225574537096, "kl": 0.132080078125, "learning_rate": 9.968839595802981e-07, "loss": 0.005292870104312897, "memory(GiB)": 77.29, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 678, "train_speed(iter/s)": 0.012777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/mean_length": 650.9166870117188, "completions/min_length": 526.0, "epoch": 0.028135747731322255, "grad_norm": 0.5113669659841531, "kl": 0.1278076171875, "learning_rate": 9.968721575528625e-07, "loss": 0.005124017130583525, "memory(GiB)": 77.29, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 679, "train_speed(iter/s)": 0.012787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 976.0, "completions/mean_length": 762.6666870117188, "completions/min_length": 596.0, "epoch": 0.02817718476774541, "grad_norm": 2.227292311294397, "kl": 0.137939453125, "learning_rate": 9.968603332876432e-07, "loss": 0.0055214762687683105, "memory(GiB)": 77.29, "reward": 1.5416667461395264, "reward_std": 0.5822500586509705, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 680, "train_speed(iter/s)": 0.012796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1336.0, "completions/mean_length": 924.8333740234375, "completions/min_length": 726.0, "epoch": 0.028218621804168564, "grad_norm": 0.4938759500655172, "kl": 0.127685546875, "learning_rate": 9.968484867851697e-07, "loss": 0.005112603306770325, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 681, "train_speed(iter/s)": 0.012804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 920.0, "completions/mean_length": 738.1666870117188, "completions/min_length": 594.0, "epoch": 0.028260058840591722, "grad_norm": 3.210938403615743, "kl": 0.148681640625, "learning_rate": 9.968366180459723e-07, "loss": 0.005946457386016846, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 682, "train_speed(iter/s)": 0.012814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/mean_length": 633.4166870117188, "completions/min_length": 433.0, "epoch": 0.028301495877014877, "grad_norm": 2.9159033031633066, "kl": 0.15966796875, "learning_rate": 9.968247270705817e-07, "loss": 0.0063749453984200954, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 683, "train_speed(iter/s)": 0.012823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/mean_length": 595.5833740234375, "completions/min_length": 517.0, "epoch": 0.02834293291343803, "grad_norm": 2.6815506041689305, "kl": 0.1904296875, "learning_rate": 9.968128138595302e-07, "loss": 0.0076114339753985405, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 684, "train_speed(iter/s)": 0.012834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1065.0, "completions/mean_length": 882.1666870117188, "completions/min_length": 573.0, "epoch": 0.028384369949861186, "grad_norm": 2.443471100778839, "kl": 0.120849609375, "learning_rate": 9.968008784133514e-07, "loss": 0.004840652458369732, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 685, "train_speed(iter/s)": 0.012843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1076.0, "completions/mean_length": 834.75, "completions/min_length": 568.0, "epoch": 0.02842580698628434, "grad_norm": 2.820713533273292, "kl": 0.165771484375, "learning_rate": 9.967889207325793e-07, "loss": 0.006625141017138958, "memory(GiB)": 77.29, "reward": 1.0416667461395264, "reward_std": 0.33427897095680237, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 686, "train_speed(iter/s)": 0.012852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1003.0, "completions/mean_length": 776.8333740234375, "completions/min_length": 483.0, "epoch": 0.028467244022707495, "grad_norm": 2.023370837882847, "kl": 0.1209716796875, "learning_rate": 9.967769408177488e-07, "loss": 0.004841755144298077, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 687, "train_speed(iter/s)": 0.012861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5400.0, "completions/mean_length": 1270.166748046875, "completions/min_length": 563.0, "epoch": 0.02850868105913065, "grad_norm": 2.2103628274839533, "kl": 0.1180419921875, "learning_rate": 9.967649386693964e-07, "loss": 0.004714548587799072, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 688, "train_speed(iter/s)": 0.012856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 918.0, "completions/mean_length": 724.8333740234375, "completions/min_length": 470.0, "epoch": 0.028550118095553807, "grad_norm": 0.23739680751628134, "kl": 0.140380859375, "learning_rate": 9.96752914288059e-07, "loss": 0.005614589434117079, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 689, "train_speed(iter/s)": 0.012865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 998.0, "completions/mean_length": 818.25, "completions/min_length": 640.0, "epoch": 0.028591555131976962, "grad_norm": 2.280175637850496, "kl": 0.15625, "learning_rate": 9.967408676742751e-07, "loss": 0.006263067480176687, "memory(GiB)": 77.29, "reward": 1.5833333730697632, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 690, "train_speed(iter/s)": 0.012874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1110.0, "completions/mean_length": 845.0833740234375, "completions/min_length": 623.0, "epoch": 0.028632992168400116, "grad_norm": 4.9311088960768865, "kl": 0.136962890625, "learning_rate": 9.967287988285835e-07, "loss": 0.005477726459503174, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 691, "train_speed(iter/s)": 0.012883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1319.0, "completions/mean_length": 820.6666870117188, "completions/min_length": 502.0, "epoch": 0.02867442920482327, "grad_norm": 2.646182402119625, "kl": 0.11669921875, "learning_rate": 9.967167077515243e-07, "loss": 0.004670421592891216, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 692, "train_speed(iter/s)": 0.012891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/mean_length": 588.25, "completions/min_length": 429.0, "epoch": 0.028715866241246425, "grad_norm": 3.655421869505893, "kl": 0.26806640625, "learning_rate": 9.967045944436393e-07, "loss": 0.010728051885962486, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 693, "train_speed(iter/s)": 0.012901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1337.0, "completions/min_length": 546.0, "epoch": 0.02875730327766958, "grad_norm": 1.498987922752754, "kl": 0.1416015625, "learning_rate": 9.966924589054696e-07, "loss": -0.25941765308380127, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.5773502588272095, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 694, "train_speed(iter/s)": 0.012888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/mean_length": 775.8333740234375, "completions/min_length": 543.0, "epoch": 0.028798740314092738, "grad_norm": 0.284276643625608, "kl": 0.1480712890625, "learning_rate": 9.966803011375594e-07, "loss": 0.0059271762147545815, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 695, "train_speed(iter/s)": 0.012898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/mean_length": 730.3333740234375, "completions/min_length": 473.0, "epoch": 0.028840177350515892, "grad_norm": 0.37665227174959554, "kl": 0.1376953125, "learning_rate": 9.96668121140452e-07, "loss": 0.005507954861968756, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 696, "train_speed(iter/s)": 0.012907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1256.0, "completions/mean_length": 776.25, "completions/min_length": 577.0, "epoch": 0.028881614386939047, "grad_norm": 0.1621864727873889, "kl": 0.1285400390625, "learning_rate": 9.96655918914693e-07, "loss": 0.00513082928955555, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 697, "train_speed(iter/s)": 0.012916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/mean_length": 630.9166870117188, "completions/min_length": 443.0, "epoch": 0.0289230514233622, "grad_norm": 2.6653857547778474, "kl": 0.1396484375, "learning_rate": 9.966436944608283e-07, "loss": 0.0055931806564331055, "memory(GiB)": 77.29, "reward": 1.75, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 698, "train_speed(iter/s)": 0.012926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/mean_length": 671.6666870117188, "completions/min_length": 515.0, "epoch": 0.028964488459785356, "grad_norm": 2.5967205408552845, "kl": 0.1611328125, "learning_rate": 9.966314477794052e-07, "loss": 0.006438056938350201, "memory(GiB)": 77.29, "reward": 0.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 699, "train_speed(iter/s)": 0.012933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/mean_length": 536.5, "completions/min_length": 324.0, "epoch": 0.02900592549620851, "grad_norm": 3.7406510129242845, "kl": 0.228759765625, "learning_rate": 9.966191788709714e-07, "loss": 0.009131908416748047, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 700, "train_speed(iter/s)": 0.012943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/mean_length": 563.4166870117188, "completions/min_length": 421.0, "epoch": 0.029047362532631665, "grad_norm": 2.5940125082723013, "kl": 0.214111328125, "learning_rate": 9.966068877360765e-07, "loss": 0.008566677570343018, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 701, "train_speed(iter/s)": 0.012953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/mean_length": 762.8333740234375, "completions/min_length": 492.0, "epoch": 0.029088799569054823, "grad_norm": 2.149947234401585, "kl": 0.1239013671875, "learning_rate": 9.965945743752705e-07, "loss": 0.004952549934387207, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 702, "train_speed(iter/s)": 0.012962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/mean_length": 557.4166870117188, "completions/min_length": 417.0, "epoch": 0.029130236605477977, "grad_norm": 3.1886131000621805, "kl": 0.1640625, "learning_rate": 9.96582238789104e-07, "loss": 0.006565918680280447, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 703, "train_speed(iter/s)": 0.012972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/mean_length": 575.6666870117188, "completions/min_length": 474.0, "epoch": 0.02917167364190113, "grad_norm": 2.476114557077495, "kl": 0.14990234375, "learning_rate": 9.965698809781298e-07, "loss": 0.00599437952041626, "memory(GiB)": 77.29, "reward": 0.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 704, "train_speed(iter/s)": 0.012982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/mean_length": 638.75, "completions/min_length": 409.0, "epoch": 0.029213110678324286, "grad_norm": 2.7543427970591456, "kl": 0.175537109375, "learning_rate": 9.965575009429005e-07, "loss": 0.007006734609603882, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 705, "train_speed(iter/s)": 0.012992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.0, "completions/mean_length": 723.75, "completions/min_length": 426.0, "epoch": 0.02925454771474744, "grad_norm": 2.3149201213312383, "kl": 0.174072265625, "learning_rate": 9.965450986839702e-07, "loss": 0.006952430121600628, "memory(GiB)": 77.29, "reward": 1.75, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 706, "train_speed(iter/s)": 0.013001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/mean_length": 597.6666870117188, "completions/min_length": 415.0, "epoch": 0.029295984751170595, "grad_norm": 3.7247756097859073, "kl": 0.187744140625, "learning_rate": 9.965326742018942e-07, "loss": 0.007502786815166473, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 707, "train_speed(iter/s)": 0.013011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/mean_length": 545.5, "completions/min_length": 415.0, "epoch": 0.02933742178759375, "grad_norm": 2.9923053398765784, "kl": 0.2177734375, "learning_rate": 9.965202274972286e-07, "loss": 0.008718709461390972, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 708, "train_speed(iter/s)": 0.01302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4809.0, "completions/mean_length": 1052.916748046875, "completions/min_length": 527.0, "epoch": 0.029378858824016908, "grad_norm": 1.2922800776869638, "kl": 0.146484375, "learning_rate": 9.965077585705301e-07, "loss": 0.0058714053593575954, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 709, "train_speed(iter/s)": 0.013017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/mean_length": 545.9166870117188, "completions/min_length": 357.0, "epoch": 0.029420295860440062, "grad_norm": 2.5317693429777566, "kl": 0.1561279296875, "learning_rate": 9.96495267422357e-07, "loss": 0.006234586238861084, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 710, "train_speed(iter/s)": 0.013027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1005.0, "completions/mean_length": 650.3333740234375, "completions/min_length": 447.0, "epoch": 0.029461732896863217, "grad_norm": 2.922104713585102, "kl": 0.145263671875, "learning_rate": 9.964827540532684e-07, "loss": 0.005828981753438711, "memory(GiB)": 77.29, "reward": 1.5833333730697632, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 711, "train_speed(iter/s)": 0.013036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/mean_length": 686.4166870117188, "completions/min_length": 483.0, "epoch": 0.02950316993328637, "grad_norm": 3.462815741828153, "kl": 0.139892578125, "learning_rate": 9.964702184638243e-07, "loss": 0.005609422922134399, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 712, "train_speed(iter/s)": 0.013045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/mean_length": 669.8333740234375, "completions/min_length": 516.0, "epoch": 0.029544606969709526, "grad_norm": 2.5894468262793873, "kl": 0.151123046875, "learning_rate": 9.964576606545856e-07, "loss": 0.006050353404134512, "memory(GiB)": 77.29, "reward": 1.5833333730697632, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 713, "train_speed(iter/s)": 0.013054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/mean_length": 603.5, "completions/min_length": 433.0, "epoch": 0.02958604400613268, "grad_norm": 2.6265483682550133, "kl": 0.16650390625, "learning_rate": 9.964450806261144e-07, "loss": 0.0066565475426614285, "memory(GiB)": 77.29, "reward": 1.6666667461395264, "reward_std": 0.4923659861087799, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 714, "train_speed(iter/s)": 0.013064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/mean_length": 489.66668701171875, "completions/min_length": 319.0, "epoch": 0.029627481042555838, "grad_norm": 3.7034733342116244, "kl": 0.21044921875, "learning_rate": 9.964324783789737e-07, "loss": 0.008415868505835533, "memory(GiB)": 77.29, "reward": 0.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 715, "train_speed(iter/s)": 0.013074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/mean_length": 637.5833740234375, "completions/min_length": 494.0, "epoch": 0.029668918078978992, "grad_norm": 138.5196865552705, "kl": 0.89501953125, "learning_rate": 9.964198539137277e-07, "loss": 0.035825710743665695, "memory(GiB)": 77.29, "reward": 1.0833333730697632, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 716, "train_speed(iter/s)": 0.013083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/mean_length": 523.6666870117188, "completions/min_length": 389.0, "epoch": 0.029710355115402147, "grad_norm": 3.25227681111143, "kl": 0.2373046875, "learning_rate": 9.96407207230941e-07, "loss": 0.009464293718338013, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 717, "train_speed(iter/s)": 0.013093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/mean_length": 473.25, "completions/min_length": 90.0, "epoch": 0.0297517921518253, "grad_norm": 2715.096156373681, "kl": 63.161376953125, "learning_rate": 9.963945383311803e-07, "loss": 2.5269947052001953, "memory(GiB)": 77.29, "reward": 1.7916667461395264, "reward_std": 0.5822500586509705, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 718, "train_speed(iter/s)": 0.013103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/mean_length": 634.5, "completions/min_length": 496.0, "epoch": 0.029793229188248456, "grad_norm": 2.9205096859520085, "kl": 0.1773681640625, "learning_rate": 9.96381847215012e-07, "loss": 0.007122318260371685, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 719, "train_speed(iter/s)": 0.013112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1023.0, "completions/mean_length": 660.25, "completions/min_length": 524.0, "epoch": 0.02983466622467161, "grad_norm": 2.756801349971226, "kl": 0.174072265625, "learning_rate": 9.963691338830042e-07, "loss": 0.006953339092433453, "memory(GiB)": 77.29, "reward": 1.75, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 720, "train_speed(iter/s)": 0.013121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.0, "completions/mean_length": 638.5, "completions/min_length": 498.0, "epoch": 0.029876103261094765, "grad_norm": 0.2668823097864816, "kl": 0.174072265625, "learning_rate": 9.96356398335726e-07, "loss": 0.006960154511034489, "memory(GiB)": 77.29, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 721, "train_speed(iter/s)": 0.013131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/mean_length": 623.3333740234375, "completions/min_length": 402.0, "epoch": 0.029917540297517923, "grad_norm": 0.16671544151752368, "kl": 0.195068359375, "learning_rate": 9.963436405737475e-07, "loss": 0.007790262810885906, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 722, "train_speed(iter/s)": 0.01314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/mean_length": 605.0, "completions/min_length": 440.0, "epoch": 0.029958977333941077, "grad_norm": 0.17132387860428672, "kl": 0.16455078125, "learning_rate": 9.963308605976396e-07, "loss": 0.006572184152901173, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 723, "train_speed(iter/s)": 0.01315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 790.0, "completions/mean_length": 614.25, "completions/min_length": 499.0, "epoch": 0.030000414370364232, "grad_norm": 2.3943858368269577, "kl": 0.19287109375, "learning_rate": 9.96318058407974e-07, "loss": 0.007722049951553345, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 724, "train_speed(iter/s)": 0.013159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/mean_length": 569.3333740234375, "completions/min_length": 475.0, "epoch": 0.030041851406787386, "grad_norm": 3.0436103614962646, "kl": 0.1611328125, "learning_rate": 9.96305234005324e-07, "loss": 0.00643921410664916, "memory(GiB)": 77.29, "reward": 1.6666667461395264, "reward_std": 0.4923659861087799, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 725, "train_speed(iter/s)": 0.013169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/mean_length": 631.1666870117188, "completions/min_length": 522.0, "epoch": 0.03008328844321054, "grad_norm": 0.17965003421191217, "kl": 0.183349609375, "learning_rate": 9.962923873902636e-07, "loss": 0.007354232482612133, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 726, "train_speed(iter/s)": 0.013178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 837.0, "completions/mean_length": 611.25, "completions/min_length": 515.0, "epoch": 0.030124725479633695, "grad_norm": 0.16673851049482183, "kl": 0.179931640625, "learning_rate": 9.962795185633675e-07, "loss": 0.007186560891568661, "memory(GiB)": 77.29, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 727, "train_speed(iter/s)": 0.013188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/mean_length": 518.5833740234375, "completions/min_length": 438.0, "epoch": 0.030166162516056853, "grad_norm": 0.25895291911754587, "kl": 0.2529296875, "learning_rate": 9.962666275252115e-07, "loss": 0.010106499306857586, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 728, "train_speed(iter/s)": 0.013192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/mean_length": 729.75, "completions/min_length": 557.0, "epoch": 0.030207599552480008, "grad_norm": 2.4673672549536287, "kl": 0.16162109375, "learning_rate": 9.962537142763732e-07, "loss": 0.006458719726651907, "memory(GiB)": 77.29, "reward": 1.25, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 729, "train_speed(iter/s)": 0.013201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/mean_length": 546.6666870117188, "completions/min_length": 424.0, "epoch": 0.030249036588903162, "grad_norm": 3.3010530196455337, "kl": 0.217041015625, "learning_rate": 9.9624077881743e-07, "loss": 0.008672615513205528, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 730, "train_speed(iter/s)": 0.01321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/mean_length": 522.1666870117188, "completions/min_length": 380.0, "epoch": 0.030290473625326317, "grad_norm": 1.7498919552135752, "kl": 0.211669921875, "learning_rate": 9.96227821148961e-07, "loss": 0.008448420092463493, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 731, "train_speed(iter/s)": 0.01322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/mean_length": 564.4166870117188, "completions/min_length": 436.0, "epoch": 0.03033191066174947, "grad_norm": 0.1775627929101283, "kl": 0.173828125, "learning_rate": 9.962148412715463e-07, "loss": 0.006961186416447163, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 732, "train_speed(iter/s)": 0.01323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.0, "completions/mean_length": 681.75, "completions/min_length": 462.0, "epoch": 0.030373347698172626, "grad_norm": 2.3973159367471903, "kl": 0.171630859375, "learning_rate": 9.962018391857665e-07, "loss": 0.006867885589599609, "memory(GiB)": 77.29, "reward": 1.1666667461395264, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 733, "train_speed(iter/s)": 0.013239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/mean_length": 523.5833740234375, "completions/min_length": 417.0, "epoch": 0.03041478473459578, "grad_norm": 0.20275175507948923, "kl": 0.231689453125, "learning_rate": 9.961888148922035e-07, "loss": 0.009272966533899307, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 734, "train_speed(iter/s)": 0.013248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/mean_length": 623.3333740234375, "completions/min_length": 472.0, "epoch": 0.03045622177101894, "grad_norm": 0.23521555294992338, "kl": 0.157470703125, "learning_rate": 9.961757683914405e-07, "loss": 0.006293478421866894, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 735, "train_speed(iter/s)": 0.013257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/mean_length": 544.1666870117188, "completions/min_length": 451.0, "epoch": 0.030497658807442093, "grad_norm": 0.2502028794148434, "kl": 0.172119140625, "learning_rate": 9.961626996840613e-07, "loss": 0.006888481788337231, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 736, "train_speed(iter/s)": 0.013267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1064.0, "completions/mean_length": 768.1666870117188, "completions/min_length": 521.0, "epoch": 0.030539095843865247, "grad_norm": 2.660506287099609, "kl": 0.14501953125, "learning_rate": 9.961496087706505e-07, "loss": 0.005817483179271221, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 737, "train_speed(iter/s)": 0.013275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/mean_length": 674.8333740234375, "completions/min_length": 423.0, "epoch": 0.030580532880288402, "grad_norm": 2.638519568406308, "kl": 0.146484375, "learning_rate": 9.961364956517946e-07, "loss": 0.005863606929779053, "memory(GiB)": 77.29, "reward": 1.5833333730697632, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 738, "train_speed(iter/s)": 0.013284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1132.5833740234375, "completions/min_length": 382.0, "epoch": 0.030621969916711556, "grad_norm": 1.5372345551650684, "kl": 0.18896484375, "learning_rate": 9.9612336032808e-07, "loss": -0.2575892508029938, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.5773502588272095, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 739, "train_speed(iter/s)": 0.013271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.0, "completions/mean_length": 592.8333740234375, "completions/min_length": 470.0, "epoch": 0.03066340695313471, "grad_norm": 2.313192111841091, "kl": 0.1304931640625, "learning_rate": 9.961102028000947e-07, "loss": 0.005212932825088501, "memory(GiB)": 77.29, "reward": 1.75, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 740, "train_speed(iter/s)": 0.01328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1226.0, "completions/mean_length": 623.0, "completions/min_length": 465.0, "epoch": 0.030704843989557865, "grad_norm": 0.2309289256042439, "kl": 0.19677734375, "learning_rate": 9.960970230684275e-07, "loss": 0.007867051288485527, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 741, "train_speed(iter/s)": 0.013288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/mean_length": 605.5833740234375, "completions/min_length": 483.0, "epoch": 0.030746281025981023, "grad_norm": 0.18773063325002826, "kl": 0.14990234375, "learning_rate": 9.960838211336683e-07, "loss": 0.005971540696918964, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 742, "train_speed(iter/s)": 0.013297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 775.0, "completions/mean_length": 608.5, "completions/min_length": 433.0, "epoch": 0.030787718062404178, "grad_norm": 0.6308845713913533, "kl": 0.1451416015625, "learning_rate": 9.960705969964082e-07, "loss": 0.005809442605823278, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 743, "train_speed(iter/s)": 0.013306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/mean_length": 631.5, "completions/min_length": 469.0, "epoch": 0.030829155098827332, "grad_norm": 2.740909481131706, "kl": 0.1455078125, "learning_rate": 9.960573506572389e-07, "loss": 0.005820175167173147, "memory(GiB)": 77.29, "reward": 1.2083333730697632, "reward_std": 0.49810245633125305, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 744, "train_speed(iter/s)": 0.013315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/mean_length": 587.5, "completions/min_length": 440.0, "epoch": 0.030870592135250487, "grad_norm": 2.608982971894611, "kl": 0.192138671875, "learning_rate": 9.96044082116753e-07, "loss": 0.007683436386287212, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 745, "train_speed(iter/s)": 0.013321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1154.0, "completions/mean_length": 579.9166870117188, "completions/min_length": 299.0, "epoch": 0.03091202917167364, "grad_norm": 4.520783533689034, "kl": 0.248291015625, "learning_rate": 9.960307913755448e-07, "loss": 0.009944329969584942, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.5773502588272095, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 746, "train_speed(iter/s)": 0.01333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/mean_length": 526.1666870117188, "completions/min_length": 330.0, "epoch": 0.030953466208096796, "grad_norm": 0.18310446251014897, "kl": 0.204833984375, "learning_rate": 9.960174784342087e-07, "loss": 0.008194666355848312, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 747, "train_speed(iter/s)": 0.013339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/mean_length": 518.5833740234375, "completions/min_length": 374.0, "epoch": 0.030994903244519954, "grad_norm": 0.18179461494536125, "kl": 0.1640625, "learning_rate": 9.960041432933408e-07, "loss": 0.006555275525897741, "memory(GiB)": 77.29, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 748, "train_speed(iter/s)": 0.013349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/mean_length": 557.3333740234375, "completions/min_length": 431.0, "epoch": 0.031036340280943108, "grad_norm": 0.21759249307365736, "kl": 0.153076171875, "learning_rate": 9.959907859535378e-07, "loss": 0.006126761436462402, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 749, "train_speed(iter/s)": 0.013359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/mean_length": 618.3333740234375, "completions/min_length": 444.0, "epoch": 0.031077777317366263, "grad_norm": 3.0531860749276927, "kl": 0.14990234375, "learning_rate": 9.959774064153975e-07, "loss": 0.00598482321947813, "memory(GiB)": 77.29, "reward": 1.7083333730697632, "reward_std": 0.5418123602867126, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 750, "train_speed(iter/s)": 0.013367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 879.0, "completions/mean_length": 680.5833740234375, "completions/min_length": 561.0, "epoch": 0.031119214353789417, "grad_norm": 0.20200183323592613, "kl": 0.148681640625, "learning_rate": 9.95964004679519e-07, "loss": 0.00595038291066885, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 751, "train_speed(iter/s)": 0.013376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/mean_length": 574.0, "completions/min_length": 370.0, "epoch": 0.03116065139021257, "grad_norm": 0.23244425992457715, "kl": 0.189208984375, "learning_rate": 9.959505807465015e-07, "loss": 0.0075599271804094315, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 752, "train_speed(iter/s)": 0.013385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/mean_length": 654.75, "completions/min_length": 522.0, "epoch": 0.031202088426635726, "grad_norm": 2.687093831065294, "kl": 0.148681640625, "learning_rate": 9.959371346169465e-07, "loss": 0.005935609340667725, "memory(GiB)": 77.29, "reward": 1.4583333730697632, "reward_std": 0.49810245633125305, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 753, "train_speed(iter/s)": 0.013395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 8001.0, "completions/mean_length": 2017.25, "completions/min_length": 490.0, "epoch": 0.03124352546305888, "grad_norm": 7936.139209822281, "kl": 244.689697265625, "learning_rate": 9.959236662914553e-07, "loss": 9.473520278930664, "memory(GiB)": 77.29, "reward": 1.625, "reward_std": 0.772392988204956, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.3964807391166687, "step": 754, "train_speed(iter/s)": 0.01338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/mean_length": 581.8333740234375, "completions/min_length": 325.0, "epoch": 0.03128496249948204, "grad_norm": 2.8587103398224283, "kl": 0.163330078125, "learning_rate": 9.959101757706308e-07, "loss": 0.0065238079987466335, "memory(GiB)": 77.29, "reward": 1.6666667461395264, "reward_std": 0.4923659861087799, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 755, "train_speed(iter/s)": 0.013389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 820.0, "completions/mean_length": 569.4166870117188, "completions/min_length": 403.0, "epoch": 0.03132639953590519, "grad_norm": 3.022003202591665, "kl": 0.1531982421875, "learning_rate": 9.95896663055077e-07, "loss": 0.0061383843421936035, "memory(GiB)": 77.29, "reward": 1.5416667461395264, "reward_std": 0.5822500586509705, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 756, "train_speed(iter/s)": 0.013398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/mean_length": 582.1666870117188, "completions/min_length": 422.0, "epoch": 0.03136783657232835, "grad_norm": 3.1932172502700595, "kl": 0.17529296875, "learning_rate": 9.958831281453981e-07, "loss": 0.007022365927696228, "memory(GiB)": 77.29, "reward": 1.5833333730697632, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 757, "train_speed(iter/s)": 0.013407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/mean_length": 620.3333740234375, "completions/min_length": 446.0, "epoch": 0.0314092736087515, "grad_norm": 0.37944406120745366, "kl": 0.166259765625, "learning_rate": 9.958695710422006e-07, "loss": 0.0066432892344892025, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 758, "train_speed(iter/s)": 0.013416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/mean_length": 566.0, "completions/min_length": 395.0, "epoch": 0.03145071064517466, "grad_norm": 6.8634240049305415, "kl": 0.346435546875, "learning_rate": 9.958559917460907e-07, "loss": 0.013770481571555138, "memory(GiB)": 77.29, "reward": 1.25, "reward_std": 0.5838742256164551, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 759, "train_speed(iter/s)": 0.013425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 853.0, "completions/mean_length": 578.75, "completions/min_length": 420.0, "epoch": 0.03149214768159781, "grad_norm": 0.18805753743298806, "kl": 0.1339111328125, "learning_rate": 9.958423902576762e-07, "loss": 0.005342703312635422, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 760, "train_speed(iter/s)": 0.013433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1596.0, "completions/mean_length": 846.8333740234375, "completions/min_length": 558.0, "epoch": 0.031533584718020966, "grad_norm": 3.3285884245880264, "kl": 0.11767578125, "learning_rate": 9.958287665775662e-07, "loss": 0.004709403030574322, "memory(GiB)": 77.29, "reward": 1.5, "reward_std": 0.5222329497337341, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 761, "train_speed(iter/s)": 0.01344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/mean_length": 645.5, "completions/min_length": 426.0, "epoch": 0.03157502175444412, "grad_norm": 121.88868887636295, "kl": 0.986572265625, "learning_rate": 9.958151207063703e-07, "loss": 0.03960942104458809, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 762, "train_speed(iter/s)": 0.013448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1593.0, "completions/mean_length": 734.75, "completions/min_length": 456.0, "epoch": 0.031616458790867275, "grad_norm": 2.704125854940902, "kl": 0.150390625, "learning_rate": 9.95801452644699e-07, "loss": 0.00601448118686676, "memory(GiB)": 77.29, "reward": 1.6666667461395264, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.33709993958473206, "step": 763, "train_speed(iter/s)": 0.013453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/mean_length": 541.0, "completions/min_length": 380.0, "epoch": 0.03165789582729043, "grad_norm": 3.0519890106989576, "kl": 0.17236328125, "learning_rate": 9.95787762393164e-07, "loss": 0.006897717714309692, "memory(GiB)": 77.29, "reward": 1.5, "reward_std": 0.5640760660171509, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 764, "train_speed(iter/s)": 0.013463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/mean_length": 482.5833435058594, "completions/min_length": 343.0, "epoch": 0.03169933286371359, "grad_norm": 0.22100626900669862, "kl": 0.213134765625, "learning_rate": 9.957740499523785e-07, "loss": 0.008537273854017258, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 765, "train_speed(iter/s)": 0.013472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 7998.0, "completions/mean_length": 1160.5, "completions/min_length": 410.0, "epoch": 0.031740769900136745, "grad_norm": 2.8981309924441945, "kl": 0.204345703125, "learning_rate": 9.957603153229559e-07, "loss": -0.16539065539836884, "memory(GiB)": 77.29, "reward": 1.5833333730697632, "reward_std": 0.7637625932693481, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.39886200428009033, "step": 766, "train_speed(iter/s)": 0.013455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/mean_length": 546.5833740234375, "completions/min_length": 339.0, "epoch": 0.0317822069365599, "grad_norm": 2.9533570085158374, "kl": 0.21240234375, "learning_rate": 9.957465585055107e-07, "loss": 0.00849737785756588, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.24618299305438995, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 767, "train_speed(iter/s)": 0.013465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/mean_length": 570.75, "completions/min_length": 380.0, "epoch": 0.031823643972983054, "grad_norm": 2.893794906697116, "kl": 0.1483154296875, "learning_rate": 9.957327795006588e-07, "loss": 0.005935927387326956, "memory(GiB)": 77.29, "reward": 1.3333333730697632, "reward_std": 0.6154574751853943, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 768, "train_speed(iter/s)": 0.013474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 880.0, "completions/mean_length": 651.75, "completions/min_length": 466.0, "epoch": 0.03186508100940621, "grad_norm": 2.7447338309196114, "kl": 0.1502685546875, "learning_rate": 9.95718978309017e-07, "loss": 0.006007254123687744, "memory(GiB)": 77.29, "reward": 1.2916667461395264, "reward_std": 0.5418123602867126, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 769, "train_speed(iter/s)": 0.013483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 970.0, "completions/mean_length": 643.4166870117188, "completions/min_length": 463.0, "epoch": 0.03190651804582936, "grad_norm": 2.7174441607289306, "kl": 0.1358642578125, "learning_rate": 9.957051549312027e-07, "loss": 0.005444060079753399, "memory(GiB)": 77.29, "reward": 1.375, "reward_std": 0.5690901875495911, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 770, "train_speed(iter/s)": 0.013491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 894.0, "completions/mean_length": 632.3333740234375, "completions/min_length": 390.0, "epoch": 0.03194795508225252, "grad_norm": 2.992821548830488, "kl": 0.1607666015625, "learning_rate": 9.956913093678348e-07, "loss": 0.006425082683563232, "memory(GiB)": 77.29, "reward": 1.7916667461395264, "reward_std": 0.3964807391166687, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 771, "train_speed(iter/s)": 0.0135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/mean_length": 708.1666870117188, "completions/min_length": 447.0, "epoch": 0.03198939211867567, "grad_norm": 2.612829655805734, "kl": 0.13134765625, "learning_rate": 9.956774416195329e-07, "loss": 0.005249351263046265, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 772, "train_speed(iter/s)": 0.013508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/mean_length": 559.5, "completions/min_length": 448.0, "epoch": 0.032030829155098826, "grad_norm": 2.6783059487219525, "kl": 0.142578125, "learning_rate": 9.956635516869175e-07, "loss": 0.005687147378921509, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 773, "train_speed(iter/s)": 0.013517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 967.0, "completions/mean_length": 608.5833740234375, "completions/min_length": 445.0, "epoch": 0.03207226619152198, "grad_norm": 2.263660148415105, "kl": 0.190185546875, "learning_rate": 9.956496395706105e-07, "loss": 0.0076032779179513454, "memory(GiB)": 77.29, "reward": 1.875, "reward_std": 0.3107907772064209, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 774, "train_speed(iter/s)": 0.013526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/mean_length": 545.5833740234375, "completions/min_length": 409.0, "epoch": 0.032113703227945135, "grad_norm": 4.043412002355087, "kl": 0.1689453125, "learning_rate": 9.956357052712344e-07, "loss": 0.006749927997589111, "memory(GiB)": 77.29, "reward": 1.25, "reward_std": 0.39886200428009033, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 0.5833333134651184, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 775, "train_speed(iter/s)": 0.013535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/mean_length": 546.0833740234375, "completions/min_length": 433.0, "epoch": 0.03215514026436829, "grad_norm": 2.969439518556151, "kl": 0.169189453125, "learning_rate": 9.95621748789413e-07, "loss": 0.00677711796015501, "memory(GiB)": 77.29, "reward": 1.7916667461395264, "reward_std": 0.33427897095680237, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 776, "train_speed(iter/s)": 0.013544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/mean_length": 634.0833740234375, "completions/min_length": 445.0, "epoch": 0.032196577300791444, "grad_norm": 3.1044241865904496, "kl": 0.1455078125, "learning_rate": 9.956077701257707e-07, "loss": 0.0058433315716683865, "memory(GiB)": 77.29, "reward": 0.9583333730697632, "reward_std": 0.3964807391166687, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 777, "train_speed(iter/s)": 0.013553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 8001.0, "completions/mean_length": 1835.416748046875, "completions/min_length": 427.0, "epoch": 0.032238014337214606, "grad_norm": 2.1428615555814834, "kl": 0.16943359375, "learning_rate": 9.955937692809335e-07, "loss": -0.3511400818824768, "memory(GiB)": 77.29, "reward": 1.6666667461395264, "reward_std": 0.7784988880157471, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3892494738101959, "step": 778, "train_speed(iter/s)": 0.013538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 892.0, "completions/mean_length": 665.25, "completions/min_length": 493.0, "epoch": 0.03227945137363776, "grad_norm": 2.6762997210766026, "kl": 0.154296875, "learning_rate": 9.955797462555275e-07, "loss": 0.0061674416065216064, "memory(GiB)": 77.29, "reward": 0.9166666865348816, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 779, "train_speed(iter/s)": 0.013545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 954.0, "completions/mean_length": 670.0, "completions/min_length": 527.0, "epoch": 0.032320888410060915, "grad_norm": 2.984687289660248, "kl": 0.1484375, "learning_rate": 9.955657010501806e-07, "loss": 0.005925019737333059, "memory(GiB)": 77.29, "reward": 1.75, "reward_std": 0.39886200428009033, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 780, "train_speed(iter/s)": 0.013553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/mean_length": 570.25, "completions/min_length": 388.0, "epoch": 0.03236232544648407, "grad_norm": 3.0346192828903202, "kl": 0.219970703125, "learning_rate": 9.955516336655212e-07, "loss": 0.008796349167823792, "memory(GiB)": 77.29, "reward": 0.875, "reward_std": 0.22613351047039032, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 781, "train_speed(iter/s)": 0.013557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 880.0, "completions/mean_length": 687.8333740234375, "completions/min_length": 87.0, "epoch": 0.032403762482907224, "grad_norm": 6.060958002274858, "kl": 0.137451171875, "learning_rate": 9.955375441021793e-07, "loss": 0.005494832992553711, "memory(GiB)": 77.29, "reward": 1.6666667461395264, "reward_std": 0.685344398021698, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3256694972515106, "step": 782, "train_speed(iter/s)": 0.013565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/mean_length": 623.0, "completions/min_length": 459.0, "epoch": 0.03244519951933038, "grad_norm": 2.8093381027229594, "kl": 0.212646484375, "learning_rate": 9.955234323607851e-07, "loss": 0.008507758378982544, "memory(GiB)": 77.29, "reward": 0.8333333730697632, "reward_std": 0.24618299305438995, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 783, "train_speed(iter/s)": 0.013574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/mean_length": 629.1666870117188, "completions/min_length": 498.0, "epoch": 0.03248663655575353, "grad_norm": 2.710581645075791, "kl": 0.207763671875, "learning_rate": 9.955092984419705e-07, "loss": 0.008317073807120323, "memory(GiB)": 77.29, "reward": 0.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 784, "train_speed(iter/s)": 0.013584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/mean_length": 608.0833740234375, "completions/min_length": 442.0, "epoch": 0.03252807359217669, "grad_norm": 2.558355306741223, "kl": 0.185791015625, "learning_rate": 9.954951423463676e-07, "loss": 0.007431144826114178, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.24618299305438995, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 785, "train_speed(iter/s)": 0.013593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/mean_length": 620.1666870117188, "completions/min_length": 497.0, "epoch": 0.03256951062859984, "grad_norm": 0.19702383081953748, "kl": 0.167724609375, "learning_rate": 9.954809640746105e-07, "loss": 0.0067266011610627174, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 786, "train_speed(iter/s)": 0.013601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 940.0, "completions/mean_length": 589.4166870117188, "completions/min_length": 428.0, "epoch": 0.032610947665022996, "grad_norm": 2.823939866852503, "kl": 0.18994140625, "learning_rate": 9.954667636273334e-07, "loss": 0.007591386791318655, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 787, "train_speed(iter/s)": 0.013609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4371.0, "completions/mean_length": 910.3333740234375, "completions/min_length": 468.0, "epoch": 0.03265238470144615, "grad_norm": 1.7840793611868297, "kl": 0.169677734375, "learning_rate": 9.95452541005172e-07, "loss": 0.006787826772779226, "memory(GiB)": 77.29, "reward": 1.7916667461395264, "reward_std": 0.5822500586509705, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 788, "train_speed(iter/s)": 0.013607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/mean_length": 590.8333740234375, "completions/min_length": 446.0, "epoch": 0.032693821737869305, "grad_norm": 0.2080530368036711, "kl": 0.180419921875, "learning_rate": 9.954382962087627e-07, "loss": 0.007217449601739645, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 789, "train_speed(iter/s)": 0.013615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5085.0, "completions/mean_length": 881.3333740234375, "completions/min_length": 413.0, "epoch": 0.03273525877429246, "grad_norm": 4.191906787061106, "kl": 0.20458984375, "learning_rate": 9.954240292387433e-07, "loss": 0.008193433284759521, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 790, "train_speed(iter/s)": 0.01361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 844.0, "completions/mean_length": 646.0833740234375, "completions/min_length": 407.0, "epoch": 0.032776695810715614, "grad_norm": 3.2255989751091168, "kl": 0.166015625, "learning_rate": 9.95409740095752e-07, "loss": 0.006644467823207378, "memory(GiB)": 77.29, "reward": 1.6666667461395264, "reward_std": 0.4923659861087799, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 791, "train_speed(iter/s)": 0.013619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/mean_length": 502.0, "completions/min_length": 363.0, "epoch": 0.032818132847138776, "grad_norm": 4.0146700651803195, "kl": 0.258544921875, "learning_rate": 9.953954287804284e-07, "loss": 0.010311494581401348, "memory(GiB)": 77.29, "reward": 1.3333333730697632, "reward_std": 0.5365433692932129, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 792, "train_speed(iter/s)": 0.013628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/mean_length": 520.75, "completions/min_length": 384.0, "epoch": 0.03285956988356193, "grad_norm": 2.9747507122781074, "kl": 0.25390625, "learning_rate": 9.953810952934132e-07, "loss": 0.010178583674132824, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.32566946744918823, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 793, "train_speed(iter/s)": 0.013637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/mean_length": 442.0833435058594, "completions/min_length": 367.0, "epoch": 0.032901006919985085, "grad_norm": 3.375265314408748, "kl": 0.33447265625, "learning_rate": 9.95366739635348e-07, "loss": 0.013392250053584576, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.32566946744918823, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 794, "train_speed(iter/s)": 0.013647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1087.0, "completions/min_length": 276.0, "epoch": 0.03294244395640824, "grad_norm": 2.0163455665527454, "kl": 0.284912109375, "learning_rate": 9.953523618068748e-07, "loss": -0.24605785310268402, "memory(GiB)": 77.29, "reward": 1.7916667461395264, "reward_std": 0.5822500586509705, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 795, "train_speed(iter/s)": 0.013634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1283.25, "completions/min_length": 343.0, "epoch": 0.032983880992831394, "grad_norm": 1.7208173943086875, "kl": 0.28564453125, "learning_rate": 9.953379618086376e-07, "loss": -0.22422635555267334, "memory(GiB)": 77.29, "reward": 0.875, "reward_std": 0.3107907772064209, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 796, "train_speed(iter/s)": 0.01362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/mean_length": 604.5833740234375, "completions/min_length": 489.0, "epoch": 0.03302531802925455, "grad_norm": 0.19338623103633423, "kl": 0.200439453125, "learning_rate": 9.953235396412806e-07, "loss": 0.008009283803403378, "memory(GiB)": 77.29, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 797, "train_speed(iter/s)": 0.013629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1240.0, "completions/mean_length": 618.1666870117188, "completions/min_length": 439.0, "epoch": 0.0330667550656777, "grad_norm": 2.875384228881286, "kl": 0.258056640625, "learning_rate": 9.95309095305449e-07, "loss": 0.010302862152457237, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 798, "train_speed(iter/s)": 0.013636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1796.0, "completions/mean_length": 629.1666870117188, "completions/min_length": 296.0, "epoch": 0.03310819210210086, "grad_norm": 4.337120257090208, "kl": 0.203125, "learning_rate": 9.952946288017898e-07, "loss": 0.008145203813910484, "memory(GiB)": 77.29, "reward": 1.2916667461395264, "reward_std": 0.5418123602867126, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 799, "train_speed(iter/s)": 0.013642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/mean_length": 460.16668701171875, "completions/min_length": 298.0, "epoch": 0.03314962913852401, "grad_norm": 0.27442233004638644, "kl": 0.345703125, "learning_rate": 9.952801401309502e-07, "loss": 0.013856267556548119, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 800, "train_speed(iter/s)": 0.013651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/mean_length": 557.8333740234375, "completions/min_length": 419.0, "epoch": 0.033191066174947166, "grad_norm": 3.0367932443711028, "kl": 0.228759765625, "learning_rate": 9.952656292935788e-07, "loss": 0.00915489625185728, "memory(GiB)": 77.29, "reward": 1.2916667461395264, "reward_std": 0.45016831159591675, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 801, "train_speed(iter/s)": 0.01366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/mean_length": 528.6666870117188, "completions/min_length": 360.0, "epoch": 0.03323250321137032, "grad_norm": 4.2972270784021385, "kl": 0.23681640625, "learning_rate": 9.952510962903249e-07, "loss": 0.009487947449088097, "memory(GiB)": 77.29, "reward": 1.25, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 802, "train_speed(iter/s)": 0.013669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 8001.0, "completions/mean_length": 2381.666748046875, "completions/min_length": 401.0, "epoch": 0.033273940247793475, "grad_norm": 2.639119987551003, "kl": 0.25634765625, "learning_rate": 9.95236541121839e-07, "loss": -0.3530282974243164, "memory(GiB)": 77.29, "reward": 0.625, "reward_std": 0.4330126941204071, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.625, "rewards/FormatCorrectnessReward/std": 0.4330126941204071, "step": 803, "train_speed(iter/s)": 0.013652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1176.916748046875, "completions/min_length": 356.0, "epoch": 0.03331537728421663, "grad_norm": 2.764774946210307, "kl": 0.231689453125, "learning_rate": 9.952219637887725e-07, "loss": -0.1450652778148651, "memory(GiB)": 77.29, "reward": 1.2083333730697632, "reward_std": 0.65568608045578, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 804, "train_speed(iter/s)": 0.01364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/mean_length": 514.4166870117188, "completions/min_length": 343.0, "epoch": 0.03335681432063979, "grad_norm": 3.1968488609171937, "kl": 0.262939453125, "learning_rate": 9.952073642917777e-07, "loss": 0.0105082793161273, "memory(GiB)": 77.29, "reward": 1.625, "reward_std": 0.4826536476612091, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 805, "train_speed(iter/s)": 0.013649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/mean_length": 479.0, "completions/min_length": 261.0, "epoch": 0.033398251357062946, "grad_norm": 3.7614234651846483, "kl": 0.28662109375, "learning_rate": 9.951927426315081e-07, "loss": 0.01145862601697445, "memory(GiB)": 77.29, "reward": 1.875, "reward_std": 0.3107907772064209, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 806, "train_speed(iter/s)": 0.013658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/mean_length": 572.8333740234375, "completions/min_length": 396.0, "epoch": 0.0334396883934861, "grad_norm": 2.9853851322422518, "kl": 0.26611328125, "learning_rate": 9.951780988086183e-07, "loss": 0.01061907410621643, "memory(GiB)": 77.29, "reward": 1.4166667461395264, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 807, "train_speed(iter/s)": 0.013666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1474.0, "completions/mean_length": 594.1666870117188, "completions/min_length": 274.0, "epoch": 0.033481125429909254, "grad_norm": 5.572375460015627, "kl": 0.32666015625, "learning_rate": 9.951634328237634e-07, "loss": 0.013032625429332256, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 808, "train_speed(iter/s)": 0.013672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 405.25, "completions/min_length": 248.0, "epoch": 0.03352256246633241, "grad_norm": 4.369502324558235, "kl": 0.31787109375, "learning_rate": 9.951487446776e-07, "loss": 0.012721419334411621, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 809, "train_speed(iter/s)": 0.013682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 381.41668701171875, "completions/min_length": 268.0, "epoch": 0.033563999502755563, "grad_norm": 4.359057531503768, "kl": 0.3623046875, "learning_rate": 9.95134034370785e-07, "loss": 0.014461557380855083, "memory(GiB)": 77.29, "reward": 1.875, "reward_std": 0.22613351047039032, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 810, "train_speed(iter/s)": 0.013692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/mean_length": 444.16668701171875, "completions/min_length": 262.0, "epoch": 0.03360543653917872, "grad_norm": 0.2792435312993568, "kl": 0.37255859375, "learning_rate": 9.951193019039775e-07, "loss": 0.014921387657523155, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 811, "train_speed(iter/s)": 0.013701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1139.5833740234375, "completions/min_length": 345.0, "epoch": 0.03364687357560187, "grad_norm": 2.741864682336722, "kl": 0.3134765625, "learning_rate": 9.951045472778364e-07, "loss": -0.13024859130382538, "memory(GiB)": 77.29, "reward": 1.1666667461395264, "reward_std": 0.685344398021698, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3256694972515106, "step": 812, "train_speed(iter/s)": 0.013688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/mean_length": 413.66668701171875, "completions/min_length": 317.0, "epoch": 0.03368831061202503, "grad_norm": 3.228598821132227, "kl": 0.36962890625, "learning_rate": 9.95089770493022e-07, "loss": 0.014779637567698956, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 813, "train_speed(iter/s)": 0.013697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/mean_length": 528.0833740234375, "completions/min_length": 340.0, "epoch": 0.03372974764844818, "grad_norm": 0.25545834672648043, "kl": 0.32080078125, "learning_rate": 9.950749715501961e-07, "loss": 0.012828312814235687, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 814, "train_speed(iter/s)": 0.013706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/mean_length": 548.75, "completions/min_length": 378.0, "epoch": 0.033771184684871336, "grad_norm": 3.435600603163883, "kl": 0.31005859375, "learning_rate": 9.950601504500204e-07, "loss": 0.01238344144076109, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 815, "train_speed(iter/s)": 0.013715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 935.0, "completions/mean_length": 529.25, "completions/min_length": 397.0, "epoch": 0.03381262172129449, "grad_norm": 3.246455028057043, "kl": 0.32763671875, "learning_rate": 9.950453071931588e-07, "loss": 0.013097604736685753, "memory(GiB)": 77.29, "reward": 1.75, "reward_std": 0.39886200428009033, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 816, "train_speed(iter/s)": 0.013723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1312.0, "completions/mean_length": 618.9166870117188, "completions/min_length": 298.0, "epoch": 0.033854058757717645, "grad_norm": 3.30022445081843, "kl": 0.29443359375, "learning_rate": 9.950304417802753e-07, "loss": 0.011763354763388634, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 817, "train_speed(iter/s)": 0.01373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1214.916748046875, "completions/min_length": 469.0, "epoch": 0.033895495794140806, "grad_norm": 1.5327835439196147, "kl": 0.262451171875, "learning_rate": 9.95015554212035e-07, "loss": -0.2550123631954193, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.5773502588272095, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 818, "train_speed(iter/s)": 0.013716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1163.25, "completions/min_length": 411.0, "epoch": 0.03393693283056396, "grad_norm": 1.9490826256176033, "kl": 0.330078125, "learning_rate": 9.950006444891048e-07, "loss": -0.2523411810398102, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.5773502588272095, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 819, "train_speed(iter/s)": 0.013703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/mean_length": 574.1666870117188, "completions/min_length": 321.0, "epoch": 0.033978369866987115, "grad_norm": 3.349190141877714, "kl": 0.310546875, "learning_rate": 9.949857126121516e-07, "loss": 0.012389004230499268, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 820, "train_speed(iter/s)": 0.013712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1431.0, "completions/mean_length": 588.0, "completions/min_length": 314.0, "epoch": 0.03401980690341027, "grad_norm": 2.721157210499322, "kl": 0.36572265625, "learning_rate": 9.949707585818439e-07, "loss": 0.01460602879524231, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 821, "train_speed(iter/s)": 0.013719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1303.166748046875, "completions/min_length": 502.0, "epoch": 0.034061243939833424, "grad_norm": 1.8262864602902908, "kl": 0.30322265625, "learning_rate": 9.949557823988506e-07, "loss": -0.20223459601402283, "memory(GiB)": 77.29, "reward": 1.6666667461395264, "reward_std": 0.6513389348983765, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 822, "train_speed(iter/s)": 0.013705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1497.0, "completions/mean_length": 546.0, "completions/min_length": 333.0, "epoch": 0.03410268097625658, "grad_norm": 3.488249721048501, "kl": 0.37841796875, "learning_rate": 9.949407840638423e-07, "loss": 0.01513428520411253, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 823, "train_speed(iter/s)": 0.013712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1211.0, "completions/mean_length": 741.25, "completions/min_length": 458.0, "epoch": 0.03414411801267973, "grad_norm": 0.314314799418365, "kl": 0.283203125, "learning_rate": 9.9492576357749e-07, "loss": 0.011336103081703186, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 824, "train_speed(iter/s)": 0.013719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1057.0, "completions/mean_length": 665.6666870117188, "completions/min_length": 478.0, "epoch": 0.03418555504910289, "grad_norm": 0.24648558418234587, "kl": 0.30322265625, "learning_rate": 9.949107209404663e-07, "loss": 0.012136711739003658, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 825, "train_speed(iter/s)": 0.013726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2155.0, "completions/mean_length": 807.5, "completions/min_length": 413.0, "epoch": 0.03422699208552604, "grad_norm": 2.658649671190779, "kl": 0.316162109375, "learning_rate": 9.948956561534444e-07, "loss": 0.012627760879695415, "memory(GiB)": 77.29, "reward": 1.7916667461395264, "reward_std": 0.3964807391166687, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 826, "train_speed(iter/s)": 0.013731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 8001.0, "completions/mean_length": 1913.75, "completions/min_length": 419.0, "epoch": 0.0342684291219492, "grad_norm": 2.6331385851645304, "kl": 0.353515625, "learning_rate": 9.94880569217098e-07, "loss": -0.32129475474357605, "memory(GiB)": 77.29, "reward": 1.5833333730697632, "reward_std": 0.7929614782333374, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.45226702094078064, "step": 827, "train_speed(iter/s)": 0.013716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/mean_length": 609.0833740234375, "completions/min_length": 393.0, "epoch": 0.03430986615837235, "grad_norm": 0.25315593055375046, "kl": 0.3203125, "learning_rate": 9.94865460132103e-07, "loss": 0.012825514189898968, "memory(GiB)": 77.29, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 828, "train_speed(iter/s)": 0.013724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1383.0, "completions/mean_length": 691.4166870117188, "completions/min_length": 397.0, "epoch": 0.034351303194795506, "grad_norm": 0.2603940584347514, "kl": 0.30126953125, "learning_rate": 9.94850328899135e-07, "loss": 0.012018239125609398, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 829, "train_speed(iter/s)": 0.013731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1325.0, "completions/mean_length": 673.9166870117188, "completions/min_length": 421.0, "epoch": 0.03439274023121866, "grad_norm": 2.645924509641955, "kl": 0.31103515625, "learning_rate": 9.948351755188718e-07, "loss": 0.012459814548492432, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.3892494738101959, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 830, "train_speed(iter/s)": 0.013738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/mean_length": 564.0, "completions/min_length": 441.0, "epoch": 0.034434177267641815, "grad_norm": 3.51425879377333, "kl": 0.37646484375, "learning_rate": 9.948199999919912e-07, "loss": 0.015033086761832237, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 831, "train_speed(iter/s)": 0.013747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1054.0, "completions/mean_length": 717.9166870117188, "completions/min_length": 436.0, "epoch": 0.034475614304064976, "grad_norm": 2.403997006659054, "kl": 0.2626953125, "learning_rate": 9.948048023191726e-07, "loss": 0.010529398918151855, "memory(GiB)": 77.29, "reward": 1.0833333730697632, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 832, "train_speed(iter/s)": 0.013755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1195.25, "completions/min_length": 431.0, "epoch": 0.03451705134048813, "grad_norm": 1.6354156773944504, "kl": 0.322265625, "learning_rate": 9.947895825010961e-07, "loss": -0.25296059250831604, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.5773502588272095, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 833, "train_speed(iter/s)": 0.013741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1237.416748046875, "completions/min_length": 420.0, "epoch": 0.034558488376911285, "grad_norm": 2.1978039637184064, "kl": 0.3505859375, "learning_rate": 9.947743405384428e-07, "loss": -0.19740121066570282, "memory(GiB)": 77.29, "reward": 1.625, "reward_std": 0.6440284848213196, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.7083333134651184, "rewards/FormatCorrectnessReward/std": 0.45016834139823914, "step": 834, "train_speed(iter/s)": 0.013728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5506.0, "completions/mean_length": 1201.25, "completions/min_length": 451.0, "epoch": 0.03459992541333444, "grad_norm": 2.8250300924974625, "kl": 0.3037109375, "learning_rate": 9.947590764318949e-07, "loss": 0.012172630056738853, "memory(GiB)": 77.29, "reward": 1.6666667461395264, "reward_std": 0.4923659861087799, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 835, "train_speed(iter/s)": 0.013722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/mean_length": 542.25, "completions/min_length": 408.0, "epoch": 0.034641362449757594, "grad_norm": 4.447023753644728, "kl": 0.396484375, "learning_rate": 9.947437901821356e-07, "loss": 0.015850624069571495, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.19462473690509796, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 836, "train_speed(iter/s)": 0.013727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 958.0, "completions/mean_length": 593.0, "completions/min_length": 427.0, "epoch": 0.03468279948618075, "grad_norm": 3.2501678498118656, "kl": 0.3994140625, "learning_rate": 9.947284817898492e-07, "loss": 0.016022861003875732, "memory(GiB)": 77.29, "reward": 1.1666667461395264, "reward_std": 0.5773502588272095, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 837, "train_speed(iter/s)": 0.013735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 7999.0, "completions/mean_length": 1232.8333740234375, "completions/min_length": 459.0, "epoch": 0.0347242365226039, "grad_norm": 2.3763939108754006, "kl": 0.35546875, "learning_rate": 9.947131512557205e-07, "loss": -0.14373289048671722, "memory(GiB)": 77.29, "reward": 1.5, "reward_std": 0.7977240085601807, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.6666666865348816, "rewards/FormatCorrectnessReward/std": 0.4923659861087799, "step": 838, "train_speed(iter/s)": 0.013722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 836.0, "completions/mean_length": 535.1666870117188, "completions/min_length": 407.0, "epoch": 0.03476567355902706, "grad_norm": 3.375150400739744, "kl": 0.392578125, "learning_rate": 9.946977985804357e-07, "loss": 0.01571667194366455, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 839, "train_speed(iter/s)": 0.013731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1099.0, "completions/mean_length": 649.8333740234375, "completions/min_length": 419.0, "epoch": 0.03480711059545021, "grad_norm": 2.6848535168687073, "kl": 0.330078125, "learning_rate": 9.946824237646824e-07, "loss": 0.013193766586482525, "memory(GiB)": 77.29, "reward": 0.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 840, "train_speed(iter/s)": 0.013738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 7999.0, "completions/mean_length": 1147.0833740234375, "completions/min_length": 382.0, "epoch": 0.03484854763187337, "grad_norm": 1.6324534203201082, "kl": 0.35107421875, "learning_rate": 9.94667026809148e-07, "loss": -0.2517050504684448, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 841, "train_speed(iter/s)": 0.013726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 8001.0, "completions/mean_length": 1839.0833740234375, "completions/min_length": 485.0, "epoch": 0.03488998466829652, "grad_norm": 2.1519897574346487, "kl": 0.3505859375, "learning_rate": 9.946516077145217e-07, "loss": -0.3042542338371277, "memory(GiB)": 77.29, "reward": 1.6666667461395264, "reward_std": 0.6154574751853943, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.39886200428009033, "step": 842, "train_speed(iter/s)": 0.01371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1399.0, "completions/mean_length": 664.5, "completions/min_length": 452.0, "epoch": 0.034931421704719676, "grad_norm": 2.996079977507608, "kl": 0.4033203125, "learning_rate": 9.946361664814943e-07, "loss": 0.016162356361746788, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 843, "train_speed(iter/s)": 0.013717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 7999.0, "completions/mean_length": 1271.0, "completions/min_length": 431.0, "epoch": 0.03497285874114283, "grad_norm": 2.186555746904691, "kl": 0.349609375, "learning_rate": 9.94620703110756e-07, "loss": -0.16556429862976074, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3892494738101959, "step": 844, "train_speed(iter/s)": 0.013704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 712.0, "completions/mean_length": 527.3333740234375, "completions/min_length": 414.0, "epoch": 0.03501429577756599, "grad_norm": 3.6814836350570115, "kl": 0.44482421875, "learning_rate": 9.946052176029993e-07, "loss": 0.01777329109609127, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 845, "train_speed(iter/s)": 0.013713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1109.5833740234375, "completions/min_length": 327.0, "epoch": 0.035055732813989146, "grad_norm": 2.7330521147277684, "kl": 0.4462890625, "learning_rate": 9.945897099589173e-07, "loss": -0.16206568479537964, "memory(GiB)": 77.29, "reward": 0.8333333730697632, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3892494738101959, "step": 846, "train_speed(iter/s)": 0.0137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/mean_length": 671.8333740234375, "completions/min_length": 419.0, "epoch": 0.0350971698504123, "grad_norm": 2.973872670477998, "kl": 0.392578125, "learning_rate": 9.945741801792041e-07, "loss": 0.015732750296592712, "memory(GiB)": 77.29, "reward": 1.4166667461395264, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 847, "train_speed(iter/s)": 0.013708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8000.0, "completions/mean_length": 1257.916748046875, "completions/min_length": 466.0, "epoch": 0.035138606886835455, "grad_norm": 1.395351993927468, "kl": 0.44140625, "learning_rate": 9.945586282645545e-07, "loss": -0.2485295981168747, "memory(GiB)": 77.29, "reward": 0.9166666865348816, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 848, "train_speed(iter/s)": 0.013695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1048.0, "completions/mean_length": 605.75, "completions/min_length": 440.0, "epoch": 0.03518004392325861, "grad_norm": 0.3542883096508919, "kl": 0.40771484375, "learning_rate": 9.945430542156646e-07, "loss": 0.016342610120773315, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 849, "train_speed(iter/s)": 0.013703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/mean_length": 586.25, "completions/min_length": 438.0, "epoch": 0.035221480959681764, "grad_norm": 2.7658560722987335, "kl": 0.52734375, "learning_rate": 9.945274580332315e-07, "loss": 0.021111449226737022, "memory(GiB)": 77.29, "reward": 1.7916667461395264, "reward_std": 0.3964807391166687, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.3964807391166687, "step": 850, "train_speed(iter/s)": 0.013711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1529.0, "completions/mean_length": 589.4166870117188, "completions/min_length": 284.0, "epoch": 0.03526291799610492, "grad_norm": 3.8641048859768947, "kl": 0.4814453125, "learning_rate": 9.94511839717953e-07, "loss": 0.019283901900053024, "memory(GiB)": 77.29, "reward": 1.75, "reward_std": 0.6215815544128418, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 851, "train_speed(iter/s)": 0.013717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1203.5, "completions/min_length": 502.0, "epoch": 0.03530435503252807, "grad_norm": 1.8918626457147254, "kl": 0.46484375, "learning_rate": 9.944961992705286e-07, "loss": -0.21753227710723877, "memory(GiB)": 77.29, "reward": 1.75, "reward_std": 0.6215815544128418, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3892494738101959, "step": 852, "train_speed(iter/s)": 0.013705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1275.0, "completions/mean_length": 597.3333740234375, "completions/min_length": 368.0, "epoch": 0.03534579206895123, "grad_norm": 1.0334017548580565, "kl": 0.5029296875, "learning_rate": 9.94480536691658e-07, "loss": 0.020056836307048798, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 853, "train_speed(iter/s)": 0.013712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1237.5833740234375, "completions/min_length": 373.0, "epoch": 0.03538722910537438, "grad_norm": 2.097894642812142, "kl": 0.458984375, "learning_rate": 9.94464851982042e-07, "loss": -0.19638334214687347, "memory(GiB)": 77.29, "reward": 1.6666667461395264, "reward_std": 0.6513389348983765, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3892494738101959, "step": 854, "train_speed(iter/s)": 0.013697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/mean_length": 655.0833740234375, "completions/min_length": 444.0, "epoch": 0.03542866614179754, "grad_norm": 2.925905672807966, "kl": 0.46923828125, "learning_rate": 9.944491451423827e-07, "loss": 0.01874067448079586, "memory(GiB)": 77.29, "reward": 1.0833333730697632, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 855, "train_speed(iter/s)": 0.013705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 8001.0, "completions/mean_length": 1915.5833740234375, "completions/min_length": 518.0, "epoch": 0.03547010317822069, "grad_norm": 2.8528636852505986, "kl": 0.5322265625, "learning_rate": 9.944334161733835e-07, "loss": -0.3387773036956787, "memory(GiB)": 77.29, "reward": 1.6666667461395264, "reward_std": 0.7784988880157471, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3892494738101959, "step": 856, "train_speed(iter/s)": 0.013691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 8001.0, "completions/mean_length": 1776.666748046875, "completions/min_length": 442.0, "epoch": 0.035511540214643846, "grad_norm": 1.945350515022136, "kl": 0.54248046875, "learning_rate": 9.944176650757476e-07, "loss": -0.3384985327720642, "memory(GiB)": 77.29, "reward": 1.6666667461395264, "reward_std": 0.7784988880157471, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3892494738101959, "step": 857, "train_speed(iter/s)": 0.013677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8000.0, "completions/mean_length": 1163.166748046875, "completions/min_length": 414.0, "epoch": 0.03555297725106701, "grad_norm": 2.1553420971967094, "kl": 0.5615234375, "learning_rate": 9.944018918501805e-07, "loss": -0.19257669150829315, "memory(GiB)": 77.29, "reward": 1.6666667461395264, "reward_std": 0.6513389348983765, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3892494738101959, "step": 858, "train_speed(iter/s)": 0.013665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/mean_length": 576.0833740234375, "completions/min_length": 388.0, "epoch": 0.03559441428749016, "grad_norm": 0.3748303847256177, "kl": 0.5244140625, "learning_rate": 9.943860964973878e-07, "loss": 0.020949868485331535, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 859, "train_speed(iter/s)": 0.013673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1178.5, "completions/min_length": 395.0, "epoch": 0.035635851323913316, "grad_norm": 1.3618085974622027, "kl": 0.56494140625, "learning_rate": 9.943702790180768e-07, "loss": -0.2442256659269333, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.5773502588272095, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 860, "train_speed(iter/s)": 0.01366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1266.166748046875, "completions/min_length": 514.0, "epoch": 0.03567728836033647, "grad_norm": 1.2801267468571063, "kl": 0.51708984375, "learning_rate": 9.94354439412955e-07, "loss": -0.245189368724823, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.5773502588272095, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 861, "train_speed(iter/s)": 0.013648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 936.0, "completions/mean_length": 593.8333740234375, "completions/min_length": 357.0, "epoch": 0.035718725396759625, "grad_norm": 0.3680388045217188, "kl": 0.595703125, "learning_rate": 9.943385776827319e-07, "loss": 0.023834042251110077, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 862, "train_speed(iter/s)": 0.013656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 3022.416748046875, "completions/min_length": 445.0, "epoch": 0.03576016243318278, "grad_norm": 2.8944714991800375, "kl": 0.5673828125, "learning_rate": 9.943226938281168e-07, "loss": -0.3541322350502014, "memory(GiB)": 77.29, "reward": 1.2083333730697632, "reward_std": 0.8649312257766724, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 0.5416666865348816, "rewards/FormatCorrectnessReward/std": 0.45016831159591675, "step": 863, "train_speed(iter/s)": 0.013641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4546.0, "completions/mean_length": 947.25, "completions/min_length": 408.0, "epoch": 0.035801599469605934, "grad_norm": 2.8382951801406255, "kl": 0.4990234375, "learning_rate": 9.943067878498209e-07, "loss": 0.01997016929090023, "memory(GiB)": 77.29, "reward": 1.375, "reward_std": 0.7111130952835083, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 864, "train_speed(iter/s)": 0.013638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1221.0, "completions/mean_length": 591.3333740234375, "completions/min_length": 347.0, "epoch": 0.03584303650602909, "grad_norm": 3.6207444230519585, "kl": 0.591796875, "learning_rate": 9.942908597485558e-07, "loss": 0.023650091141462326, "memory(GiB)": 77.29, "reward": 1.6666667461395264, "reward_std": 0.4923659861087799, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 865, "train_speed(iter/s)": 0.013645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 977.0, "completions/mean_length": 590.5833740234375, "completions/min_length": 396.0, "epoch": 0.03588447354245224, "grad_norm": 3.435770244508789, "kl": 0.56689453125, "learning_rate": 9.942749095250347e-07, "loss": 0.02260897122323513, "memory(GiB)": 77.29, "reward": 1.5833333730697632, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.6666666865348816, "rewards/AnswerAccuracyReward/std": 0.4923659861087799, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 866, "train_speed(iter/s)": 0.013652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1042.0, "completions/mean_length": 618.3333740234375, "completions/min_length": 386.0, "epoch": 0.0359259105788754, "grad_norm": 394.22814461238124, "kl": 3.6708984375, "learning_rate": 9.942589371799714e-07, "loss": 0.14688533544540405, "memory(GiB)": 77.29, "reward": 1.5, "reward_std": 0.5222329497337341, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 867, "train_speed(iter/s)": 0.01366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/mean_length": 532.5833740234375, "completions/min_length": 330.0, "epoch": 0.03596734761529855, "grad_norm": 3.6612722405253058, "kl": 0.6435546875, "learning_rate": 9.942429427140806e-07, "loss": 0.02580675482749939, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 868, "train_speed(iter/s)": 0.013668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 7998.0, "completions/mean_length": 1088.416748046875, "completions/min_length": 270.0, "epoch": 0.036008784651721706, "grad_norm": 2.9344441527792156, "kl": 0.7060546875, "learning_rate": 9.942269261280782e-07, "loss": -0.20850470662117004, "memory(GiB)": 77.29, "reward": 1.75, "reward_std": 0.6215815544128418, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 869, "train_speed(iter/s)": 0.013656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1206.0, "completions/mean_length": 600.4166870117188, "completions/min_length": 340.0, "epoch": 0.03605022168814486, "grad_norm": 0.3394501855923119, "kl": 0.5908203125, "learning_rate": 9.94210887422681e-07, "loss": 0.02362603321671486, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 870, "train_speed(iter/s)": 0.013663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2123.0, "completions/mean_length": 801.9166870117188, "completions/min_length": 434.0, "epoch": 0.03609165872456802, "grad_norm": 2.1148727158274636, "kl": 0.56982421875, "learning_rate": 9.94194826598607e-07, "loss": 0.022731363773345947, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 871, "train_speed(iter/s)": 0.013667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1175.0, "completions/min_length": 359.0, "epoch": 0.03613309576099118, "grad_norm": 1.3767658640064862, "kl": 0.587890625, "learning_rate": 9.94178743656575e-07, "loss": -0.24318695068359375, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.5773502588272095, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 872, "train_speed(iter/s)": 0.013655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/mean_length": 450.66668701171875, "completions/min_length": 339.0, "epoch": 0.03617453279741433, "grad_norm": 4.246021322918724, "kl": 0.654296875, "learning_rate": 9.941626385973047e-07, "loss": 0.026179810985922813, "memory(GiB)": 77.29, "reward": 0.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 873, "train_speed(iter/s)": 0.013658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4166666666666667, "completions/max_length": 8001.0, "completions/mean_length": 3753.416748046875, "completions/min_length": 394.0, "epoch": 0.036215969833837486, "grad_norm": 3.224362559595315, "kl": 0.6494140625, "learning_rate": 9.941465114215166e-07, "loss": -0.4217795729637146, "memory(GiB)": 77.29, "reward": 0.5833333730697632, "reward_std": 0.5573204159736633, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.5, "rewards/FormatCorrectnessReward/std": 0.4767313003540039, "step": 874, "train_speed(iter/s)": 0.013642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 848.0, "completions/mean_length": 483.41668701171875, "completions/min_length": 357.0, "epoch": 0.03625740687026064, "grad_norm": 16.33011578916079, "kl": 0.71875, "learning_rate": 9.941303621299331e-07, "loss": 0.028784772381186485, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 875, "train_speed(iter/s)": 0.013649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/mean_length": 459.66668701171875, "completions/min_length": 267.0, "epoch": 0.036298843906683795, "grad_norm": 3.8882613571958418, "kl": 0.6748046875, "learning_rate": 9.941141907232763e-07, "loss": 0.027050774544477463, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 876, "train_speed(iter/s)": 0.013657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1131.166748046875, "completions/min_length": 258.0, "epoch": 0.03634028094310695, "grad_norm": 2.1790952317611456, "kl": 0.6572265625, "learning_rate": 9.940979972022706e-07, "loss": -0.21018624305725098, "memory(GiB)": 77.29, "reward": 0.875, "reward_std": 0.3107907772064209, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 877, "train_speed(iter/s)": 0.013645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/mean_length": 407.75, "completions/min_length": 264.0, "epoch": 0.036381717979530104, "grad_norm": 4.338490305477394, "kl": 0.7490234375, "learning_rate": 9.940817815676402e-07, "loss": 0.029855147004127502, "memory(GiB)": 77.29, "reward": 1.5416667461395264, "reward_std": 0.7821396589279175, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.3964807391166687, "step": 878, "train_speed(iter/s)": 0.013652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1217.8333740234375, "completions/min_length": 441.0, "epoch": 0.03642315501595326, "grad_norm": 3.2890764138941146, "kl": 0.65234375, "learning_rate": 9.94065543820111e-07, "loss": -0.10441508889198303, "memory(GiB)": 77.29, "reward": 0.875, "reward_std": 0.5690901875495911, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.625, "rewards/FormatCorrectnessReward/std": 0.4330126941204071, "step": 879, "train_speed(iter/s)": 0.01364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/mean_length": 428.8333435058594, "completions/min_length": 307.0, "epoch": 0.03646459205237641, "grad_norm": 3.8153283580312136, "kl": 0.7158203125, "learning_rate": 9.940492839604103e-07, "loss": 0.028645619750022888, "memory(GiB)": 77.29, "reward": 1.0, "reward_std": 0.36927446722984314, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 880, "train_speed(iter/s)": 0.013642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8000.0, "completions/mean_length": 1182.0833740234375, "completions/min_length": 284.0, "epoch": 0.03650602908879957, "grad_norm": 1.863815655603354, "kl": 0.7734375, "learning_rate": 9.94033001989265e-07, "loss": -0.2062443494796753, "memory(GiB)": 77.29, "reward": 0.875, "reward_std": 0.3107907772064209, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 881, "train_speed(iter/s)": 0.01363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 915.0, "completions/mean_length": 518.5833740234375, "completions/min_length": 366.0, "epoch": 0.03654746612522272, "grad_norm": 4.896646493067101, "kl": 0.7470703125, "learning_rate": 9.94016697907404e-07, "loss": 0.029881011694669724, "memory(GiB)": 77.29, "reward": 0.9583333730697632, "reward_std": 0.45016831159591675, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 882, "train_speed(iter/s)": 0.013638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/mean_length": 376.0833435058594, "completions/min_length": 251.0, "epoch": 0.036588903161645876, "grad_norm": 4.816144689311011, "kl": 0.80078125, "learning_rate": 9.940003717155572e-07, "loss": 0.032070092856884, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.32566946744918823, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3256694972515106, "step": 883, "train_speed(iter/s)": 0.013647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/mean_length": 457.0, "completions/min_length": 282.0, "epoch": 0.03663034019806903, "grad_norm": 3.7527469874113346, "kl": 0.81640625, "learning_rate": 9.939840234144554e-07, "loss": 0.03266346454620361, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 884, "train_speed(iter/s)": 0.013654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/mean_length": 436.16668701171875, "completions/min_length": 296.0, "epoch": 0.03667177723449219, "grad_norm": 3.979658178558651, "kl": 0.755859375, "learning_rate": 9.9396765300483e-07, "loss": 0.030241116881370544, "memory(GiB)": 77.29, "reward": 1.5, "reward_std": 0.5222329497337341, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 885, "train_speed(iter/s)": 0.013662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/mean_length": 412.66668701171875, "completions/min_length": 207.0, "epoch": 0.03671321427091535, "grad_norm": 0.4356399351039987, "kl": 0.83203125, "learning_rate": 9.939512604874138e-07, "loss": 0.03330053761601448, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 886, "train_speed(iter/s)": 0.013671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 375.5, "completions/min_length": 292.0, "epoch": 0.0367546513073385, "grad_norm": 3.947594308081162, "kl": 0.7578125, "learning_rate": 9.939348458629404e-07, "loss": 0.030303776264190674, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 887, "train_speed(iter/s)": 0.01368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/mean_length": 432.16668701171875, "completions/min_length": 280.0, "epoch": 0.036796088343761656, "grad_norm": 3.52874642744449, "kl": 0.7509765625, "learning_rate": 9.939184091321444e-07, "loss": 0.029985010623931885, "memory(GiB)": 77.29, "reward": 0.9166666865348816, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 888, "train_speed(iter/s)": 0.013688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/mean_length": 409.8333435058594, "completions/min_length": 70.0, "epoch": 0.03683752538018481, "grad_norm": 5.643713344890922, "kl": 0.8271484375, "learning_rate": 9.939019502957615e-07, "loss": 0.03311961889266968, "memory(GiB)": 77.29, "reward": 1.75, "reward_std": 0.5838742256164551, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3256694972515106, "step": 889, "train_speed(iter/s)": 0.013697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/mean_length": 383.8333435058594, "completions/min_length": 281.0, "epoch": 0.036878962416607965, "grad_norm": 4.701388801645431, "kl": 0.82421875, "learning_rate": 9.938854693545284e-07, "loss": 0.03297802433371544, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.24618299305438995, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 890, "train_speed(iter/s)": 0.013705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/mean_length": 453.5833435058594, "completions/min_length": 228.0, "epoch": 0.03692039945303112, "grad_norm": 3.6337621794447728, "kl": 0.796875, "learning_rate": 9.938689663091827e-07, "loss": 0.031876277178525925, "memory(GiB)": 77.29, "reward": 1.875, "reward_std": 0.3107907772064209, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 891, "train_speed(iter/s)": 0.013713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/mean_length": 469.25, "completions/min_length": 248.0, "epoch": 0.036961836489454274, "grad_norm": 4.065309239764392, "kl": 0.78515625, "learning_rate": 9.938524411604629e-07, "loss": 0.03135504573583603, "memory(GiB)": 77.29, "reward": 1.25, "reward_std": 0.5838742256164551, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3256694972515106, "step": 892, "train_speed(iter/s)": 0.01372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1083.5, "completions/min_length": 263.0, "epoch": 0.03700327352587743, "grad_norm": 3.8109090556705185, "kl": 0.7451171875, "learning_rate": 9.938358939091088e-07, "loss": -0.15801331400871277, "memory(GiB)": 77.29, "reward": 0.75, "reward_std": 0.33709993958473206, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.33709993958473206, "step": 893, "train_speed(iter/s)": 0.013707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/mean_length": 349.41668701171875, "completions/min_length": 161.0, "epoch": 0.03704471056230058, "grad_norm": 5.477904661462522, "kl": 0.876953125, "learning_rate": 9.938193245558604e-07, "loss": 0.03513896465301514, "memory(GiB)": 77.29, "reward": 1.7083333730697632, "reward_std": 0.5418123602867126, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 894, "train_speed(iter/s)": 0.013715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/mean_length": 433.25, "completions/min_length": 271.0, "epoch": 0.03708614759872374, "grad_norm": 3.9269556880738516, "kl": 0.7548828125, "learning_rate": 9.9380273310146e-07, "loss": 0.030152400955557823, "memory(GiB)": 77.29, "reward": 1.75, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 895, "train_speed(iter/s)": 0.013724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/mean_length": 396.8333435058594, "completions/min_length": 278.0, "epoch": 0.03712758463514689, "grad_norm": 0.4342327899173028, "kl": 0.8447265625, "learning_rate": 9.937861195466497e-07, "loss": 0.033678796142339706, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 896, "train_speed(iter/s)": 0.013724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 885.0, "completions/mean_length": 457.25, "completions/min_length": 302.0, "epoch": 0.037169021671570046, "grad_norm": 3.096673154987482, "kl": 0.76171875, "learning_rate": 9.937694838921733e-07, "loss": 0.030522506684064865, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 897, "train_speed(iter/s)": 0.013731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666666, "completions/max_length": 8001.0, "completions/mean_length": 1675.916748046875, "completions/min_length": 324.0, "epoch": 0.03721045870799321, "grad_norm": 2.2654907190399207, "kl": 0.708984375, "learning_rate": 9.937528261387752e-07, "loss": -0.32714328169822693, "memory(GiB)": 77.29, "reward": 1.625, "reward_std": 0.772392988204956, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.3964807391166687, "step": 898, "train_speed(iter/s)": 0.013717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/mean_length": 429.3333435058594, "completions/min_length": 333.0, "epoch": 0.03725189574441636, "grad_norm": 0.4088986985058263, "kl": 0.734375, "learning_rate": 9.937361462872009e-07, "loss": 0.029381832107901573, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 899, "train_speed(iter/s)": 0.013726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/mean_length": 433.3333435058594, "completions/min_length": 300.0, "epoch": 0.037293332780839517, "grad_norm": 3.704208049932114, "kl": 0.7265625, "learning_rate": 9.93719444338197e-07, "loss": 0.029041191563010216, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 900, "train_speed(iter/s)": 0.013734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 466.0833435058594, "completions/min_length": 334.0, "epoch": 0.03733476981726267, "grad_norm": 3.988197850102574, "kl": 0.732421875, "learning_rate": 9.937027202925112e-07, "loss": 0.029338419437408447, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.24618299305438995, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 901, "train_speed(iter/s)": 0.013743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/mean_length": 389.25, "completions/min_length": 222.0, "epoch": 0.037376206853685826, "grad_norm": 4.594126201116596, "kl": 0.79296875, "learning_rate": 9.936859741508916e-07, "loss": 0.03173085302114487, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 902, "train_speed(iter/s)": 0.013751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1110.0, "completions/mean_length": 590.4166870117188, "completions/min_length": 366.0, "epoch": 0.03741764389010898, "grad_norm": 3.5081710314616643, "kl": 0.6728515625, "learning_rate": 9.936692059140878e-07, "loss": 0.02692393586039543, "memory(GiB)": 77.29, "reward": 1.375, "reward_std": 0.4826536476612091, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 903, "train_speed(iter/s)": 0.013758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/mean_length": 402.25, "completions/min_length": 274.0, "epoch": 0.037459080926532135, "grad_norm": 0.442498276768245, "kl": 0.6943359375, "learning_rate": 9.936524155828503e-07, "loss": 0.027755819261074066, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 904, "train_speed(iter/s)": 0.013767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/mean_length": 512.0833740234375, "completions/min_length": 362.0, "epoch": 0.03750051796295529, "grad_norm": 4.689972241031429, "kl": 0.7431640625, "learning_rate": 9.936356031579308e-07, "loss": 0.029764613136649132, "memory(GiB)": 77.29, "reward": 1.875, "reward_std": 0.3107907772064209, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 905, "train_speed(iter/s)": 0.013774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/mean_length": 529.5, "completions/min_length": 384.0, "epoch": 0.037541954999378443, "grad_norm": 3.686119639701721, "kl": 0.6875, "learning_rate": 9.936187686400814e-07, "loss": 0.027502015233039856, "memory(GiB)": 77.29, "reward": 1.25, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 906, "train_speed(iter/s)": 0.01378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 837.0, "completions/mean_length": 524.1666870117188, "completions/min_length": 242.0, "epoch": 0.0375833920358016, "grad_norm": 3.134170302113837, "kl": 0.642578125, "learning_rate": 9.936019120300555e-07, "loss": 0.025708159431815147, "memory(GiB)": 77.29, "reward": 1.0833333730697632, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 907, "train_speed(iter/s)": 0.013787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/mean_length": 625.5, "completions/min_length": 499.0, "epoch": 0.03762482907222475, "grad_norm": 0.41640493399719825, "kl": 0.640625, "learning_rate": 9.93585033328608e-07, "loss": 0.025571022182703018, "memory(GiB)": 77.29, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 908, "train_speed(iter/s)": 0.013795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/mean_length": 452.41668701171875, "completions/min_length": 270.0, "epoch": 0.03766626610864791, "grad_norm": 0.3291776416003058, "kl": 0.6787109375, "learning_rate": 9.93568132536494e-07, "loss": 0.027140453457832336, "memory(GiB)": 77.29, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 909, "train_speed(iter/s)": 0.013803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1273.25, "completions/min_length": 385.0, "epoch": 0.03770770314507106, "grad_norm": 2.6912936500770983, "kl": 0.45166015625, "learning_rate": 9.935512096544698e-07, "loss": -0.15342214703559875, "memory(GiB)": 77.29, "reward": 1.3333333730697632, "reward_std": 0.6513389348983765, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 910, "train_speed(iter/s)": 0.01379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1442.0, "completions/mean_length": 551.4166870117188, "completions/min_length": 226.0, "epoch": 0.03774914018149422, "grad_norm": 0.3571911862227409, "kl": 0.5830078125, "learning_rate": 9.935342646832929e-07, "loss": 0.02333758771419525, "memory(GiB)": 77.29, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 911, "train_speed(iter/s)": 0.013796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1169.0833740234375, "completions/min_length": 316.0, "epoch": 0.03779057721791738, "grad_norm": 1.9031206851259028, "kl": 0.529296875, "learning_rate": 9.935172976237217e-07, "loss": -0.24483048915863037, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 912, "train_speed(iter/s)": 0.013785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/mean_length": 511.0, "completions/min_length": 338.0, "epoch": 0.03783201425434053, "grad_norm": 28.936432154146534, "kl": 0.6962890625, "learning_rate": 9.935003084765157e-07, "loss": 0.02782795950770378, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 913, "train_speed(iter/s)": 0.013793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/mean_length": 576.5833740234375, "completions/min_length": 343.0, "epoch": 0.037873451290763686, "grad_norm": 3.2659287034134534, "kl": 0.6474609375, "learning_rate": 9.93483297242435e-07, "loss": 0.025868427008390427, "memory(GiB)": 77.29, "reward": 1.0, "reward_std": 0.36927446722984314, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 914, "train_speed(iter/s)": 0.0138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/mean_length": 634.5833740234375, "completions/min_length": 469.0, "epoch": 0.03791488832718684, "grad_norm": 3.076593208794491, "kl": 0.46044921875, "learning_rate": 9.93466263922241e-07, "loss": 0.01836572214961052, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 915, "train_speed(iter/s)": 0.013808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/mean_length": 547.8333740234375, "completions/min_length": 418.0, "epoch": 0.037956325363609995, "grad_norm": 0.36117735492095515, "kl": 0.6123046875, "learning_rate": 9.934492085166963e-07, "loss": 0.024492627009749413, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 916, "train_speed(iter/s)": 0.013816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/mean_length": 584.3333740234375, "completions/min_length": 395.0, "epoch": 0.03799776240003315, "grad_norm": 0.40115722016508054, "kl": 0.5224609375, "learning_rate": 9.934321310265639e-07, "loss": 0.020843997597694397, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 917, "train_speed(iter/s)": 0.013823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/mean_length": 520.0833740234375, "completions/min_length": 350.0, "epoch": 0.038039199436456304, "grad_norm": 3.497991226015052, "kl": 0.51806640625, "learning_rate": 9.934150314526083e-07, "loss": 0.020704012364149094, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 918, "train_speed(iter/s)": 0.013831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/mean_length": 610.1666870117188, "completions/min_length": 369.0, "epoch": 0.03808063647287946, "grad_norm": 4.230678088837339, "kl": 0.5498046875, "learning_rate": 9.933979097955948e-07, "loss": 0.021968921646475792, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 919, "train_speed(iter/s)": 0.013839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/mean_length": 555.4166870117188, "completions/min_length": 435.0, "epoch": 0.03812207350930261, "grad_norm": 0.3135038891096566, "kl": 0.455078125, "learning_rate": 9.933807660562896e-07, "loss": 0.018252883106470108, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 920, "train_speed(iter/s)": 0.013846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/mean_length": 533.5, "completions/min_length": 411.0, "epoch": 0.03816351054572577, "grad_norm": 0.3160582015620942, "kl": 0.568359375, "learning_rate": 9.933636002354599e-07, "loss": 0.02270912192761898, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 921, "train_speed(iter/s)": 0.013855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/mean_length": 645.5, "completions/min_length": 521.0, "epoch": 0.03820494758214892, "grad_norm": 3.4763325923422506, "kl": 0.451171875, "learning_rate": 9.933464123338742e-07, "loss": 0.018037479370832443, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 922, "train_speed(iter/s)": 0.013861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/mean_length": 582.75, "completions/min_length": 441.0, "epoch": 0.03824638461857208, "grad_norm": 3.08727087075446, "kl": 0.4833984375, "learning_rate": 9.933292023523017e-07, "loss": 0.019378583878278732, "memory(GiB)": 77.29, "reward": 1.0833333730697632, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 923, "train_speed(iter/s)": 0.013868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1056.0, "completions/mean_length": 612.4166870117188, "completions/min_length": 385.0, "epoch": 0.03828782165499523, "grad_norm": 3.4158014887414767, "kl": 0.4482421875, "learning_rate": 9.933119702915124e-07, "loss": 0.017919838428497314, "memory(GiB)": 77.29, "reward": 1.875, "reward_std": 0.22613351047039032, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 924, "train_speed(iter/s)": 0.013875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1053.0, "completions/mean_length": 707.5, "completions/min_length": 471.0, "epoch": 0.03832925869141839, "grad_norm": 0.43245736942191354, "kl": 0.4375, "learning_rate": 9.932947161522779e-07, "loss": 0.01752474531531334, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 925, "train_speed(iter/s)": 0.013882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1195.0, "completions/mean_length": 712.9166870117188, "completions/min_length": 414.0, "epoch": 0.03837069572784155, "grad_norm": 3.439550142028035, "kl": 0.39892578125, "learning_rate": 9.932774399353699e-07, "loss": 0.015948250889778137, "memory(GiB)": 77.29, "reward": 1.4166667461395264, "reward_std": 0.4174235463142395, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 926, "train_speed(iter/s)": 0.013883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 848.0, "completions/mean_length": 612.5, "completions/min_length": 391.0, "epoch": 0.0384121327642647, "grad_norm": 0.2997960738610458, "kl": 0.43896484375, "learning_rate": 9.93260141641562e-07, "loss": 0.017523860558867455, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 927, "train_speed(iter/s)": 0.01389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1107.0, "completions/mean_length": 669.1666870117188, "completions/min_length": 472.0, "epoch": 0.038453569800687856, "grad_norm": 11.715409480275174, "kl": 0.4970703125, "learning_rate": 9.932428212716285e-07, "loss": 0.01985098421573639, "memory(GiB)": 77.29, "reward": 1.7916667461395264, "reward_std": 0.33427897095680237, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 928, "train_speed(iter/s)": 0.013897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1196.0, "completions/mean_length": 753.9166870117188, "completions/min_length": 513.0, "epoch": 0.03849500683711101, "grad_norm": 41.256835057363254, "kl": 1.34619140625, "learning_rate": 9.932254788263443e-07, "loss": 0.053793445229530334, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 929, "train_speed(iter/s)": 0.013903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 7999.0, "completions/mean_length": 1347.3333740234375, "completions/min_length": 523.0, "epoch": 0.038536443873534165, "grad_norm": 2.55784564400833, "kl": 0.33642578125, "learning_rate": 9.932081143064858e-07, "loss": -0.14614030718803406, "memory(GiB)": 77.29, "reward": 1.25, "reward_std": 0.6571287512779236, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.3256694972515106, "step": 930, "train_speed(iter/s)": 0.013891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/mean_length": 646.6666870117188, "completions/min_length": 500.0, "epoch": 0.03857788090995732, "grad_norm": 0.2411807932241839, "kl": 0.310791015625, "learning_rate": 9.9319072771283e-07, "loss": 0.012420586310327053, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 931, "train_speed(iter/s)": 0.013898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/mean_length": 581.1666870117188, "completions/min_length": 236.0, "epoch": 0.038619317946380474, "grad_norm": 2744.868941098523, "kl": 9.2958984375, "learning_rate": 9.93173319046155e-07, "loss": 0.37122830748558044, "memory(GiB)": 77.29, "reward": 1.7083333730697632, "reward_std": 0.5822500586509705, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.33427897095680237, "step": 932, "train_speed(iter/s)": 0.013904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1269.0, "completions/mean_length": 725.0, "completions/min_length": 537.0, "epoch": 0.03866075498280363, "grad_norm": 2.3235319192215242, "kl": 0.3779296875, "learning_rate": 9.931558883072402e-07, "loss": 0.015122572891414165, "memory(GiB)": 77.29, "reward": 1.625, "reward_std": 0.6440284848213196, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 933, "train_speed(iter/s)": 0.013907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 953.0, "completions/mean_length": 735.75, "completions/min_length": 523.0, "epoch": 0.03870219201922678, "grad_norm": 2.853977923107317, "kl": 0.30517578125, "learning_rate": 9.931384354968653e-07, "loss": 0.01219940185546875, "memory(GiB)": 77.29, "reward": 1.3333333730697632, "reward_std": 0.44381269812583923, "rewards/AnswerAccuracyReward/mean": 0.4166666567325592, "rewards/AnswerAccuracyReward/std": 0.5149286985397339, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 934, "train_speed(iter/s)": 0.013914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/mean_length": 693.5833740234375, "completions/min_length": 593.0, "epoch": 0.03874362905564994, "grad_norm": 2.493112514240492, "kl": 0.296875, "learning_rate": 9.931209606158117e-07, "loss": 0.01184060238301754, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.24618299305438995, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 935, "train_speed(iter/s)": 0.013922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/mean_length": 553.8333740234375, "completions/min_length": 400.0, "epoch": 0.03878506609207309, "grad_norm": 6.022345951742038, "kl": 0.4287109375, "learning_rate": 9.931034636648616e-07, "loss": 0.017143061384558678, "memory(GiB)": 77.29, "reward": 0.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 936, "train_speed(iter/s)": 0.013929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 957.0, "completions/mean_length": 768.6666870117188, "completions/min_length": 573.0, "epoch": 0.03882650312849625, "grad_norm": 2.835552668604868, "kl": 0.30859375, "learning_rate": 9.930859446447976e-07, "loss": 0.012340962886810303, "memory(GiB)": 77.29, "reward": 1.5416667461395264, "reward_std": 0.5822500586509705, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 937, "train_speed(iter/s)": 0.013936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1354.0, "completions/mean_length": 808.75, "completions/min_length": 566.0, "epoch": 0.03886794016491941, "grad_norm": 2.818613118036697, "kl": 0.330078125, "learning_rate": 9.930684035564046e-07, "loss": 0.013218904845416546, "memory(GiB)": 77.29, "reward": 1.5833333730697632, "reward_std": 0.4174235463142395, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.33709993958473206, "step": 938, "train_speed(iter/s)": 0.013941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/mean_length": 549.9166870117188, "completions/min_length": 388.0, "epoch": 0.03890937720134256, "grad_norm": 2.886544104753282, "kl": 0.37060546875, "learning_rate": 9.930508404004666e-07, "loss": 0.014839078299701214, "memory(GiB)": 77.29, "reward": 1.6666667461395264, "reward_std": 0.4923659861087799, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 939, "train_speed(iter/s)": 0.013949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/mean_length": 684.25, "completions/min_length": 494.0, "epoch": 0.03895081423776572, "grad_norm": 2.482338402733557, "kl": 0.330078125, "learning_rate": 9.930332551777707e-07, "loss": 0.013202786445617676, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 940, "train_speed(iter/s)": 0.013956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 874.0, "completions/mean_length": 683.25, "completions/min_length": 511.0, "epoch": 0.03899225127418887, "grad_norm": 2.8965968916673237, "kl": 0.322265625, "learning_rate": 9.930156478891031e-07, "loss": 0.012873401865363121, "memory(GiB)": 77.29, "reward": 1.3333333730697632, "reward_std": 0.5773502588272095, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 941, "train_speed(iter/s)": 0.013963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.0, "completions/mean_length": 639.3333740234375, "completions/min_length": 441.0, "epoch": 0.039033688310612026, "grad_norm": 2.8528220240934488, "kl": 0.296875, "learning_rate": 9.929980185352525e-07, "loss": 0.011889686807990074, "memory(GiB)": 77.29, "reward": 1.875, "reward_std": 0.22613351047039032, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 942, "train_speed(iter/s)": 0.01397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/mean_length": 617.8333740234375, "completions/min_length": 388.0, "epoch": 0.03907512534703518, "grad_norm": 4.327935438403914, "kl": 0.367431640625, "learning_rate": 9.929803671170073e-07, "loss": 0.014682318083941936, "memory(GiB)": 77.29, "reward": 1.375, "reward_std": 0.6077155470848083, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 943, "train_speed(iter/s)": 0.013978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1077.0, "completions/mean_length": 697.4166870117188, "completions/min_length": 515.0, "epoch": 0.039116562383458335, "grad_norm": 2.9710094899458017, "kl": 0.3271484375, "learning_rate": 9.92962693635158e-07, "loss": 0.013056179508566856, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 944, "train_speed(iter/s)": 0.013984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3629.0, "completions/mean_length": 772.3333740234375, "completions/min_length": 314.0, "epoch": 0.03915799941988149, "grad_norm": 2.8862905387254134, "kl": 0.35205078125, "learning_rate": 9.929449980904951e-07, "loss": 0.014083008281886578, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 945, "train_speed(iter/s)": 0.013984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1120.0, "completions/mean_length": 607.9166870117188, "completions/min_length": 395.0, "epoch": 0.039199436456304644, "grad_norm": 2.7205128487539887, "kl": 0.32470703125, "learning_rate": 9.929272804838112e-07, "loss": 0.012990991584956646, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 946, "train_speed(iter/s)": 0.01399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1265.0, "completions/mean_length": 677.0, "completions/min_length": 529.0, "epoch": 0.0392408734927278, "grad_norm": 3.3729314732088813, "kl": 0.30419921875, "learning_rate": 9.929095408158987e-07, "loss": 0.012181749567389488, "memory(GiB)": 77.29, "reward": 1.2083333730697632, "reward_std": 0.49810245633125305, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 947, "train_speed(iter/s)": 0.013996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/mean_length": 589.1666870117188, "completions/min_length": 315.0, "epoch": 0.03928231052915095, "grad_norm": 2.6900779446248464, "kl": 0.270263671875, "learning_rate": 9.928917790875516e-07, "loss": 0.01081150770187378, "memory(GiB)": 77.29, "reward": 1.5416667461395264, "reward_std": 0.5822500586509705, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.25746431946754456, "step": 948, "train_speed(iter/s)": 0.014003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 880.0, "completions/mean_length": 649.25, "completions/min_length": 448.0, "epoch": 0.03932374756557411, "grad_norm": 2.7022960002119, "kl": 0.25537109375, "learning_rate": 9.928739952995653e-07, "loss": 0.010222057811915874, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 949, "train_speed(iter/s)": 0.01401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.0, "completions/mean_length": 534.5833740234375, "completions/min_length": 367.0, "epoch": 0.03936518460199726, "grad_norm": 0.251842874890078, "kl": 0.37060546875, "learning_rate": 9.928561894527351e-07, "loss": 0.014808356761932373, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 950, "train_speed(iter/s)": 0.014018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/mean_length": 591.5, "completions/min_length": 445.0, "epoch": 0.039406621638420423, "grad_norm": 2.7489064748404655, "kl": 0.293212890625, "learning_rate": 9.928383615478586e-07, "loss": 0.011723628267645836, "memory(GiB)": 77.29, "reward": 0.9166666865348816, "reward_std": 0.19462472200393677, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 951, "train_speed(iter/s)": 0.014025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/mean_length": 568.75, "completions/min_length": 444.0, "epoch": 0.03944805867484358, "grad_norm": 0.1961025795320823, "kl": 0.260986328125, "learning_rate": 9.92820511585733e-07, "loss": 0.010435778647661209, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 952, "train_speed(iter/s)": 0.014032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/mean_length": 602.8333740234375, "completions/min_length": 413.0, "epoch": 0.03948949571126673, "grad_norm": 3.110719740573238, "kl": 0.32470703125, "learning_rate": 9.928026395671576e-07, "loss": 0.012998293153941631, "memory(GiB)": 77.29, "reward": 1.625, "reward_std": 0.4330126941204071, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 953, "train_speed(iter/s)": 0.01404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 938.0, "completions/mean_length": 577.3333740234375, "completions/min_length": 383.0, "epoch": 0.03953093274768989, "grad_norm": 2.1019215595688343, "kl": 0.29248046875, "learning_rate": 9.927847454929322e-07, "loss": 0.011696269735693932, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 954, "train_speed(iter/s)": 0.014047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/mean_length": 572.5833740234375, "completions/min_length": 437.0, "epoch": 0.03957236978411304, "grad_norm": 3.2506905513842224, "kl": 0.38623046875, "learning_rate": 9.927668293638575e-07, "loss": 0.01544051431119442, "memory(GiB)": 77.29, "reward": 1.75, "reward_std": 0.26111647486686707, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.26111647486686707, "step": 955, "train_speed(iter/s)": 0.014053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/mean_length": 514.0833740234375, "completions/min_length": 372.0, "epoch": 0.039613806820536196, "grad_norm": 2.940126025938043, "kl": 0.3583984375, "learning_rate": 9.927488911807357e-07, "loss": 0.014310186728835106, "memory(GiB)": 77.29, "reward": 1.75, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 956, "train_speed(iter/s)": 0.014061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1033.0, "completions/mean_length": 563.5833740234375, "completions/min_length": 404.0, "epoch": 0.03965524385695935, "grad_norm": 3.4477849557710427, "kl": 0.32666015625, "learning_rate": 9.927309309443695e-07, "loss": 0.013081233017146587, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 957, "train_speed(iter/s)": 0.014068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1135.0, "completions/mean_length": 602.6666870117188, "completions/min_length": 384.0, "epoch": 0.039696680893382505, "grad_norm": 3.2040150260156475, "kl": 0.37451171875, "learning_rate": 9.927129486555624e-07, "loss": 0.01496422290802002, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 958, "train_speed(iter/s)": 0.014074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/mean_length": 636.0, "completions/min_length": 419.0, "epoch": 0.03973811792980566, "grad_norm": 4.026702813046575, "kl": 0.345703125, "learning_rate": 9.926949443151193e-07, "loss": 0.013769835233688354, "memory(GiB)": 77.29, "reward": 1.875, "reward_std": 0.3107907772064209, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 959, "train_speed(iter/s)": 0.01408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1171.0, "completions/mean_length": 614.1666870117188, "completions/min_length": 399.0, "epoch": 0.039779554966228814, "grad_norm": 3.33938483386924, "kl": 0.36328125, "learning_rate": 9.926769179238464e-07, "loss": 0.014488279819488525, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 960, "train_speed(iter/s)": 0.014086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/mean_length": 520.3333740234375, "completions/min_length": 418.0, "epoch": 0.03982099200265197, "grad_norm": 3.667919747668475, "kl": 0.345703125, "learning_rate": 9.926588694825502e-07, "loss": 0.013814112171530724, "memory(GiB)": 77.29, "reward": 1.3333333730697632, "reward_std": 0.4923659861087799, "rewards/AnswerAccuracyReward/mean": 0.3333333432674408, "rewards/AnswerAccuracyReward/std": 0.4923659563064575, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 961, "train_speed(iter/s)": 0.014093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1139.3333740234375, "completions/min_length": 448.0, "epoch": 0.03986242903907512, "grad_norm": 3.2096083777572337, "kl": 0.400390625, "learning_rate": 9.926407989920384e-07, "loss": -0.14570772647857666, "memory(GiB)": 77.29, "reward": 1.0416667461395264, "reward_std": 0.5418123602867126, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.3107907772064209, "step": 962, "train_speed(iter/s)": 0.014079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/mean_length": 439.5833435058594, "completions/min_length": 321.0, "epoch": 0.03990386607549828, "grad_norm": 0.54833383778454, "kl": 0.521484375, "learning_rate": 9.926227064531199e-07, "loss": 0.02083185315132141, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 963, "train_speed(iter/s)": 0.014087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/mean_length": 511.5, "completions/min_length": 332.0, "epoch": 0.03994530311192144, "grad_norm": 0.43211591279516615, "kl": 0.42138671875, "learning_rate": 9.926045918666043e-07, "loss": 0.01686156541109085, "memory(GiB)": 77.29, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 964, "train_speed(iter/s)": 0.014094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 943.0, "completions/mean_length": 517.9166870117188, "completions/min_length": 367.0, "epoch": 0.03998674014834459, "grad_norm": 0.3202357434125389, "kl": 0.42236328125, "learning_rate": 9.925864552333024e-07, "loss": 0.016888316720724106, "memory(GiB)": 77.29, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 965, "train_speed(iter/s)": 0.014101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8001.0, "completions/mean_length": 1090.0833740234375, "completions/min_length": 333.0, "epoch": 0.04002817718476775, "grad_norm": 1.695945302698301, "kl": 0.48681640625, "learning_rate": 9.925682965540263e-07, "loss": -0.24671845138072968, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.5773502588272095, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 966, "train_speed(iter/s)": 0.014089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/mean_length": 409.0833435058594, "completions/min_length": 310.0, "epoch": 0.0400696142211909, "grad_norm": 5.973619465058169, "kl": 0.51708984375, "learning_rate": 9.92550115829588e-07, "loss": 0.020660649985074997, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 967, "train_speed(iter/s)": 0.014096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333333, "completions/max_length": 8000.0, "completions/mean_length": 1151.25, "completions/min_length": 427.0, "epoch": 0.04011105125761406, "grad_norm": 3.0660222454979986, "kl": 0.40771484375, "learning_rate": 9.925319130608014e-07, "loss": -0.16048018634319305, "memory(GiB)": 77.29, "reward": 1.0833333730697632, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.28867512941360474, "step": 968, "train_speed(iter/s)": 0.014084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 444.41668701171875, "completions/min_length": 338.0, "epoch": 0.04015248829403721, "grad_norm": 0.3355581728291013, "kl": 0.4833984375, "learning_rate": 9.925136882484815e-07, "loss": 0.01937449723482132, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 969, "train_speed(iter/s)": 0.014092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/mean_length": 441.25, "completions/min_length": 309.0, "epoch": 0.040193925330460366, "grad_norm": 0.3313082621447308, "kl": 0.5224609375, "learning_rate": 9.924954413934437e-07, "loss": 0.020931756123900414, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 970, "train_speed(iter/s)": 0.0141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/mean_length": 479.25, "completions/min_length": 373.0, "epoch": 0.04023536236688352, "grad_norm": 0.3397386764607375, "kl": 0.35205078125, "learning_rate": 9.924771724965047e-07, "loss": 0.014088407158851624, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 971, "train_speed(iter/s)": 0.014107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/mean_length": 495.3333435058594, "completions/min_length": 388.0, "epoch": 0.040276799403306675, "grad_norm": 3.8906524960615316, "kl": 0.42626953125, "learning_rate": 9.92458881558482e-07, "loss": 0.017100393772125244, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 972, "train_speed(iter/s)": 0.014115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5514.0, "completions/mean_length": 900.75, "completions/min_length": 383.0, "epoch": 0.04031823643972983, "grad_norm": 1.8828692505269797, "kl": 0.4306640625, "learning_rate": 9.924405685801945e-07, "loss": 0.017274480313062668, "memory(GiB)": 77.29, "reward": 1.9166667461395264, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 973, "train_speed(iter/s)": 0.014109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/mean_length": 428.5, "completions/min_length": 332.0, "epoch": 0.040359673476152984, "grad_norm": 0.9748945137276098, "kl": 0.5263671875, "learning_rate": 9.924222335624617e-07, "loss": 0.021042045205831528, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 974, "train_speed(iter/s)": 0.014117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/mean_length": 433.16668701171875, "completions/min_length": 316.0, "epoch": 0.04040111051257614, "grad_norm": 2.53202414329309, "kl": 0.55615234375, "learning_rate": 9.92403876506104e-07, "loss": 0.022226866334676743, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 975, "train_speed(iter/s)": 0.014125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 445.0833435058594, "completions/min_length": 342.0, "epoch": 0.04044254754899929, "grad_norm": 3.2337506889025542, "kl": 0.4765625, "learning_rate": 9.923854974119433e-07, "loss": 0.019068092107772827, "memory(GiB)": 77.29, "reward": 0.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 976, "train_speed(iter/s)": 0.014133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/mean_length": 468.3333435058594, "completions/min_length": 309.0, "epoch": 0.04048398458542245, "grad_norm": 3.0347715190543156, "kl": 0.4677734375, "learning_rate": 9.923670962808017e-07, "loss": 0.018695255741477013, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 977, "train_speed(iter/s)": 0.014141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/mean_length": 482.0, "completions/min_length": 327.0, "epoch": 0.04052542162184561, "grad_norm": 3.8974570295672413, "kl": 0.44873046875, "learning_rate": 9.923486731135033e-07, "loss": 0.017899876460433006, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 978, "train_speed(iter/s)": 0.014148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 848.0, "completions/mean_length": 522.4166870117188, "completions/min_length": 390.0, "epoch": 0.04056685865826876, "grad_norm": 3.4090942310992274, "kl": 0.494140625, "learning_rate": 9.923302279108724e-07, "loss": 0.019727688282728195, "memory(GiB)": 77.29, "reward": 1.75, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.75, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 979, "train_speed(iter/s)": 0.014155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/mean_length": 408.91668701171875, "completions/min_length": 301.0, "epoch": 0.04060829569469192, "grad_norm": 0.3673959126231699, "kl": 0.5693359375, "learning_rate": 9.923117606737346e-07, "loss": 0.022740546613931656, "memory(GiB)": 77.29, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 980, "train_speed(iter/s)": 0.01416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/mean_length": 404.75, "completions/min_length": 294.0, "epoch": 0.04064973273111507, "grad_norm": 3.6855594526871585, "kl": 0.56640625, "learning_rate": 9.922932714029163e-07, "loss": 0.022718986496329308, "memory(GiB)": 77.29, "reward": 0.9166666865348816, "reward_std": 0.19462473690509796, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 981, "train_speed(iter/s)": 0.014167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/mean_length": 447.66668701171875, "completions/min_length": 316.0, "epoch": 0.04069116976753823, "grad_norm": 4.354011408046556, "kl": 0.49755859375, "learning_rate": 9.922747600992447e-07, "loss": 0.0199466310441494, "memory(GiB)": 77.29, "reward": 1.875, "reward_std": 0.22613351047039032, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 982, "train_speed(iter/s)": 0.014173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/mean_length": 599.3333740234375, "completions/min_length": 444.0, "epoch": 0.04073260680396138, "grad_norm": 3.169351483930088, "kl": 0.4775390625, "learning_rate": 9.922562267635488e-07, "loss": 0.01911400817334652, "memory(GiB)": 77.29, "reward": 1.5833333730697632, "reward_std": 0.5149286389350891, "rewards/AnswerAccuracyReward/mean": 0.5833333134651184, "rewards/AnswerAccuracyReward/std": 0.5149286389350891, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 983, "train_speed(iter/s)": 0.01418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/mean_length": 447.66668701171875, "completions/min_length": 339.0, "epoch": 0.040774043840384536, "grad_norm": 3.486027207293237, "kl": 0.50146484375, "learning_rate": 9.92237671396658e-07, "loss": 0.02001803182065487, "memory(GiB)": 77.29, "reward": 1.0833333730697632, "reward_std": 0.28867512941360474, "rewards/AnswerAccuracyReward/mean": 0.0833333358168602, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 984, "train_speed(iter/s)": 0.014187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 881.0, "completions/mean_length": 471.0, "completions/min_length": 327.0, "epoch": 0.04081548087680769, "grad_norm": 3.502632659090111, "kl": 0.5087890625, "learning_rate": 9.922190939994027e-07, "loss": 0.02037050388753414, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 985, "train_speed(iter/s)": 0.014194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/mean_length": 504.5833435058594, "completions/min_length": 367.0, "epoch": 0.040856917913230845, "grad_norm": 3.7264697261906985, "kl": 0.43896484375, "learning_rate": 9.922004945726139e-07, "loss": 0.017576774582266808, "memory(GiB)": 77.29, "reward": 1.4166667461395264, "reward_std": 0.5573204159736633, "rewards/AnswerAccuracyReward/mean": 0.5, "rewards/AnswerAccuracyReward/std": 0.5222329497337341, "rewards/FormatCorrectnessReward/mean": 0.9166666865348816, "rewards/FormatCorrectnessReward/std": 0.19462473690509796, "step": 986, "train_speed(iter/s)": 0.014201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/mean_length": 444.25, "completions/min_length": 218.0, "epoch": 0.040898354949654, "grad_norm": 3.9128209374888288, "kl": 0.46728515625, "learning_rate": 9.921818731171248e-07, "loss": 0.018666332587599754, "memory(GiB)": 77.29, "reward": 1.7083333730697632, "reward_std": 0.5822500586509705, "rewards/AnswerAccuracyReward/mean": 0.9166666865348816, "rewards/AnswerAccuracyReward/std": 0.28867512941360474, "rewards/FormatCorrectnessReward/mean": 0.7916666865348816, "rewards/FormatCorrectnessReward/std": 0.33427897095680237, "step": 987, "train_speed(iter/s)": 0.014208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/mean_length": 462.5833435058594, "completions/min_length": 394.0, "epoch": 0.040939791986077154, "grad_norm": 2.693921476403777, "kl": 0.5556640625, "learning_rate": 9.921632296337682e-07, "loss": 0.02222275733947754, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 988, "train_speed(iter/s)": 0.014216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/mean_length": 453.41668701171875, "completions/min_length": 339.0, "epoch": 0.04098122902250031, "grad_norm": 4.971435198728759, "kl": 0.56005859375, "learning_rate": 9.921445641233787e-07, "loss": 0.022375881671905518, "memory(GiB)": 77.29, "reward": 1.75, "reward_std": 0.26111647486686707, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.75, "rewards/FormatCorrectnessReward/std": 0.26111647486686707, "step": 989, "train_speed(iter/s)": 0.014222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1189.0, "completions/mean_length": 564.1666870117188, "completions/min_length": 396.0, "epoch": 0.04102266605892346, "grad_norm": 4.983019252168708, "kl": 0.52392578125, "learning_rate": 9.921258765867919e-07, "loss": 0.020955583080649376, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 990, "train_speed(iter/s)": 0.014228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1236.0, "completions/mean_length": 501.5, "completions/min_length": 301.0, "epoch": 0.041064103095346624, "grad_norm": 4.005792449510885, "kl": 0.45703125, "learning_rate": 9.921071670248437e-07, "loss": 0.018258657306432724, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.38924944400787354, "rewards/AnswerAccuracyReward/mean": 0.8333333134651184, "rewards/AnswerAccuracyReward/std": 0.3892494738101959, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 991, "train_speed(iter/s)": 0.014234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/mean_length": 444.41668701171875, "completions/min_length": 246.0, "epoch": 0.04110554013176978, "grad_norm": 4.661646295870962, "kl": 0.54296875, "learning_rate": 9.920884354383718e-07, "loss": 0.021696042269468307, "memory(GiB)": 77.29, "reward": 1.8333333730697632, "reward_std": 0.24618299305438995, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.8333333134651184, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 992, "train_speed(iter/s)": 0.014242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/mean_length": 444.91668701171875, "completions/min_length": 343.0, "epoch": 0.04114697716819293, "grad_norm": 3.9447928110484347, "kl": 0.61865234375, "learning_rate": 9.920696818282147e-07, "loss": 0.024678707122802734, "memory(GiB)": 77.29, "reward": 1.125, "reward_std": 0.4330126941204071, "rewards/AnswerAccuracyReward/mean": 0.1666666716337204, "rewards/AnswerAccuracyReward/std": 0.38924944400787354, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 993, "train_speed(iter/s)": 0.014249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/mean_length": 392.8333435058594, "completions/min_length": 304.0, "epoch": 0.04118841420461609, "grad_norm": 0.5208686573364121, "kl": 0.6650390625, "learning_rate": 9.920509061952112e-07, "loss": 0.026593949645757675, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 994, "train_speed(iter/s)": 0.014258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/mean_length": 410.41668701171875, "completions/min_length": 316.0, "epoch": 0.04122985124103924, "grad_norm": 0.4799666765334212, "kl": 0.66015625, "learning_rate": 9.920321085402022e-07, "loss": 0.02642553672194481, "memory(GiB)": 77.29, "reward": 2.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 995, "train_speed(iter/s)": 0.014266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/mean_length": 495.91668701171875, "completions/min_length": 393.0, "epoch": 0.0412712882774624, "grad_norm": 5.512247295156388, "kl": 0.6650390625, "learning_rate": 9.920132888640284e-07, "loss": 0.026641588658094406, "memory(GiB)": 77.29, "reward": 1.6666667461395264, "reward_std": 0.24618299305438995, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.6666666865348816, "rewards/FormatCorrectnessReward/std": 0.24618297815322876, "step": 996, "train_speed(iter/s)": 0.014266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1142.0, "completions/mean_length": 543.1666870117188, "completions/min_length": 365.0, "epoch": 0.04131272531388555, "grad_norm": 2.6307645107776323, "kl": 0.49755859375, "learning_rate": 9.919944471675326e-07, "loss": 0.01988711953163147, "memory(GiB)": 77.29, "reward": 1.9583333730697632, "reward_std": 0.14433756470680237, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.9583333134651184, "rewards/FormatCorrectnessReward/std": 0.14433756470680237, "step": 997, "train_speed(iter/s)": 0.014272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/mean_length": 527.0, "completions/min_length": 355.0, "epoch": 0.041354162350308706, "grad_norm": 0.5206285823398438, "kl": 0.50537109375, "learning_rate": 9.91975583451558e-07, "loss": 0.020225808024406433, "memory(GiB)": 77.29, "reward": 1.0, "reward_std": 0.0, "rewards/AnswerAccuracyReward/mean": 0.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 998, "train_speed(iter/s)": 0.014279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/mean_length": 452.16668701171875, "completions/min_length": 268.0, "epoch": 0.04139559938673186, "grad_norm": 3.7237614672815327, "kl": 0.54541015625, "learning_rate": 9.919566977169485e-07, "loss": 0.021831359714269638, "memory(GiB)": 77.29, "reward": 1.875, "reward_std": 0.22613351047039032, "rewards/AnswerAccuracyReward/mean": 1.0, "rewards/AnswerAccuracyReward/std": 0.0, "rewards/FormatCorrectnessReward/mean": 0.875, "rewards/FormatCorrectnessReward/std": 0.22613351047039032, "step": 999, "train_speed(iter/s)": 0.014287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/mean_length": 526.0833740234375, "completions/min_length": 367.0, "epoch": 0.041437036423155015, "grad_norm": 3.8459221061001543, "kl": 0.53515625, "learning_rate": 9.919377899645496e-07, "loss": 0.02139456570148468, "memory(GiB)": 77.29, "reward": 1.25, "reward_std": 0.45226702094078064, "rewards/AnswerAccuracyReward/mean": 0.25, "rewards/AnswerAccuracyReward/std": 0.45226702094078064, "rewards/FormatCorrectnessReward/mean": 1.0, "rewards/FormatCorrectnessReward/std": 0.0, "step": 1000, "train_speed(iter/s)": 0.014293 }, { "epoch": 0.041437036423155015, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.008230452674897115, "eval_completions/max_length": 1498.9958847736625, "eval_completions/mean_length": 563.0850627883472, "eval_completions/min_length": 351.9053497942387, "eval_kl": 27.711829668209877, "eval_loss": 1.2083091735839844, "eval_reward": 1.6066529799881295, "eval_reward_std": 0.2486333052930518, "eval_rewards/AnswerAccuracyReward/mean": 0.650891635091707, "eval_rewards/AnswerAccuracyReward/std": 0.175463980730669, "eval_rewards/FormatCorrectnessReward/mean": 0.9557613158912815, "eval_rewards/FormatCorrectnessReward/std": 0.10297087760864462, "eval_runtime": 6310.2515, "eval_samples_per_second": 0.039, "eval_steps_per_second": 0.003, "step": 1000 } ], "logging_steps": 1, "max_steps": 15000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }