{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.1655011655011656, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 464.8125, "epoch": 0.0011655011655011655, "grad_norm": 3.538745880126953, "kl": 0.0006933212280273438, "learning_rate": 9.996114996114996e-07, "loss": 0.0, "reward": 0.4375, "reward_std": 0.24860583990812302, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.40625, "rewards/score_task": 1.0, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 524.5625, "epoch": 0.002331002331002331, "grad_norm": 2.8740155696868896, "kl": 0.0008192062377929688, "learning_rate": 9.992229992229992e-07, "loss": 0.0, "reward": 0.28125, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.28125, "rewards/score_task": 1.0, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 595.3125, "epoch": 0.0034965034965034965, "grad_norm": 4.53759765625, "kl": 0.0010042190551757812, "learning_rate": 9.988344988344988e-07, "loss": 0.0, "reward": 0.15625, "reward_std": 0.22201896458864212, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.15625, "rewards/score_task": 1.0, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 498.28125, "epoch": 0.004662004662004662, "grad_norm": 17.004980087280273, "kl": 0.000858306884765625, "learning_rate": 9.984459984459985e-07, "loss": 0.0, "reward": 0.59375, "reward_std": 0.22201896458864212, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.46875, "rewards/score_task": 1.0, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 440.71875, "epoch": 0.005827505827505828, "grad_norm": 2.2158868312835693, "kl": 0.00086212158203125, "learning_rate": 9.98057498057498e-07, "loss": 0.0, "reward": 0.625, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.46875, "rewards/score_task": 1.5, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 570.5, "epoch": 0.006993006993006993, "grad_norm": 6.156116962432861, "kl": 0.0013628005981445312, "learning_rate": 9.976689976689975e-07, "loss": 0.0, "reward": 0.59375, "reward_std": 0.45200014114379883, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.40625, "rewards/score_task": 1.0, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 536.90625, "epoch": 0.008158508158508158, "grad_norm": 17.941560745239258, "kl": 0.001087188720703125, "learning_rate": 9.972804972804973e-07, "loss": 0.0, "reward": 0.5, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.3125, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 312.75, "epoch": 0.009324009324009324, "grad_norm": 7.910667896270752, "kl": 0.002010345458984375, "learning_rate": 9.96891996891997e-07, "loss": 0.0, "reward": 1.21875, "reward_std": 0.5123760402202606, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/format_reward": 0.90625, "rewards/score_task": 1.25, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 437.6875, "epoch": 0.01048951048951049, "grad_norm": 4.629382133483887, "kl": 0.0019817352294921875, "learning_rate": 9.965034965034964e-07, "loss": 0.0, "reward": 0.75, "reward_std": 0.3745020925998688, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.625, "rewards/score_task": 1.0, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 510.8125, "epoch": 0.011655011655011656, "grad_norm": 10.523804664611816, "kl": 0.002048492431640625, "learning_rate": 9.96114996114996e-07, "loss": 0.0, "reward": 0.65625, "reward_std": 0.3471629247069359, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.40625, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 400.65625, "epoch": 0.01282051282051282, "grad_norm": 2.9209351539611816, "kl": 0.00374603271484375, "learning_rate": 9.957264957264956e-07, "loss": 0.0, "reward": 1.03125, "reward_std": 0.4628904387354851, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.84375, "rewards/score_task": 1.3333333333333333, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 471.5, "epoch": 0.013986013986013986, "grad_norm": 6.433743953704834, "kl": 0.00469970703125, "learning_rate": 9.953379953379953e-07, "loss": 0.0, "reward": 0.875, "reward_std": 0.4663814753293991, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.8125, "rewards/score_task": 1.5, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 477.40625, "epoch": 0.015151515151515152, "grad_norm": 21.03217887878418, "kl": 0.00638580322265625, "learning_rate": 9.949494949494949e-07, "loss": 0.0, "reward": 0.96875, "reward_std": 0.5347195863723755, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.78125, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 555.90625, "epoch": 0.016317016317016316, "grad_norm": 3.559480905532837, "kl": 0.008636474609375, "learning_rate": 9.945609945609945e-07, "loss": 0.0, "reward": 0.84375, "reward_std": 0.5116237476468086, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.75, "rewards/format_reward": 0.78125, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 541.5625, "epoch": 0.017482517482517484, "grad_norm": 2.4178566932678223, "kl": 0.0124969482421875, "learning_rate": 9.941724941724941e-07, "loss": 0.0, "reward": 0.78125, "reward_std": 0.40882333368062973, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.75, "rewards/format_reward": 0.625, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 327.53125, "epoch": 0.018648018648018648, "grad_norm": 13.203842163085938, "kl": 0.0140533447265625, "learning_rate": 9.937839937839938e-07, "loss": 0.0, "reward": 1.03125, "reward_std": 0.2759450078010559, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.9375, "rewards/score_task": 1.0, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 334.8125, "epoch": 0.019813519813519812, "grad_norm": 3.8159587383270264, "kl": 0.0088958740234375, "learning_rate": 9.933954933954934e-07, "loss": 0.0, "reward": 1.125, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 493.21875, "epoch": 0.02097902097902098, "grad_norm": 1.869828224182129, "kl": 0.00896453857421875, "learning_rate": 9.93006993006993e-07, "loss": 0.0, "reward": 0.5, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.4375, "rewards/score_task": 1.0, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 316.8125, "epoch": 0.022144522144522144, "grad_norm": 7.761906147003174, "kl": 0.01288604736328125, "learning_rate": 9.926184926184926e-07, "loss": 0.0, "reward": 1.03125, "reward_std": 0.35564958304166794, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.3333333333333333, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 454.375, "epoch": 0.023310023310023312, "grad_norm": 3.539435863494873, "kl": 0.00811004638671875, "learning_rate": 9.92229992229992e-07, "loss": 0.0, "reward": 1.0625, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 534.3125, "epoch": 0.024475524475524476, "grad_norm": 7.701175689697266, "kl": 0.01033782958984375, "learning_rate": 9.91841491841492e-07, "loss": 0.0, "reward": 0.8125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 426.78125, "epoch": 0.02564102564102564, "grad_norm": 12.345130920410156, "kl": 0.0100860595703125, "learning_rate": 9.914529914529915e-07, "loss": 0.0, "reward": 0.96875, "reward_std": 0.2630179077386856, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 479.625, "epoch": 0.026806526806526808, "grad_norm": 2.695384979248047, "kl": 0.01190185546875, "learning_rate": 9.91064491064491e-07, "loss": 0.0, "reward": 0.90625, "reward_std": 0.29384811222553253, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.71875, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 477.375, "epoch": 0.027972027972027972, "grad_norm": 4.525103569030762, "kl": 0.0083770751953125, "learning_rate": 9.906759906759906e-07, "loss": 0.0, "reward": 0.8125, "reward_std": 0.3104073107242584, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.65625, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 474.6875, "epoch": 0.029137529137529136, "grad_norm": 5.728568077087402, "kl": 0.010101318359375, "learning_rate": 9.902874902874902e-07, "loss": 0.0, "reward": 0.90625, "reward_std": 0.3787454217672348, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.0625, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.71875, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 291.5625, "epoch": 0.030303030303030304, "grad_norm": 6.2152323722839355, "kl": 0.0230712890625, "learning_rate": 9.898989898989898e-07, "loss": 0.0, "reward": 1.34375, "reward_std": 0.494472935795784, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0625, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 485.15625, "epoch": 0.03146853146853147, "grad_norm": 1.1838486194610596, "kl": 0.014373779296875, "learning_rate": 9.895104895104894e-07, "loss": 0.0, "reward": 0.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.71875, "rewards/score_task": 1.0, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 480.96875, "epoch": 0.03263403263403263, "grad_norm": 2.7154533863067627, "kl": 0.0200653076171875, "learning_rate": 9.89121989121989e-07, "loss": 0.0, "reward": 1.09375, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 483.125, "epoch": 0.0337995337995338, "grad_norm": 6.2733588218688965, "kl": 0.0172882080078125, "learning_rate": 9.887334887334887e-07, "loss": 0.0, "reward": 1.125, "reward_std": 0.44403792917728424, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.84375, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 362.09375, "epoch": 0.03496503496503497, "grad_norm": 9.701471328735352, "kl": 0.023590087890625, "learning_rate": 9.883449883449883e-07, "loss": 0.0, "reward": 0.90625, "reward_std": 0.3061639815568924, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.71875, "rewards/score_task": 1.0, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 390.75, "epoch": 0.03613053613053613, "grad_norm": 5.922471523284912, "kl": 0.029510498046875, "learning_rate": 9.87956487956488e-07, "loss": 0.0, "reward": 0.9375, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 468.3125, "epoch": 0.037296037296037296, "grad_norm": 2.2394609451293945, "kl": 0.0210113525390625, "learning_rate": 9.875679875679876e-07, "loss": 0.0, "reward": 1.1875, "reward_std": 0.4053322970867157, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 284.40625, "epoch": 0.038461538461538464, "grad_norm": 7.822108745574951, "kl": 0.0888519287109375, "learning_rate": 9.871794871794872e-07, "loss": 0.0001, "reward": 1.3125, "reward_std": 0.4671337679028511, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 500.875, "epoch": 0.039627039627039624, "grad_norm": 1.8779712915420532, "kl": 0.01904296875, "learning_rate": 9.867909867909866e-07, "loss": 0.0, "reward": 0.5625, "reward_std": 0.24860583990812302, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.75, "rewards/format_reward": 0.46875, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 444.09375, "epoch": 0.04079254079254079, "grad_norm": 2.456920623779297, "kl": 0.035552978515625, "learning_rate": 9.864024864024865e-07, "loss": 0.0, "reward": 0.625, "reward_std": 0.2896047830581665, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.46875, "rewards/score_task": 2.0, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 257.5, "epoch": 0.04195804195804196, "grad_norm": 11.376919746398926, "kl": 0.06744384765625, "learning_rate": 9.86013986013986e-07, "loss": 0.0001, "reward": 1.1875, "reward_std": 0.49216993153095245, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.9375, "rewards/score_task": 1.0, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 369.875, "epoch": 0.04312354312354312, "grad_norm": 2.0796971321105957, "kl": 0.036712646484375, "learning_rate": 9.856254856254855e-07, "loss": 0.0, "reward": 1.125, "reward_std": 0.26726123690605164, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 443.625, "epoch": 0.04428904428904429, "grad_norm": 7.44196891784668, "kl": 0.029571533203125, "learning_rate": 9.852369852369851e-07, "loss": 0.0, "reward": 0.5625, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.5, "rewards/score_task": 2.0, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 156.375, "epoch": 0.045454545454545456, "grad_norm": 4.882200241088867, "kl": 0.084228515625, "learning_rate": 9.848484848484847e-07, "loss": 0.0001, "reward": 1.34375, "reward_std": 0.4397946000099182, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.28125, "rewards/comparison_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 478.84375, "epoch": 0.046620046620046623, "grad_norm": 2.2851486206054688, "kl": 0.024505615234375, "learning_rate": 9.844599844599844e-07, "loss": 0.0, "reward": 0.75, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.71875, "rewards/score_task": 1.0, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 379.96875, "epoch": 0.047785547785547784, "grad_norm": 5.070146083831787, "kl": 0.038116455078125, "learning_rate": 9.84071484071484e-07, "loss": 0.0, "reward": 0.875, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 416.46875, "epoch": 0.04895104895104895, "grad_norm": 10.295511245727539, "kl": 0.034454345703125, "learning_rate": 9.836829836829836e-07, "loss": 0.0, "reward": 0.90625, "reward_std": 0.3198433741927147, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 273.125, "epoch": 0.05011655011655012, "grad_norm": 2.5976150035858154, "kl": 0.057586669921875, "learning_rate": 9.832944832944833e-07, "loss": 0.0001, "reward": 0.875, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 353.28125, "epoch": 0.05128205128205128, "grad_norm": 4.0504865646362305, "kl": 0.065155029296875, "learning_rate": 9.829059829059829e-07, "loss": 0.0001, "reward": 1.28125, "reward_std": 0.3198433741927147, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 478.75, "epoch": 0.05244755244755245, "grad_norm": 1.135304570198059, "kl": 0.0367431640625, "learning_rate": 9.825174825174825e-07, "loss": 0.0, "reward": 1.03125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 196.59375, "epoch": 0.053613053613053616, "grad_norm": 6.580746650695801, "kl": 0.0831298828125, "learning_rate": 9.821289821289821e-07, "loss": 0.0001, "reward": 1.15625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 420.5625, "epoch": 0.054778554778554776, "grad_norm": 6.723421573638916, "kl": 0.0419921875, "learning_rate": 9.817404817404818e-07, "loss": 0.0, "reward": 1.0625, "reward_std": 0.38223645836114883, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.9375, "rewards/score_task": 1.5, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 411.84375, "epoch": 0.055944055944055944, "grad_norm": 6.957960605621338, "kl": 0.06591796875, "learning_rate": 9.813519813519812e-07, "loss": 0.0001, "reward": 1.21875, "reward_std": 0.493720643222332, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.09375, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.9375, "rewards/score_task": 2.0, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 370.34375, "epoch": 0.05710955710955711, "grad_norm": 23.184762954711914, "kl": 0.073455810546875, "learning_rate": 9.80963480963481e-07, "loss": 0.0001, "reward": 0.6875, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 449.78125, "epoch": 0.05827505827505827, "grad_norm": 10.027260780334473, "kl": 0.046173095703125, "learning_rate": 9.805749805749806e-07, "loss": 0.0, "reward": 0.625, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.09375, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.5, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 281.21875, "epoch": 0.05944055944055944, "grad_norm": 10.98452377319336, "kl": 0.100433349609375, "learning_rate": 9.8018648018648e-07, "loss": 0.0001, "reward": 1.3125, "reward_std": 0.38298875093460083, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 1.5, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 450.78125, "epoch": 0.06060606060606061, "grad_norm": 1.803593635559082, "kl": 0.032318115234375, "learning_rate": 9.797979797979797e-07, "loss": 0.0, "reward": 0.28125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.25, "rewards/score_task": 1.0, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 262.5, "epoch": 0.06177156177156177, "grad_norm": 6.191781044006348, "kl": 0.07080078125, "learning_rate": 9.794094794094793e-07, "loss": 0.0001, "reward": 1.21875, "reward_std": 0.3987956568598747, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 478.53125, "epoch": 0.06293706293706294, "grad_norm": 3.35585618019104, "kl": 0.05419921875, "learning_rate": 9.79020979020979e-07, "loss": 0.0001, "reward": 0.65625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.46875, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 400.0, "epoch": 0.0641025641025641, "grad_norm": 6.3674845695495605, "kl": 0.0548095703125, "learning_rate": 9.786324786324786e-07, "loss": 0.0001, "reward": 0.84375, "reward_std": 0.3061639815568924, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.09375, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.71875, "rewards/score_task": 1.0, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 392.46875, "epoch": 0.06526806526806526, "grad_norm": 10.6104097366333, "kl": 0.056854248046875, "learning_rate": 9.782439782439782e-07, "loss": 0.0001, "reward": 0.65625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 496.8125, "epoch": 0.06643356643356643, "grad_norm": 12.83187198638916, "kl": 0.043548583984375, "learning_rate": 9.778554778554778e-07, "loss": 0.0, "reward": 0.875, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.03125, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 426.0625, "epoch": 0.0675990675990676, "grad_norm": 13.33549690246582, "kl": 0.047882080078125, "learning_rate": 9.774669774669774e-07, "loss": 0.0, "reward": 0.9375, "reward_std": 0.249358132481575, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.71875, "rewards/score_task": 1.0, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 526.90625, "epoch": 0.06876456876456877, "grad_norm": 1.461938738822937, "kl": 0.0289306640625, "learning_rate": 9.77078477078477e-07, "loss": 0.0, "reward": 0.78125, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.65625, "rewards/score_task": 1.0, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 398.09375, "epoch": 0.06993006993006994, "grad_norm": 6.535000324249268, "kl": 0.05029296875, "learning_rate": 9.766899766899767e-07, "loss": 0.0001, "reward": 1.1875, "reward_std": 0.38298875093460083, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.3125, "rewards/comparison_task": 1.5, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 401.65625, "epoch": 0.07109557109557109, "grad_norm": 5.389815807342529, "kl": 0.05352783203125, "learning_rate": 9.763014763014763e-07, "loss": 0.0001, "reward": 1.40625, "reward_std": 0.4807935431599617, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 1.5, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 462.78125, "epoch": 0.07226107226107226, "grad_norm": 4.489046573638916, "kl": 0.044830322265625, "learning_rate": 9.759129759129757e-07, "loss": 0.0, "reward": 0.84375, "reward_std": 0.3061639815568924, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.09375, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.71875, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 355.125, "epoch": 0.07342657342657342, "grad_norm": 4.437483787536621, "kl": 0.052398681640625, "learning_rate": 9.755244755244756e-07, "loss": 0.0001, "reward": 1.1875, "reward_std": 0.249358132481575, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 266.34375, "epoch": 0.07459207459207459, "grad_norm": 10.862178802490234, "kl": 0.07537841796875, "learning_rate": 9.751359751359752e-07, "loss": 0.0001, "reward": 1.5, "reward_std": 0.3650856465101242, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.5, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 488.4375, "epoch": 0.07575757575757576, "grad_norm": 4.659424781799316, "kl": 0.05145263671875, "learning_rate": 9.747474747474746e-07, "loss": 0.0001, "reward": 1.15625, "reward_std": 0.4397946000099182, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.09375, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.96875, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 377.34375, "epoch": 0.07692307692307693, "grad_norm": 7.502078533172607, "kl": 0.06787109375, "learning_rate": 9.743589743589742e-07, "loss": 0.0001, "reward": 1.4375, "reward_std": 0.4671337679028511, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.28125, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 409.40625, "epoch": 0.0780885780885781, "grad_norm": 3.867584466934204, "kl": 0.05792236328125, "learning_rate": 9.739704739704739e-07, "loss": 0.0001, "reward": 1.375, "reward_std": 0.4355512708425522, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 440.40625, "epoch": 0.07925407925407925, "grad_norm": 4.3467488288879395, "kl": 0.0626220703125, "learning_rate": 9.735819735819735e-07, "loss": 0.0001, "reward": 0.9375, "reward_std": 0.249358132481575, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 284.84375, "epoch": 0.08041958041958042, "grad_norm": 3.4992847442626953, "kl": 0.06201171875, "learning_rate": 9.731934731934731e-07, "loss": 0.0001, "reward": 1.375, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.3333333333333333, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 499.25, "epoch": 0.08158508158508158, "grad_norm": 1.836920976638794, "kl": 0.0513916015625, "learning_rate": 9.728049728049727e-07, "loss": 0.0001, "reward": 0.84375, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 511.8125, "epoch": 0.08275058275058275, "grad_norm": 4.456302165985107, "kl": 0.04815673828125, "learning_rate": 9.724164724164724e-07, "loss": 0.0, "reward": 0.9375, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 493.6875, "epoch": 0.08391608391608392, "grad_norm": 5.069329738616943, "kl": 0.0635986328125, "learning_rate": 9.72027972027972e-07, "loss": 0.0001, "reward": 0.90625, "reward_std": 0.35564958304166794, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.71875, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 486.21875, "epoch": 0.08508158508158509, "grad_norm": 1.9067591428756714, "kl": 0.0548095703125, "learning_rate": 9.716394716394716e-07, "loss": 0.0001, "reward": 0.375, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.25, "rewards/score_task": 2.0, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 251.84375, "epoch": 0.08624708624708624, "grad_norm": 5.2235565185546875, "kl": 0.0643310546875, "learning_rate": 9.712509712509713e-07, "loss": 0.0001, "reward": 1.5625, "reward_std": 0.5175491571426392, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.3125, "rewards/comparison_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 561.75, "epoch": 0.08741258741258741, "grad_norm": 2.6209218502044678, "kl": 0.0472412109375, "learning_rate": 9.708624708624709e-07, "loss": 0.0, "reward": 1.0, "reward_std": 0.26726123690605164, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 488.125, "epoch": 0.08857808857808858, "grad_norm": 3.1442127227783203, "kl": 0.04730224609375, "learning_rate": 9.704739704739703e-07, "loss": 0.0, "reward": 1.25, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 425.1875, "epoch": 0.08974358974358974, "grad_norm": 4.578776836395264, "kl": 0.05242919921875, "learning_rate": 9.700854700854701e-07, "loss": 0.0001, "reward": 1.3125, "reward_std": 0.3924051970243454, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 392.71875, "epoch": 0.09090909090909091, "grad_norm": 3.687410593032837, "kl": 0.04791259765625, "learning_rate": 9.696969696969698e-07, "loss": 0.0, "reward": 1.28125, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 544.90625, "epoch": 0.09207459207459208, "grad_norm": 4.179947853088379, "kl": 0.0494384765625, "learning_rate": 9.693084693084692e-07, "loss": 0.0, "reward": 0.90625, "reward_std": 0.3198433741927147, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 437.21875, "epoch": 0.09324009324009325, "grad_norm": 3.146735429763794, "kl": 0.05316162109375, "learning_rate": 9.689199689199688e-07, "loss": 0.0001, "reward": 0.90625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 388.0, "epoch": 0.0944055944055944, "grad_norm": 19.455196380615234, "kl": 0.06353759765625, "learning_rate": 9.685314685314684e-07, "loss": 0.0001, "reward": 1.375, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.34375, "rewards/comparison_task": 1.5, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 548.78125, "epoch": 0.09557109557109557, "grad_norm": 9.02927017211914, "kl": 0.0384521484375, "learning_rate": 9.68142968142968e-07, "loss": 0.0, "reward": 1.21875, "reward_std": 0.47062480449676514, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.5, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 393.90625, "epoch": 0.09673659673659674, "grad_norm": 7.148171901702881, "kl": 0.04473876953125, "learning_rate": 9.677544677544677e-07, "loss": 0.0, "reward": 0.90625, "reward_std": 0.3061639815568924, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.3333333333333333, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 430.25, "epoch": 0.0979020979020979, "grad_norm": 12.719574928283691, "kl": 0.09429931640625, "learning_rate": 9.673659673659673e-07, "loss": 0.0001, "reward": 0.65625, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.09375, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 417.8125, "epoch": 0.09906759906759907, "grad_norm": 5.325240135192871, "kl": 0.0570068359375, "learning_rate": 9.66977466977467e-07, "loss": 0.0001, "reward": 1.125, "reward_std": 0.3945523276925087, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.09375, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 574.53125, "epoch": 0.10023310023310024, "grad_norm": 2.8836400508880615, "kl": 0.05975341796875, "learning_rate": 9.665889665889666e-07, "loss": 0.0001, "reward": 1.25, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 484.84375, "epoch": 0.10139860139860139, "grad_norm": 1.0963538885116577, "kl": 0.0518798828125, "learning_rate": 9.662004662004662e-07, "loss": 0.0001, "reward": 0.8125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 489.25, "epoch": 0.10256410256410256, "grad_norm": 2.0923655033111572, "kl": 0.0501708984375, "learning_rate": 9.658119658119658e-07, "loss": 0.0001, "reward": 1.0625, "reward_std": 0.3514062538743019, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 564.65625, "epoch": 0.10372960372960373, "grad_norm": 2.5235090255737305, "kl": 0.05767822265625, "learning_rate": 9.654234654234654e-07, "loss": 0.0001, "reward": 0.53125, "reward_std": 0.1602174937725067, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.0625, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.46875, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 365.28125, "epoch": 0.1048951048951049, "grad_norm": 15.712270736694336, "kl": 0.0826416015625, "learning_rate": 9.650349650349648e-07, "loss": 0.0001, "reward": 1.46875, "reward_std": 0.5038893818855286, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.3125, "rewards/comparison_task": 1.5, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 371.8125, "epoch": 0.10606060606060606, "grad_norm": 4.732244968414307, "kl": 0.0755615234375, "learning_rate": 9.646464646464647e-07, "loss": 0.0001, "reward": 1.25, "reward_std": 0.481486439704895, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 476.09375, "epoch": 0.10722610722610723, "grad_norm": 4.046402454376221, "kl": 0.06207275390625, "learning_rate": 9.642579642579643e-07, "loss": 0.0001, "reward": 1.46875, "reward_std": 0.5784111469984055, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 0.96875, "rewards/score_task": 0.0, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 477.0, "epoch": 0.10839160839160839, "grad_norm": 4.641849994659424, "kl": 0.0457763671875, "learning_rate": 9.638694638694637e-07, "loss": 0.0, "reward": 0.9375, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 705.78125, "epoch": 0.10955710955710955, "grad_norm": 1.6420258283615112, "kl": 0.0416259765625, "learning_rate": 9.634809634809634e-07, "loss": 0.0, "reward": 1.03125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 289.46875, "epoch": 0.11072261072261072, "grad_norm": 19.641193389892578, "kl": 0.1019287109375, "learning_rate": 9.63092463092463e-07, "loss": 0.0001, "reward": 1.40625, "reward_std": 0.5195090994238853, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.40625, "rewards/comparison_task": 1.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 406.71875, "epoch": 0.11188811188811189, "grad_norm": 6.50870943069458, "kl": 0.07318115234375, "learning_rate": 9.627039627039626e-07, "loss": 0.0001, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.71875, "rewards/score_task": 1.0, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 370.4375, "epoch": 0.11305361305361306, "grad_norm": 4.022256374359131, "kl": 0.071533203125, "learning_rate": 9.623154623154622e-07, "loss": 0.0001, "reward": 1.34375, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.3333333333333333, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 387.0625, "epoch": 0.11421911421911422, "grad_norm": 5.392679691314697, "kl": 0.0975341796875, "learning_rate": 9.619269619269619e-07, "loss": 0.0001, "reward": 1.4375, "reward_std": 0.5389629155397415, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.09375, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 0.9375, "rewards/score_task": 1.5, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 525.71875, "epoch": 0.11538461538461539, "grad_norm": 4.132760047912598, "kl": 0.0684814453125, "learning_rate": 9.615384615384615e-07, "loss": 0.0001, "reward": 1.25, "reward_std": 0.4671337679028511, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 452.09375, "epoch": 0.11655011655011654, "grad_norm": 12.979758262634277, "kl": 0.07391357421875, "learning_rate": 9.611499611499611e-07, "loss": 0.0001, "reward": 1.46875, "reward_std": 0.494472935795784, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 603.46875, "epoch": 0.11771561771561771, "grad_norm": 3.693584680557251, "kl": 0.07562255859375, "learning_rate": 9.607614607614607e-07, "loss": 0.0001, "reward": 0.96875, "reward_std": 0.3377464786171913, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 485.59375, "epoch": 0.11888111888111888, "grad_norm": 3.9985692501068115, "kl": 0.07513427734375, "learning_rate": 9.603729603729604e-07, "loss": 0.0001, "reward": 0.875, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.09375, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.71875, "rewards/score_task": 0.0, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 458.46875, "epoch": 0.12004662004662005, "grad_norm": 5.552523612976074, "kl": 0.0872802734375, "learning_rate": 9.5998445998446e-07, "loss": 0.0001, "reward": 1.28125, "reward_std": 0.35564958304166794, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 538.625, "epoch": 0.12121212121212122, "grad_norm": 6.804409503936768, "kl": 0.072265625, "learning_rate": 9.595959595959596e-07, "loss": 0.0001, "reward": 0.71875, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.0625, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.5, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 580.15625, "epoch": 0.12237762237762238, "grad_norm": 1.6134785413742065, "kl": 0.05584716796875, "learning_rate": 9.592074592074592e-07, "loss": 0.0001, "reward": 1.15625, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 393.53125, "epoch": 0.12354312354312354, "grad_norm": 6.883267402648926, "kl": 0.10333251953125, "learning_rate": 9.588189588189589e-07, "loss": 0.0001, "reward": 1.5, "reward_std": 0.49022960662841797, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.34375, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 572.90625, "epoch": 0.1247086247086247, "grad_norm": 2.428581714630127, "kl": 0.05303955078125, "learning_rate": 9.584304584304583e-07, "loss": 0.0001, "reward": 1.03125, "reward_std": 0.3787454217672348, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 402.53125, "epoch": 0.1258741258741259, "grad_norm": 23.773250579833984, "kl": 0.1387939453125, "learning_rate": 9.58041958041958e-07, "loss": 0.0001, "reward": 0.96875, "reward_std": 0.2630179077386856, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 1.5, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 584.21875, "epoch": 0.12703962703962704, "grad_norm": 6.72467565536499, "kl": 0.07867431640625, "learning_rate": 9.576534576534575e-07, "loss": 0.0001, "reward": 1.21875, "reward_std": 0.3471629247069359, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.09375, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 593.65625, "epoch": 0.1282051282051282, "grad_norm": 1.3530240058898926, "kl": 0.052001953125, "learning_rate": 9.572649572649572e-07, "loss": 0.0001, "reward": 0.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 638.15625, "epoch": 0.12937062937062938, "grad_norm": 4.701434135437012, "kl": 0.048828125, "learning_rate": 9.568764568764568e-07, "loss": 0.0, "reward": 1.34375, "reward_std": 0.494472935795784, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 498.46875, "epoch": 0.13053613053613053, "grad_norm": 5.449715614318848, "kl": 0.07489013671875, "learning_rate": 9.564879564879564e-07, "loss": 0.0001, "reward": 0.90625, "reward_std": 0.29384811222553253, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.71875, "rewards/score_task": 1.0, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 752.53125, "epoch": 0.1317016317016317, "grad_norm": 2.3221986293792725, "kl": 0.04443359375, "learning_rate": 9.56099456099456e-07, "loss": 0.0, "reward": 1.34375, "reward_std": 0.4628904387354851, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 527.15625, "epoch": 0.13286713286713286, "grad_norm": 3.8853156566619873, "kl": 0.07843017578125, "learning_rate": 9.557109557109557e-07, "loss": 0.0001, "reward": 0.96875, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 544.46875, "epoch": 0.13403263403263405, "grad_norm": 1.4354820251464844, "kl": 0.0545654296875, "learning_rate": 9.553224553224553e-07, "loss": 0.0001, "reward": 0.84375, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.71875, "rewards/score_task": 1.0, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 409.28125, "epoch": 0.1351981351981352, "grad_norm": 3.0735044479370117, "kl": 0.0743408203125, "learning_rate": 9.54933954933955e-07, "loss": 0.0001, "reward": 1.46875, "reward_std": 0.4807935431599617, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.3333333333333333, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 756.09375, "epoch": 0.13636363636363635, "grad_norm": 3.0346522331237793, "kl": 0.04791259765625, "learning_rate": 9.545454545454546e-07, "loss": 0.0, "reward": 0.90625, "reward_std": 0.3061639815568924, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.75, "rewards/format_reward": 0.75, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 550.1875, "epoch": 0.13752913752913754, "grad_norm": 4.542072296142578, "kl": 0.1181640625, "learning_rate": 9.541569541569542e-07, "loss": 0.0001, "reward": 1.21875, "reward_std": 0.3061639815568924, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 522.90625, "epoch": 0.1386946386946387, "grad_norm": 1.9401509761810303, "kl": 0.0538330078125, "learning_rate": 9.537684537684538e-07, "loss": 0.0001, "reward": 1.4375, "reward_std": 0.4492306634783745, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 666.5, "epoch": 0.13986013986013987, "grad_norm": 10.723546028137207, "kl": 0.041748046875, "learning_rate": 9.533799533799533e-07, "loss": 0.0, "reward": 0.84375, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.71875, "rewards/score_task": 1.5, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 587.5, "epoch": 0.14102564102564102, "grad_norm": 6.485595703125, "kl": 0.09832763671875, "learning_rate": 9.529914529914528e-07, "loss": 0.0001, "reward": 1.125, "reward_std": 0.3745020925998688, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 613.4375, "epoch": 0.14219114219114218, "grad_norm": 1.8983736038208008, "kl": 0.05035400390625, "learning_rate": 9.526029526029526e-07, "loss": 0.0001, "reward": 1.125, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 449.28125, "epoch": 0.14335664335664336, "grad_norm": 8.413186073303223, "kl": 0.11541748046875, "learning_rate": 9.522144522144522e-07, "loss": 0.0001, "reward": 0.96875, "reward_std": 0.3198433741927147, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.71875, "rewards/score_task": 1.0, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 776.0625, "epoch": 0.1445221445221445, "grad_norm": 1.7356297969818115, "kl": 0.048095703125, "learning_rate": 9.518259518259518e-07, "loss": 0.0, "reward": 1.0, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 512.84375, "epoch": 0.1456876456876457, "grad_norm": 2.15535831451416, "kl": 0.05584716796875, "learning_rate": 9.514374514374513e-07, "loss": 0.0001, "reward": 1.15625, "reward_std": 0.5145231708884239, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 0.90625, "rewards/score_task": 1.0, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 507.21875, "epoch": 0.14685314685314685, "grad_norm": 2.939136266708374, "kl": 0.17120361328125, "learning_rate": 9.51048951048951e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.26726123690605164, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 639.8125, "epoch": 0.14801864801864803, "grad_norm": 1.0609225034713745, "kl": 0.06402587890625, "learning_rate": 9.506604506604506e-07, "loss": 0.0001, "reward": 0.8125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 467.90625, "epoch": 0.14918414918414918, "grad_norm": 3.8447279930114746, "kl": 0.09454345703125, "learning_rate": 9.502719502719501e-07, "loss": 0.0001, "reward": 1.46875, "reward_std": 0.3471629247069359, "rewards/accuracy_reward": 0.375, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.5, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 552.8125, "epoch": 0.15034965034965034, "grad_norm": 4.611315727233887, "kl": 0.096923828125, "learning_rate": 9.498834498834499e-07, "loss": 0.0001, "reward": 0.78125, "reward_std": 0.2630179077386856, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 744.53125, "epoch": 0.15151515151515152, "grad_norm": 2.403369665145874, "kl": 0.0467529296875, "learning_rate": 9.494949494949495e-07, "loss": 0.0, "reward": 1.125, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 549.40625, "epoch": 0.15268065268065267, "grad_norm": 6.583850383758545, "kl": 0.07965087890625, "learning_rate": 9.491064491064491e-07, "loss": 0.0001, "reward": 1.28125, "reward_std": 0.3471629247069359, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 587.84375, "epoch": 0.15384615384615385, "grad_norm": 5.168365478515625, "kl": 0.128662109375, "learning_rate": 9.487179487179486e-07, "loss": 0.0001, "reward": 1.28125, "reward_std": 0.3377464786171913, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.40625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 713.53125, "epoch": 0.155011655011655, "grad_norm": 4.069382190704346, "kl": 0.05364990234375, "learning_rate": 9.483294483294483e-07, "loss": 0.0001, "reward": 0.9375, "reward_std": 0.3104073107242584, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 555.0625, "epoch": 0.1561771561771562, "grad_norm": 4.48373556137085, "kl": 0.1026611328125, "learning_rate": 9.479409479409479e-07, "loss": 0.0001, "reward": 1.46875, "reward_std": 0.4355708882212639, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 717.71875, "epoch": 0.15734265734265734, "grad_norm": 2.873666763305664, "kl": 0.05224609375, "learning_rate": 9.475524475524476e-07, "loss": 0.0001, "reward": 1.375, "reward_std": 0.48718400299549103, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 0.96875, "rewards/score_task": 1.5, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 916.625, "epoch": 0.1585081585081585, "grad_norm": 1.5718294382095337, "kl": 0.05047607421875, "learning_rate": 9.471639471639471e-07, "loss": 0.0001, "reward": 0.8125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.75, "rewards/format_reward": 0.75, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 920.09375, "epoch": 0.15967365967365968, "grad_norm": 1.9318997859954834, "kl": 0.0477294921875, "learning_rate": 9.467754467754468e-07, "loss": 0.0, "reward": 1.0625, "reward_std": 0.3104073107242584, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 767.5625, "epoch": 0.16083916083916083, "grad_norm": 4.977046966552734, "kl": 0.0982666015625, "learning_rate": 9.463869463869464e-07, "loss": 0.0001, "reward": 1.375, "reward_std": 0.48503687232732773, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.96875, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 511.84375, "epoch": 0.16200466200466201, "grad_norm": 5.190191268920898, "kl": 0.0902099609375, "learning_rate": 9.459984459984459e-07, "loss": 0.0001, "reward": 1.25, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 689.84375, "epoch": 0.16317016317016317, "grad_norm": 3.0159456729888916, "kl": 0.09295654296875, "learning_rate": 9.456099456099455e-07, "loss": 0.0001, "reward": 1.5625, "reward_std": 0.4765502139925957, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 791.0, "epoch": 0.16433566433566432, "grad_norm": 2.859438419342041, "kl": 0.0623779296875, "learning_rate": 9.452214452214452e-07, "loss": 0.0001, "reward": 0.84375, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.5, "rewards/score_task": 2.0, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 1086.5625, "epoch": 0.1655011655011655, "grad_norm": 2.963080406188965, "kl": 0.06707763671875, "learning_rate": 9.448329448329449e-07, "loss": 0.0001, "reward": 1.5, "reward_std": 0.5260358154773712, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 632.78125, "epoch": 0.16666666666666666, "grad_norm": 3.451690673828125, "kl": 0.06201171875, "learning_rate": 9.444444444444444e-07, "loss": 0.0001, "reward": 0.96875, "reward_std": 0.3196365684270859, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.71875, "rewards/score_task": 1.3333333333333333, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 762.53125, "epoch": 0.16783216783216784, "grad_norm": 4.361408233642578, "kl": 0.084716796875, "learning_rate": 9.44055944055944e-07, "loss": 0.0001, "reward": 1.34375, "reward_std": 0.5913382470607758, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.9375, "rewards/score_task": 1.0, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 874.03125, "epoch": 0.168997668997669, "grad_norm": 1.1066713333129883, "kl": 0.06402587890625, "learning_rate": 9.436674436674437e-07, "loss": 0.0001, "reward": 0.625, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 523.0, "epoch": 0.17016317016317017, "grad_norm": 12.08517074584961, "kl": 0.16094970703125, "learning_rate": 9.432789432789432e-07, "loss": 0.0002, "reward": 1.53125, "reward_std": 0.494472935795784, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.28125, "rewards/comparison_task": 1.5, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 714.53125, "epoch": 0.17132867132867133, "grad_norm": 18.253753662109375, "kl": 0.1090087890625, "learning_rate": 9.428904428904428e-07, "loss": 0.0001, "reward": 1.25, "reward_std": 0.5102798417210579, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.09375, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 651.03125, "epoch": 0.17249417249417248, "grad_norm": 8.226325035095215, "kl": 0.1346435546875, "learning_rate": 9.425019425019424e-07, "loss": 0.0001, "reward": 0.96875, "reward_std": 0.3787454217672348, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.71875, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 477.5, "epoch": 0.17365967365967366, "grad_norm": 6.7204909324646, "kl": 0.1427001953125, "learning_rate": 9.421134421134422e-07, "loss": 0.0001, "reward": 1.34375, "reward_std": 0.4628904387354851, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 877.4375, "epoch": 0.17482517482517482, "grad_norm": 7.096918106079102, "kl": 0.0697021484375, "learning_rate": 9.417249417249417e-07, "loss": 0.0001, "reward": 1.40625, "reward_std": 0.5395593345165253, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 0.9375, "rewards/score_task": 2.0, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 834.53125, "epoch": 0.175990675990676, "grad_norm": 4.323293209075928, "kl": 0.10888671875, "learning_rate": 9.413364413364413e-07, "loss": 0.0001, "reward": 1.125, "reward_std": 0.4053322970867157, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.71875, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 908.34375, "epoch": 0.17715617715617715, "grad_norm": 0.7870212197303772, "kl": 0.06878662109375, "learning_rate": 9.409479409479409e-07, "loss": 0.0001, "reward": 1.0625, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 743.1875, "epoch": 0.17832167832167833, "grad_norm": 2.2145771980285645, "kl": 0.0670166015625, "learning_rate": 9.405594405594405e-07, "loss": 0.0001, "reward": 1.3125, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 613.5625, "epoch": 0.1794871794871795, "grad_norm": 3.979191303253174, "kl": 0.1068115234375, "learning_rate": 9.401709401709401e-07, "loss": 0.0001, "reward": 1.375, "reward_std": 0.4671337679028511, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 907.0625, "epoch": 0.18065268065268064, "grad_norm": 2.136356830596924, "kl": 0.06884765625, "learning_rate": 9.397824397824397e-07, "loss": 0.0001, "reward": 1.0625, "reward_std": 0.2587745785713196, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 760.375, "epoch": 0.18181818181818182, "grad_norm": 1.6035881042480469, "kl": 0.0762939453125, "learning_rate": 9.393939393939395e-07, "loss": 0.0001, "reward": 0.65625, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.5, "rewards/score_task": 2.0, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 934.53125, "epoch": 0.18298368298368298, "grad_norm": 1.8581409454345703, "kl": 0.0709228515625, "learning_rate": 9.39005439005439e-07, "loss": 0.0001, "reward": 1.40625, "reward_std": 0.4628904387354851, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 713.53125, "epoch": 0.18414918414918416, "grad_norm": 1.0222011804580688, "kl": 0.0750732421875, "learning_rate": 9.386169386169386e-07, "loss": 0.0001, "reward": 1.15625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 541.90625, "epoch": 0.1853146853146853, "grad_norm": 6.4049530029296875, "kl": 0.104736328125, "learning_rate": 9.382284382284382e-07, "loss": 0.0001, "reward": 1.40625, "reward_std": 0.3787454217672348, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.3333333333333333, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 633.375, "epoch": 0.1864801864801865, "grad_norm": 0.005755224265158176, "kl": 0.07568359375, "learning_rate": 9.378399378399377e-07, "loss": 0.0001, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 683.9375, "epoch": 0.18764568764568765, "grad_norm": 2.6044585704803467, "kl": 0.0821533203125, "learning_rate": 9.374514374514374e-07, "loss": 0.0001, "reward": 1.09375, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 784.53125, "epoch": 0.1888111888111888, "grad_norm": 0.8209207653999329, "kl": 0.0732421875, "learning_rate": 9.37062937062937e-07, "loss": 0.0001, "reward": 0.84375, "reward_std": 0.18600594997406006, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.71875, "rewards/score_task": 1.0, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 742.53125, "epoch": 0.18997668997668998, "grad_norm": 3.2099320888519287, "kl": 0.107666015625, "learning_rate": 9.366744366744367e-07, "loss": 0.0001, "reward": 1.25, "reward_std": 0.3514062538743019, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 821.65625, "epoch": 0.19114219114219114, "grad_norm": 2.0123395919799805, "kl": 0.080078125, "learning_rate": 9.362859362859363e-07, "loss": 0.0001, "reward": 1.375, "reward_std": 0.49938954412937164, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 0.9375, "rewards/score_task": 1.0, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 699.1875, "epoch": 0.19230769230769232, "grad_norm": 3.3524725437164307, "kl": 0.107177734375, "learning_rate": 9.358974358974359e-07, "loss": 0.0001, "reward": 1.28125, "reward_std": 0.35564958304166794, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 603.1875, "epoch": 0.19347319347319347, "grad_norm": 6.088827610015869, "kl": 0.1502685546875, "learning_rate": 9.355089355089355e-07, "loss": 0.0002, "reward": 1.375, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.375, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 815.5625, "epoch": 0.19463869463869463, "grad_norm": 5.630931854248047, "kl": 0.0831298828125, "learning_rate": 9.35120435120435e-07, "loss": 0.0001, "reward": 1.0625, "reward_std": 0.3745020925998688, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 779.40625, "epoch": 0.1958041958041958, "grad_norm": 0.9311386942863464, "kl": 0.08984375, "learning_rate": 9.347319347319347e-07, "loss": 0.0001, "reward": 0.59375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 631.46875, "epoch": 0.19696969696969696, "grad_norm": 3.2721848487854004, "kl": 0.1363525390625, "learning_rate": 9.343434343434343e-07, "loss": 0.0001, "reward": 1.4375, "reward_std": 0.4765502139925957, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 721.9375, "epoch": 0.19813519813519814, "grad_norm": 2.170353412628174, "kl": 0.090576171875, "learning_rate": 9.33954933954934e-07, "loss": 0.0001, "reward": 1.03125, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 668.6875, "epoch": 0.1993006993006993, "grad_norm": 2.677029848098755, "kl": 0.0970458984375, "learning_rate": 9.335664335664335e-07, "loss": 0.0001, "reward": 1.375, "reward_std": 0.4671337679028511, "rewards/accuracy_reward": 0.375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.6666666666666667, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 810.25, "epoch": 0.20046620046620048, "grad_norm": 4.08341121673584, "kl": 0.0955810546875, "learning_rate": 9.331779331779332e-07, "loss": 0.0001, "reward": 1.5, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 759.28125, "epoch": 0.20163170163170163, "grad_norm": 1.0255314111709595, "kl": 0.1055908203125, "learning_rate": 9.327894327894328e-07, "loss": 0.0001, "reward": 0.78125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 773.375, "epoch": 0.20279720279720279, "grad_norm": 2.5832104682922363, "kl": 0.117919921875, "learning_rate": 9.324009324009323e-07, "loss": 0.0001, "reward": 1.40625, "reward_std": 0.3608423173427582, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 828.84375, "epoch": 0.20396270396270397, "grad_norm": 2.846411943435669, "kl": 0.133056640625, "learning_rate": 9.320124320124319e-07, "loss": 0.0001, "reward": 1.53125, "reward_std": 0.4628904387354851, "rewards/accuracy_reward": 0.375, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 789.0, "epoch": 0.20512820512820512, "grad_norm": 0.9800822138786316, "kl": 0.1048583984375, "learning_rate": 9.316239316239316e-07, "loss": 0.0001, "reward": 0.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 699.46875, "epoch": 0.2062937062937063, "grad_norm": 2.5001745223999023, "kl": 0.145263671875, "learning_rate": 9.312354312354313e-07, "loss": 0.0001, "reward": 1.15625, "reward_std": 0.38816186785697937, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 794.09375, "epoch": 0.20745920745920746, "grad_norm": 1.990135908126831, "kl": 0.1396484375, "learning_rate": 9.308469308469308e-07, "loss": 0.0001, "reward": 1.3125, "reward_std": 0.5568660199642181, "rewards/accuracy_reward": 0.375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.9375, "rewards/score_task": 1.0, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 818.59375, "epoch": 0.20862470862470864, "grad_norm": 1.8339804410934448, "kl": 0.146484375, "learning_rate": 9.304584304584304e-07, "loss": 0.0001, "reward": 1.15625, "reward_std": 0.3779931291937828, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.6875, "rewards/score_task": 2.0, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 887.875, "epoch": 0.2097902097902098, "grad_norm": 3.872621774673462, "kl": 0.177734375, "learning_rate": 9.300699300699301e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.3650856465101242, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 615.5, "epoch": 0.21095571095571095, "grad_norm": 3.482492685317993, "kl": 0.19384765625, "learning_rate": 9.296814296814296e-07, "loss": 0.0002, "reward": 0.90625, "reward_std": 0.3061639815568924, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 1.5, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 805.40625, "epoch": 0.21212121212121213, "grad_norm": 2.1183865070343018, "kl": 0.14404296875, "learning_rate": 9.292929292929292e-07, "loss": 0.0001, "reward": 1.21875, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 731.15625, "epoch": 0.21328671328671328, "grad_norm": 2.3756191730499268, "kl": 0.16259765625, "learning_rate": 9.289044289044288e-07, "loss": 0.0002, "reward": 1.0625, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.09375, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 757.09375, "epoch": 0.21445221445221446, "grad_norm": 2.9915904998779297, "kl": 0.1490478515625, "learning_rate": 9.285159285159286e-07, "loss": 0.0001, "reward": 1.375, "reward_std": 0.4492306634783745, "rewards/accuracy_reward": 0.375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 764.65625, "epoch": 0.21561771561771562, "grad_norm": 1.2692840099334717, "kl": 0.1519775390625, "learning_rate": 9.281274281274281e-07, "loss": 0.0002, "reward": 0.875, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 890.34375, "epoch": 0.21678321678321677, "grad_norm": 8.458389282226562, "kl": 0.1630859375, "learning_rate": 9.277389277389277e-07, "loss": 0.0002, "reward": 1.375, "reward_std": 0.49022960662841797, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 861.75, "epoch": 0.21794871794871795, "grad_norm": 1.192478895187378, "kl": 0.161865234375, "learning_rate": 9.273504273504273e-07, "loss": 0.0002, "reward": 0.84375, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.75, "rewards/format_reward": 0.5, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 706.15625, "epoch": 0.2191142191142191, "grad_norm": 3.366788387298584, "kl": 0.1654052734375, "learning_rate": 9.269619269619269e-07, "loss": 0.0002, "reward": 1.375, "reward_std": 0.3945523276925087, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 940.8125, "epoch": 0.2202797202797203, "grad_norm": 3.9245548248291016, "kl": 0.150146484375, "learning_rate": 9.265734265734265e-07, "loss": 0.0002, "reward": 0.9375, "reward_std": 0.3471825420856476, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.0625, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 667.34375, "epoch": 0.22144522144522144, "grad_norm": 6.174204349517822, "kl": 0.186279296875, "learning_rate": 9.261849261849261e-07, "loss": 0.0002, "reward": 1.34375, "reward_std": 0.3608423173427582, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 783.625, "epoch": 0.22261072261072262, "grad_norm": 5.247681617736816, "kl": 0.1591796875, "learning_rate": 9.257964257964258e-07, "loss": 0.0002, "reward": 1.28125, "reward_std": 0.4628904387354851, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 726.21875, "epoch": 0.22377622377622378, "grad_norm": 3.3866488933563232, "kl": 0.19189453125, "learning_rate": 9.254079254079254e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.26726123690605164, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 772.9375, "epoch": 0.22494172494172493, "grad_norm": 2.0346479415893555, "kl": 0.16064453125, "learning_rate": 9.25019425019425e-07, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.3745020925998688, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 813.53125, "epoch": 0.2261072261072261, "grad_norm": 5.7331390380859375, "kl": 0.173828125, "learning_rate": 9.246309246309246e-07, "loss": 0.0002, "reward": 1.625, "reward_std": 0.4671337679028511, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 641.9375, "epoch": 0.22727272727272727, "grad_norm": 14.053421020507812, "kl": 0.21923828125, "learning_rate": 9.242424242424241e-07, "loss": 0.0002, "reward": 1.4375, "reward_std": 0.4492306634783745, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.34375, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 712.15625, "epoch": 0.22843822843822845, "grad_norm": 5.321450233459473, "kl": 0.171875, "learning_rate": 9.238539238539238e-07, "loss": 0.0002, "reward": 1.28125, "reward_std": 0.3198433741927147, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 711.96875, "epoch": 0.2296037296037296, "grad_norm": 3.7412779331207275, "kl": 0.21240234375, "learning_rate": 9.234654234654234e-07, "loss": 0.0002, "reward": 1.125, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 840.3125, "epoch": 0.23076923076923078, "grad_norm": 1.8248549699783325, "kl": 0.162841796875, "learning_rate": 9.230769230769231e-07, "loss": 0.0002, "reward": 1.0625, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 661.875, "epoch": 0.23193473193473194, "grad_norm": 6.382082939147949, "kl": 0.2158203125, "learning_rate": 9.226884226884226e-07, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.3787454217672348, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.34375, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 767.875, "epoch": 0.2331002331002331, "grad_norm": 4.214197635650635, "kl": 0.16845703125, "learning_rate": 9.222999222999223e-07, "loss": 0.0002, "reward": 0.875, "reward_std": 0.3104073107242584, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.71875, "rewards/score_task": 1.0, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 787.46875, "epoch": 0.23426573426573427, "grad_norm": 58.97422790527344, "kl": 0.15625, "learning_rate": 9.219114219114219e-07, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 754.1875, "epoch": 0.23543123543123542, "grad_norm": 3.1183454990386963, "kl": 0.1767578125, "learning_rate": 9.215229215229214e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.31539323925971985, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.96875, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 796.75, "epoch": 0.2365967365967366, "grad_norm": 2.1943016052246094, "kl": 0.268310546875, "learning_rate": 9.21134421134421e-07, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.5, "rewards/score_task": 2.0, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 697.9375, "epoch": 0.23776223776223776, "grad_norm": 6.757773399353027, "kl": 0.18603515625, "learning_rate": 9.207459207459207e-07, "loss": 0.0002, "reward": 1.125, "reward_std": 0.3924051970243454, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.28125, "rewards/comparison_task": 1.5, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 742.15625, "epoch": 0.23892773892773891, "grad_norm": 3.9283432960510254, "kl": 0.1591796875, "learning_rate": 9.203574203574204e-07, "loss": 0.0002, "reward": 1.5625, "reward_std": 0.48503687232732773, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 0.96875, "rewards/score_task": 2.0, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 915.71875, "epoch": 0.2400932400932401, "grad_norm": 9.934305191040039, "kl": 0.158935546875, "learning_rate": 9.199689199689199e-07, "loss": 0.0002, "reward": 0.96875, "reward_std": 0.3471629247069359, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 755.21875, "epoch": 0.24125874125874125, "grad_norm": 2.4902279376983643, "kl": 0.14453125, "learning_rate": 9.195804195804196e-07, "loss": 0.0001, "reward": 0.875, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 725.59375, "epoch": 0.24242424242424243, "grad_norm": 1.2978657484054565, "kl": 0.168701171875, "learning_rate": 9.191919191919192e-07, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.2587745785713196, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 713.21875, "epoch": 0.24358974358974358, "grad_norm": 1.6494033336639404, "kl": 0.16015625, "learning_rate": 9.188034188034187e-07, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.249358132481575, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.3333333333333333, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 905.1875, "epoch": 0.24475524475524477, "grad_norm": 1.5671844482421875, "kl": 0.150146484375, "learning_rate": 9.184149184149183e-07, "loss": 0.0002, "reward": 1.125, "reward_std": 0.43112075328826904, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.71875, "rewards/score_task": 1.0, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 802.03125, "epoch": 0.24592074592074592, "grad_norm": 14.042095184326172, "kl": 0.225830078125, "learning_rate": 9.180264180264181e-07, "loss": 0.0002, "reward": 0.875, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.75, "rewards/format_reward": 0.5, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 708.71875, "epoch": 0.24708624708624707, "grad_norm": 3.10853910446167, "kl": 0.182373046875, "learning_rate": 9.176379176379177e-07, "loss": 0.0002, "reward": 1.4375, "reward_std": 0.5166193693876266, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.96875, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 613.8125, "epoch": 0.24825174825174826, "grad_norm": 5.355035781860352, "kl": 0.216064453125, "learning_rate": 9.172494172494172e-07, "loss": 0.0002, "reward": 0.75, "reward_std": 0.26726123690605164, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.5, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 548.875, "epoch": 0.2494172494172494, "grad_norm": 17.018232345581055, "kl": 0.201416015625, "learning_rate": 9.168609168609168e-07, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.2587745785713196, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.3125, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.5, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 529.25, "epoch": 0.2505827505827506, "grad_norm": 7.224838733673096, "kl": 0.196044921875, "learning_rate": 9.164724164724165e-07, "loss": 0.0002, "reward": 1.375, "reward_std": 0.3745020925998688, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 630.9375, "epoch": 0.2517482517482518, "grad_norm": 0.9876692891120911, "kl": 0.179443359375, "learning_rate": 9.16083916083916e-07, "loss": 0.0002, "reward": 1.3125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.3333333333333333, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 800.03125, "epoch": 0.2529137529137529, "grad_norm": 2.762221097946167, "kl": 0.183837890625, "learning_rate": 9.156954156954156e-07, "loss": 0.0002, "reward": 1.625, "reward_std": 0.4671337679028511, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 731.75, "epoch": 0.2540792540792541, "grad_norm": 2.314610004425049, "kl": 0.172607421875, "learning_rate": 9.153069153069153e-07, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.3471629247069359, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 743.46875, "epoch": 0.25524475524475526, "grad_norm": 1.8412854671478271, "kl": 0.18896484375, "learning_rate": 9.14918414918415e-07, "loss": 0.0002, "reward": 1.09375, "reward_std": 0.3787454217672348, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 684.40625, "epoch": 0.2564102564102564, "grad_norm": 1.663041114807129, "kl": 0.184814453125, "learning_rate": 9.145299145299145e-07, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 661.15625, "epoch": 0.25757575757575757, "grad_norm": 3.2668890953063965, "kl": 0.203369140625, "learning_rate": 9.141414141414141e-07, "loss": 0.0002, "reward": 1.34375, "reward_std": 0.3198433741927147, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 831.15625, "epoch": 0.25874125874125875, "grad_norm": 2.2381465435028076, "kl": 0.165771484375, "learning_rate": 9.137529137529137e-07, "loss": 0.0002, "reward": 1.15625, "reward_std": 0.3061639815568924, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 726.96875, "epoch": 0.25990675990675993, "grad_norm": 2.8532071113586426, "kl": 0.195556640625, "learning_rate": 9.133644133644133e-07, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.3047097474336624, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.46875, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 777.875, "epoch": 0.26107226107226106, "grad_norm": 3.0078935623168945, "kl": 0.202392578125, "learning_rate": 9.129759129759129e-07, "loss": 0.0002, "reward": 1.53125, "reward_std": 0.5217924863100052, "rewards/accuracy_reward": 0.53125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 690.75, "epoch": 0.26223776223776224, "grad_norm": 3.7765004634857178, "kl": 0.1778564453125, "learning_rate": 9.125874125874126e-07, "loss": 0.0002, "reward": 1.5625, "reward_std": 0.3945523276925087, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.4375, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 840.78125, "epoch": 0.2634032634032634, "grad_norm": 1.7220113277435303, "kl": 0.180419921875, "learning_rate": 9.121989121989122e-07, "loss": 0.0002, "reward": 1.375, "reward_std": 0.4663814753293991, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 0.96875, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 854.25, "epoch": 0.26456876456876455, "grad_norm": 2.2034661769866943, "kl": 0.192138671875, "learning_rate": 9.118104118104118e-07, "loss": 0.0002, "reward": 1.28125, "reward_std": 0.3471629247069359, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 647.90625, "epoch": 0.26573426573426573, "grad_norm": 3.8688642978668213, "kl": 0.224853515625, "learning_rate": 9.114219114219114e-07, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.3924051970243454, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 852.4375, "epoch": 0.2668997668997669, "grad_norm": 1.8697229623794556, "kl": 0.15673828125, "learning_rate": 9.11033411033411e-07, "loss": 0.0002, "reward": 1.34375, "reward_std": 0.3471629247069359, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 627.46875, "epoch": 0.2680652680652681, "grad_norm": 4.749410629272461, "kl": 0.19384765625, "learning_rate": 9.106449106449105e-07, "loss": 0.0002, "reward": 1.40625, "reward_std": 0.482940673828125, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 674.25, "epoch": 0.2692307692307692, "grad_norm": 1.022552251815796, "kl": 0.207275390625, "learning_rate": 9.102564102564102e-07, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 584.0625, "epoch": 0.2703962703962704, "grad_norm": 10.142642974853516, "kl": 0.18701171875, "learning_rate": 9.098679098679099e-07, "loss": 0.0002, "reward": 1.59375, "reward_std": 0.4628904387354851, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.375, "rewards/comparison_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 629.46875, "epoch": 0.2715617715617716, "grad_norm": 2.097764015197754, "kl": 0.22705078125, "learning_rate": 9.094794094794095e-07, "loss": 0.0002, "reward": 1.65625, "reward_std": 0.38816186785697937, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 531.71875, "epoch": 0.2727272727272727, "grad_norm": 3.6296863555908203, "kl": 0.224609375, "learning_rate": 9.09090909090909e-07, "loss": 0.0002, "reward": 0.96875, "reward_std": 0.3061639815568924, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 582.96875, "epoch": 0.2738927738927739, "grad_norm": 8.096195220947266, "kl": 0.262451171875, "learning_rate": 9.087024087024087e-07, "loss": 0.0003, "reward": 0.625, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.5, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 584.5, "epoch": 0.27505827505827507, "grad_norm": 11.153486251831055, "kl": 0.212158203125, "learning_rate": 9.083139083139083e-07, "loss": 0.0002, "reward": 1.53125, "reward_std": 0.3198433741927147, "rewards/accuracy_reward": 0.53125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 500.9375, "epoch": 0.2762237762237762, "grad_norm": 3.004753828048706, "kl": 0.255126953125, "learning_rate": 9.079254079254078e-07, "loss": 0.0003, "reward": 1.53125, "reward_std": 0.5038893818855286, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 471.46875, "epoch": 0.2773892773892774, "grad_norm": 2.8283252716064453, "kl": 0.238525390625, "learning_rate": 9.075369075369074e-07, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 572.75, "epoch": 0.27855477855477856, "grad_norm": 8.493833541870117, "kl": 0.217041015625, "learning_rate": 9.071484071484072e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 442.1875, "epoch": 0.27972027972027974, "grad_norm": 1.4136253595352173, "kl": 0.2939453125, "learning_rate": 9.067599067599068e-07, "loss": 0.0003, "reward": 0.65625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 430.34375, "epoch": 0.28088578088578087, "grad_norm": 4.982465744018555, "kl": 0.269775390625, "learning_rate": 9.063714063714063e-07, "loss": 0.0003, "reward": 1.5, "reward_std": 0.4671337679028511, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 496.25, "epoch": 0.28205128205128205, "grad_norm": 3.5758814811706543, "kl": 0.2529296875, "learning_rate": 9.059829059829059e-07, "loss": 0.0003, "reward": 1.28125, "reward_std": 0.4628904387354851, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 460.75, "epoch": 0.28321678321678323, "grad_norm": 8.987251281738281, "kl": 0.31494140625, "learning_rate": 9.055944055944056e-07, "loss": 0.0003, "reward": 1.125, "reward_std": 0.3104073107242584, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.71875, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 405.96875, "epoch": 0.28438228438228436, "grad_norm": 9.033342361450195, "kl": 0.27783203125, "learning_rate": 9.052059052059051e-07, "loss": 0.0003, "reward": 1.625, "reward_std": 0.49022960662841797, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.34375, "rewards/comparison_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 387.03125, "epoch": 0.28554778554778554, "grad_norm": 2.5479629039764404, "kl": 0.3466796875, "learning_rate": 9.048174048174047e-07, "loss": 0.0003, "reward": 0.59375, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.5, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 345.46875, "epoch": 0.2867132867132867, "grad_norm": 4.82274866104126, "kl": 0.3779296875, "learning_rate": 9.044289044289045e-07, "loss": 0.0004, "reward": 1.40625, "reward_std": 0.4628904387354851, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.25, "rewards/format_reward": 1.0, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 310.90625, "epoch": 0.2878787878787879, "grad_norm": 5.044784069061279, "kl": 0.29638671875, "learning_rate": 9.040404040404041e-07, "loss": 0.0003, "reward": 1.09375, "reward_std": 0.3787454217672348, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 402.6875, "epoch": 0.289044289044289, "grad_norm": 2.803541421890259, "kl": 0.275146484375, "learning_rate": 9.036519036519036e-07, "loss": 0.0003, "reward": 1.0625, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 400.28125, "epoch": 0.2902097902097902, "grad_norm": 3.122346878051758, "kl": 0.302490234375, "learning_rate": 9.032634032634032e-07, "loss": 0.0003, "reward": 1.0, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 435.40625, "epoch": 0.2913752913752914, "grad_norm": 3.7915008068084717, "kl": 0.25830078125, "learning_rate": 9.028749028749029e-07, "loss": 0.0003, "reward": 1.34375, "reward_std": 0.4807935431599617, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 385.9375, "epoch": 0.2925407925407925, "grad_norm": 4.066257476806641, "kl": 0.30712890625, "learning_rate": 9.024864024864024e-07, "loss": 0.0003, "reward": 1.53125, "reward_std": 0.4628904387354851, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "step": 251 }, { "clip_ratio": 0.0, "completion_length": 432.125, "epoch": 0.2937062937062937, "grad_norm": 3.0087358951568604, "kl": 0.25537109375, "learning_rate": 9.02097902097902e-07, "loss": 0.0003, "reward": 1.25, "reward_std": 0.4261348247528076, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 252 }, { "clip_ratio": 0.0, "completion_length": 332.71875, "epoch": 0.2948717948717949, "grad_norm": 3.2638747692108154, "kl": 0.3291015625, "learning_rate": 9.017094017094017e-07, "loss": 0.0003, "reward": 1.46875, "reward_std": 0.3061639815568924, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 253 }, { "clip_ratio": 0.0, "completion_length": 385.1875, "epoch": 0.29603729603729606, "grad_norm": 22.57358741760254, "kl": 0.3193359375, "learning_rate": 9.013209013209014e-07, "loss": 0.0003, "reward": 1.1875, "reward_std": 0.3924051970243454, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.34375, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.71875, "step": 254 }, { "clip_ratio": 0.0, "completion_length": 490.125, "epoch": 0.2972027972027972, "grad_norm": 77.45325469970703, "kl": 0.337646484375, "learning_rate": 9.009324009324009e-07, "loss": 0.0003, "reward": 1.28125, "reward_std": 0.3198433741927147, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 393.125, "epoch": 0.29836829836829837, "grad_norm": 11.281222343444824, "kl": 0.29833984375, "learning_rate": 9.005439005439005e-07, "loss": 0.0003, "reward": 1.59375, "reward_std": 0.3787454217672348, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.40625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 256 }, { "clip_ratio": 0.0, "completion_length": 396.0625, "epoch": 0.29953379953379955, "grad_norm": 7.568979263305664, "kl": 0.267578125, "learning_rate": 9.001554001554001e-07, "loss": 0.0003, "reward": 0.90625, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 257 }, { "clip_ratio": 0.0, "completion_length": 424.53125, "epoch": 0.3006993006993007, "grad_norm": 1.6824454069137573, "kl": 0.28125, "learning_rate": 8.997668997668997e-07, "loss": 0.0003, "reward": 0.65625, "reward_std": 0.22201896458864212, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.75, "rewards/format_reward": 0.5, "step": 258 }, { "clip_ratio": 0.0, "completion_length": 470.21875, "epoch": 0.30186480186480186, "grad_norm": 5.5588788986206055, "kl": 0.24267578125, "learning_rate": 8.993783993783993e-07, "loss": 0.0002, "reward": 1.40625, "reward_std": 0.4807935431599617, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 259 }, { "clip_ratio": 0.0, "completion_length": 408.71875, "epoch": 0.30303030303030304, "grad_norm": 5.079070091247559, "kl": 0.22265625, "learning_rate": 8.98989898989899e-07, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 424.03125, "epoch": 0.3041958041958042, "grad_norm": 3.83410906791687, "kl": 0.294921875, "learning_rate": 8.986013986013986e-07, "loss": 0.0003, "reward": 1.40625, "reward_std": 0.5038893818855286, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "step": 261 }, { "clip_ratio": 0.0, "completion_length": 432.1875, "epoch": 0.30536130536130535, "grad_norm": 2.0396158695220947, "kl": 0.21875, "learning_rate": 8.982128982128982e-07, "loss": 0.0002, "reward": 1.375, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.3333333333333333, "step": 262 }, { "clip_ratio": 0.0, "completion_length": 502.21875, "epoch": 0.30652680652680653, "grad_norm": 1.6081359386444092, "kl": 0.28857421875, "learning_rate": 8.978243978243978e-07, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.2630179077386856, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.75, "rewards/format_reward": 0.5, "step": 263 }, { "clip_ratio": 0.0, "completion_length": 391.71875, "epoch": 0.3076923076923077, "grad_norm": 2.350064754486084, "kl": 0.25341796875, "learning_rate": 8.974358974358974e-07, "loss": 0.0003, "reward": 1.25, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 264 }, { "clip_ratio": 0.0, "completion_length": 269.46875, "epoch": 0.30885780885780884, "grad_norm": 6.780157089233398, "kl": 0.3251953125, "learning_rate": 8.970473970473969e-07, "loss": 0.0003, "reward": 1.40625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.40625, "rewards/comparison_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 421.125, "epoch": 0.31002331002331, "grad_norm": 4.236743450164795, "kl": 0.23681640625, "learning_rate": 8.966588966588966e-07, "loss": 0.0002, "reward": 1.3125, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.3125, "rewards/comparison_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 266 }, { "clip_ratio": 0.0, "completion_length": 575.5625, "epoch": 0.3111888111888112, "grad_norm": 3.4059455394744873, "kl": 0.26318359375, "learning_rate": 8.962703962703963e-07, "loss": 0.0003, "reward": 1.625, "reward_std": 0.3924051970243454, "rewards/accuracy_reward": 0.375, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "step": 267 }, { "clip_ratio": 0.0, "completion_length": 562.0, "epoch": 0.3123543123543124, "grad_norm": 5.649753570556641, "kl": 0.2470703125, "learning_rate": 8.958818958818959e-07, "loss": 0.0002, "reward": 0.96875, "reward_std": 0.2630179077386856, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 268 }, { "clip_ratio": 0.0, "completion_length": 412.125, "epoch": 0.3135198135198135, "grad_norm": 4.85097599029541, "kl": 0.2880859375, "learning_rate": 8.954933954933954e-07, "loss": 0.0003, "reward": 1.28125, "reward_std": 0.3787454217672348, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 269 }, { "clip_ratio": 0.0, "completion_length": 617.6875, "epoch": 0.3146853146853147, "grad_norm": 23.614322662353516, "kl": 0.25048828125, "learning_rate": 8.951048951048951e-07, "loss": 0.0003, "reward": 1.28125, "reward_std": 0.4397946000099182, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 387.25, "epoch": 0.31585081585081587, "grad_norm": 5.726500988006592, "kl": 0.32177734375, "learning_rate": 8.947163947163947e-07, "loss": 0.0003, "reward": 1.09375, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.34375, "rewards/comparison_task": 1.5, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "step": 271 }, { "clip_ratio": 0.0, "completion_length": 465.15625, "epoch": 0.317016317016317, "grad_norm": 4.745253562927246, "kl": 0.297607421875, "learning_rate": 8.943278943278942e-07, "loss": 0.0003, "reward": 1.71875, "reward_std": 0.40378158539533615, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.65625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.96875, "step": 272 }, { "clip_ratio": 0.0, "completion_length": 485.5, "epoch": 0.3181818181818182, "grad_norm": 1.5292056798934937, "kl": 0.267822265625, "learning_rate": 8.939393939393938e-07, "loss": 0.0003, "reward": 0.78125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.5, "rewards/score_task": 2.0, "step": 273 }, { "clip_ratio": 0.0, "completion_length": 407.09375, "epoch": 0.31934731934731936, "grad_norm": 2.5557804107666016, "kl": 0.272216796875, "learning_rate": 8.935508935508936e-07, "loss": 0.0003, "reward": 1.53125, "reward_std": 0.3471629247069359, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 274 }, { "clip_ratio": 0.0, "completion_length": 552.59375, "epoch": 0.32051282051282054, "grad_norm": 4.694098472595215, "kl": 0.257568359375, "learning_rate": 8.931623931623932e-07, "loss": 0.0003, "reward": 0.9375, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.75, "rewards/format_reward": 0.75, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 480.46875, "epoch": 0.32167832167832167, "grad_norm": 1.4271161556243896, "kl": 0.20751953125, "learning_rate": 8.927738927738927e-07, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 276 }, { "clip_ratio": 0.0, "completion_length": 513.4375, "epoch": 0.32284382284382285, "grad_norm": 11.405556678771973, "kl": 0.241943359375, "learning_rate": 8.923853923853923e-07, "loss": 0.0002, "reward": 1.375, "reward_std": 0.4355512708425522, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 277 }, { "clip_ratio": 0.0, "completion_length": 445.78125, "epoch": 0.32400932400932403, "grad_norm": 14.21035099029541, "kl": 0.249755859375, "learning_rate": 8.91996891996892e-07, "loss": 0.0002, "reward": 1.15625, "reward_std": 0.3061639815568924, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 278 }, { "clip_ratio": 0.0, "completion_length": 610.625, "epoch": 0.32517482517482516, "grad_norm": 2.0938267707824707, "kl": 0.208251953125, "learning_rate": 8.916083916083915e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.3514062538743019, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 279 }, { "clip_ratio": 0.0, "completion_length": 529.09375, "epoch": 0.32634032634032634, "grad_norm": 2.7483129501342773, "kl": 0.22021484375, "learning_rate": 8.912198912198911e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.5081327110528946, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 557.0, "epoch": 0.3275058275058275, "grad_norm": 3.816436529159546, "kl": 0.547119140625, "learning_rate": 8.908313908313908e-07, "loss": 0.0005, "reward": 1.375, "reward_std": 0.4765502139925957, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "step": 281 }, { "clip_ratio": 0.0, "completion_length": 807.1875, "epoch": 0.32867132867132864, "grad_norm": 2.400531530380249, "kl": 0.1883544921875, "learning_rate": 8.904428904428905e-07, "loss": 0.0002, "reward": 0.9375, "reward_std": 0.249358132481575, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.75, "rewards/format_reward": 0.75, "step": 282 }, { "clip_ratio": 0.0, "completion_length": 627.375, "epoch": 0.3298368298368298, "grad_norm": 2.8331358432769775, "kl": 0.23193359375, "learning_rate": 8.9005439005439e-07, "loss": 0.0002, "reward": 0.9375, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 283 }, { "clip_ratio": 0.0, "completion_length": 593.15625, "epoch": 0.331002331002331, "grad_norm": 2.5468549728393555, "kl": 0.185546875, "learning_rate": 8.896658896658896e-07, "loss": 0.0002, "reward": 1.40625, "reward_std": 0.4218914955854416, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 284 }, { "clip_ratio": 0.0, "completion_length": 857.40625, "epoch": 0.3321678321678322, "grad_norm": 2.73610520362854, "kl": 0.169189453125, "learning_rate": 8.892773892773892e-07, "loss": 0.0002, "reward": 1.28125, "reward_std": 0.4628904387354851, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 687.9375, "epoch": 0.3333333333333333, "grad_norm": 7.19685173034668, "kl": 0.1826171875, "learning_rate": 8.888888888888888e-07, "loss": 0.0002, "reward": 1.4375, "reward_std": 0.4492306634783745, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 286 }, { "clip_ratio": 0.0, "completion_length": 568.75, "epoch": 0.3344988344988345, "grad_norm": 7.850941181182861, "kl": 0.244140625, "learning_rate": 8.885003885003884e-07, "loss": 0.0002, "reward": 0.875, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.5, "rewards/score_task": 2.0, "step": 287 }, { "clip_ratio": 0.0, "completion_length": 740.59375, "epoch": 0.3356643356643357, "grad_norm": 2.993661880493164, "kl": 0.310302734375, "learning_rate": 8.881118881118881e-07, "loss": 0.0003, "reward": 1.25, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 288 }, { "clip_ratio": 0.0, "completion_length": 583.8125, "epoch": 0.3368298368298368, "grad_norm": 5.473693370819092, "kl": 0.178955078125, "learning_rate": 8.877233877233878e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.3333333333333333, "step": 289 }, { "clip_ratio": 0.0, "completion_length": 563.75, "epoch": 0.337995337995338, "grad_norm": 2.651963472366333, "kl": 0.20703125, "learning_rate": 8.873348873348873e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.3650856465101242, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 666.21875, "epoch": 0.33916083916083917, "grad_norm": 3.145451068878174, "kl": 0.19677734375, "learning_rate": 8.869463869463869e-07, "loss": 0.0002, "reward": 1.53125, "reward_std": 0.3377464786171913, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.40625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 291 }, { "clip_ratio": 0.0, "completion_length": 772.15625, "epoch": 0.34032634032634035, "grad_norm": 1.9458675384521484, "kl": 0.18798828125, "learning_rate": 8.865578865578865e-07, "loss": 0.0002, "reward": 1.15625, "reward_std": 0.3377464786171913, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 292 }, { "clip_ratio": 0.0, "completion_length": 875.46875, "epoch": 0.3414918414918415, "grad_norm": 15.95018482208252, "kl": 0.18310546875, "learning_rate": 8.86169386169386e-07, "loss": 0.0002, "reward": 1.09375, "reward_std": 0.3471629247069359, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 293 }, { "clip_ratio": 0.0, "completion_length": 947.6875, "epoch": 0.34265734265734266, "grad_norm": 3.5490031242370605, "kl": 0.1707763671875, "learning_rate": 8.857808857808857e-07, "loss": 0.0002, "reward": 0.875, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.6875, "step": 294 }, { "clip_ratio": 0.0, "completion_length": 696.875, "epoch": 0.34382284382284384, "grad_norm": 1.9214801788330078, "kl": 0.221923828125, "learning_rate": 8.853923853923854e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.32261285185813904, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.40625, "rewards/comparison_task": 1.5, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.71875, "step": 295 }, { "clip_ratio": 0.0, "completion_length": 739.59375, "epoch": 0.34498834498834496, "grad_norm": 1.4986287355422974, "kl": 0.184814453125, "learning_rate": 8.85003885003885e-07, "loss": 0.0002, "reward": 1.0625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 296 }, { "clip_ratio": 0.0, "completion_length": 695.71875, "epoch": 0.34615384615384615, "grad_norm": 2.525725841522217, "kl": 0.1904296875, "learning_rate": 8.846153846153846e-07, "loss": 0.0002, "reward": 0.96875, "reward_std": 0.318369522690773, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.71875, "rewards/score_task": 1.0, "step": 297 }, { "clip_ratio": 0.0, "completion_length": 794.625, "epoch": 0.3473193473193473, "grad_norm": 3.0795788764953613, "kl": 0.1767578125, "learning_rate": 8.842268842268842e-07, "loss": 0.0002, "reward": 1.15625, "reward_std": 0.38816186785697937, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 298 }, { "clip_ratio": 0.0, "completion_length": 789.875, "epoch": 0.3484848484848485, "grad_norm": 1.9947104454040527, "kl": 0.167236328125, "learning_rate": 8.838383838383838e-07, "loss": 0.0002, "reward": 1.125, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 299 }, { "clip_ratio": 0.0, "completion_length": 642.8125, "epoch": 0.34965034965034963, "grad_norm": 2.835761070251465, "kl": 0.1826171875, "learning_rate": 8.834498834498833e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.3745020925998688, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.09375, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 805.0625, "epoch": 0.3508158508158508, "grad_norm": 10.479671478271484, "kl": 0.1728515625, "learning_rate": 8.83061383061383e-07, "loss": 0.0002, "reward": 1.09375, "reward_std": 0.3377464786171913, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 301 }, { "clip_ratio": 0.0, "completion_length": 832.5, "epoch": 0.351981351981352, "grad_norm": 1.8571937084197998, "kl": 0.1689453125, "learning_rate": 8.826728826728827e-07, "loss": 0.0002, "reward": 0.875, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 302 }, { "clip_ratio": 0.0, "completion_length": 636.09375, "epoch": 0.3531468531468531, "grad_norm": 3.13515305519104, "kl": 0.2001953125, "learning_rate": 8.822843822843823e-07, "loss": 0.0002, "reward": 1.28125, "reward_std": 0.3930980935692787, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.34375, "rewards/comparison_task": 1.5, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.71875, "rewards/score_task": 2.0, "step": 303 }, { "clip_ratio": 0.0, "completion_length": 822.90625, "epoch": 0.3543123543123543, "grad_norm": 1.6403309106826782, "kl": 0.171142578125, "learning_rate": 8.818958818958818e-07, "loss": 0.0002, "reward": 1.3125, "reward_std": 0.3745020925998688, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 304 }, { "clip_ratio": 0.0, "completion_length": 465.28125, "epoch": 0.3554778554778555, "grad_norm": 15.041519165039062, "kl": 0.22412109375, "learning_rate": 8.815073815073815e-07, "loss": 0.0002, "reward": 1.59375, "reward_std": 0.3966485261917114, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.3333333333333333, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 821.9375, "epoch": 0.35664335664335667, "grad_norm": 2.403963565826416, "kl": 0.184814453125, "learning_rate": 8.811188811188811e-07, "loss": 0.0002, "reward": 1.3125, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 306 }, { "clip_ratio": 0.0, "completion_length": 990.8125, "epoch": 0.3578088578088578, "grad_norm": 2.9409210681915283, "kl": 0.1689453125, "learning_rate": 8.807303807303806e-07, "loss": 0.0002, "reward": 1.0625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 307 }, { "clip_ratio": 0.0, "completion_length": 940.65625, "epoch": 0.358974358974359, "grad_norm": 1.9722038507461548, "kl": 0.169189453125, "learning_rate": 8.803418803418803e-07, "loss": 0.0002, "reward": 1.28125, "reward_std": 0.3787454217672348, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 308 }, { "clip_ratio": 0.0, "completion_length": 800.09375, "epoch": 0.36013986013986016, "grad_norm": 5.3638505935668945, "kl": 0.20263671875, "learning_rate": 8.7995337995338e-07, "loss": 0.0002, "reward": 1.125, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.71875, "rewards/score_task": 1.0, "step": 309 }, { "clip_ratio": 0.0, "completion_length": 850.34375, "epoch": 0.3613053613053613, "grad_norm": 2.345106601715088, "kl": 0.195556640625, "learning_rate": 8.795648795648796e-07, "loss": 0.0002, "reward": 1.46875, "reward_std": 0.5684492141008377, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.9375, "rewards/score_task": 2.0, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 720.71875, "epoch": 0.36247086247086246, "grad_norm": 4.158276557922363, "kl": 0.1953125, "learning_rate": 8.791763791763791e-07, "loss": 0.0002, "reward": 1.125, "reward_std": 0.3104073107242584, "rewards/accuracy_reward": 0.375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 311 }, { "clip_ratio": 0.0, "completion_length": 645.875, "epoch": 0.36363636363636365, "grad_norm": 5.5590667724609375, "kl": 0.211181640625, "learning_rate": 8.787878787878787e-07, "loss": 0.0002, "reward": 1.65625, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.65625, "rewards/comparison_task": 1.6666666666666667, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "step": 312 }, { "clip_ratio": 0.0, "completion_length": 798.21875, "epoch": 0.36480186480186483, "grad_norm": 6.077655792236328, "kl": 0.218505859375, "learning_rate": 8.783993783993784e-07, "loss": 0.0002, "reward": 1.3125, "reward_std": 0.3643333539366722, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 313 }, { "clip_ratio": 0.0, "completion_length": 460.96875, "epoch": 0.36596736596736595, "grad_norm": 2.4416863918304443, "kl": 0.234375, "learning_rate": 8.780108780108779e-07, "loss": 0.0002, "reward": 1.28125, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 314 }, { "clip_ratio": 0.0, "completion_length": 959.28125, "epoch": 0.36713286713286714, "grad_norm": 2.5748887062072754, "kl": 0.166748046875, "learning_rate": 8.776223776223776e-07, "loss": 0.0002, "reward": 0.875, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 779.65625, "epoch": 0.3682983682983683, "grad_norm": 1.7845256328582764, "kl": 0.187255859375, "learning_rate": 8.772338772338772e-07, "loss": 0.0002, "reward": 1.125, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 316 }, { "clip_ratio": 0.0, "completion_length": 929.78125, "epoch": 0.36946386946386944, "grad_norm": 1.241930603981018, "kl": 0.171875, "learning_rate": 8.768453768453769e-07, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.5, "rewards/score_task": 2.0, "step": 317 }, { "clip_ratio": 0.0, "completion_length": 952.40625, "epoch": 0.3706293706293706, "grad_norm": 1.033103346824646, "kl": 0.164794921875, "learning_rate": 8.764568764568764e-07, "loss": 0.0002, "reward": 0.59375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 318 }, { "clip_ratio": 0.0, "completion_length": 828.21875, "epoch": 0.3717948717948718, "grad_norm": 8.930593490600586, "kl": 0.1768798828125, "learning_rate": 8.76068376068376e-07, "loss": 0.0002, "reward": 1.625, "reward_std": 0.3945523276925087, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 0.96875, "rewards/score_task": 2.0, "step": 319 }, { "clip_ratio": 0.0, "completion_length": 1146.34375, "epoch": 0.372960372960373, "grad_norm": 0.9499279260635376, "kl": 0.159423828125, "learning_rate": 8.756798756798756e-07, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.22201896458864212, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.46875, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 830.21875, "epoch": 0.3741258741258741, "grad_norm": 1.2103443145751953, "kl": 0.1422119140625, "learning_rate": 8.752913752913752e-07, "loss": 0.0001, "reward": 1.3125, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 0.96875, "rewards/score_task": 1.5, "step": 321 }, { "clip_ratio": 0.0, "completion_length": 859.53125, "epoch": 0.3752913752913753, "grad_norm": 2.514634370803833, "kl": 0.18994140625, "learning_rate": 8.749028749028749e-07, "loss": 0.0002, "reward": 1.53125, "reward_std": 0.3377464786171913, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "step": 322 }, { "clip_ratio": 0.0, "completion_length": 569.78125, "epoch": 0.3764568764568765, "grad_norm": 0.011458951979875565, "kl": 0.1226806640625, "learning_rate": 8.745143745143745e-07, "loss": 0.0001, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 323 }, { "clip_ratio": 0.0, "completion_length": 945.875, "epoch": 0.3776223776223776, "grad_norm": 0.010054988786578178, "kl": 0.16259765625, "learning_rate": 8.741258741258741e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.25, "rewards/score_task": 1.0, "step": 324 }, { "clip_ratio": 0.0, "completion_length": 1057.59375, "epoch": 0.3787878787878788, "grad_norm": 1.5358918905258179, "kl": 0.169189453125, "learning_rate": 8.737373737373737e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.3745020925998688, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 715.46875, "epoch": 0.37995337995337997, "grad_norm": 4.290951251983643, "kl": 0.17138671875, "learning_rate": 8.733488733488733e-07, "loss": 0.0002, "reward": 1.375, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 326 }, { "clip_ratio": 0.0, "completion_length": 914.9375, "epoch": 0.3811188811188811, "grad_norm": 1.3849544525146484, "kl": 0.178466796875, "learning_rate": 8.729603729603729e-07, "loss": 0.0002, "reward": 1.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "step": 327 }, { "clip_ratio": 0.0, "completion_length": 888.5, "epoch": 0.3822843822843823, "grad_norm": 0.014905408024787903, "kl": 0.144287109375, "learning_rate": 8.725718725718724e-07, "loss": 0.0001, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 328 }, { "clip_ratio": 0.0, "completion_length": 963.0625, "epoch": 0.38344988344988346, "grad_norm": 5.46189546585083, "kl": 0.13134765625, "learning_rate": 8.721833721833722e-07, "loss": 0.0001, "reward": 1.21875, "reward_std": 0.3061639815568924, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 329 }, { "clip_ratio": 0.0, "completion_length": 709.625, "epoch": 0.38461538461538464, "grad_norm": 2.8524441719055176, "kl": 0.1953125, "learning_rate": 8.717948717948718e-07, "loss": 0.0002, "reward": 1.40625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.46875, "rewards/comparison_task": 1.5, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 737.09375, "epoch": 0.38578088578088576, "grad_norm": 1.8304919004440308, "kl": 0.15380859375, "learning_rate": 8.714063714063714e-07, "loss": 0.0002, "reward": 1.125, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 331 }, { "clip_ratio": 0.0, "completion_length": 755.40625, "epoch": 0.38694638694638694, "grad_norm": 26.441204071044922, "kl": 0.16064453125, "learning_rate": 8.710178710178709e-07, "loss": 0.0002, "reward": 1.28125, "reward_std": 0.3198433741927147, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 332 }, { "clip_ratio": 0.0, "completion_length": 926.5, "epoch": 0.3881118881118881, "grad_norm": 1.435901403427124, "kl": 0.2060546875, "learning_rate": 8.706293706293706e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.32261285185813904, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.71875, "rewards/score_task": 1.0, "step": 333 }, { "clip_ratio": 0.0, "completion_length": 936.8125, "epoch": 0.38927738927738925, "grad_norm": 1.9241125583648682, "kl": 0.1201171875, "learning_rate": 8.702408702408702e-07, "loss": 0.0001, "reward": 1.25, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 334 }, { "clip_ratio": 0.0, "completion_length": 707.8125, "epoch": 0.39044289044289043, "grad_norm": 6.85250997543335, "kl": 0.1524658203125, "learning_rate": 8.698523698523697e-07, "loss": 0.0002, "reward": 0.96875, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 742.9375, "epoch": 0.3916083916083916, "grad_norm": 0.8466745615005493, "kl": 0.15087890625, "learning_rate": 8.694638694638695e-07, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 336 }, { "clip_ratio": 0.0, "completion_length": 762.125, "epoch": 0.3927738927738928, "grad_norm": 2.6692888736724854, "kl": 0.1551513671875, "learning_rate": 8.690753690753691e-07, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 337 }, { "clip_ratio": 0.0, "completion_length": 756.625, "epoch": 0.3939393939393939, "grad_norm": 0.01109327282756567, "kl": 0.1485595703125, "learning_rate": 8.686868686868687e-07, "loss": 0.0001, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 338 }, { "clip_ratio": 0.0, "completion_length": 725.625, "epoch": 0.3951048951048951, "grad_norm": 1.6078096628189087, "kl": 0.15625, "learning_rate": 8.682983682983682e-07, "loss": 0.0002, "reward": 1.46875, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 339 }, { "clip_ratio": 0.0, "completion_length": 699.71875, "epoch": 0.3962703962703963, "grad_norm": 3.3888585567474365, "kl": 0.1363525390625, "learning_rate": 8.679098679098679e-07, "loss": 0.0001, "reward": 1.34375, "reward_std": 0.3061639815568924, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 807.34375, "epoch": 0.3974358974358974, "grad_norm": 1.1469210386276245, "kl": 0.122314453125, "learning_rate": 8.675213675213675e-07, "loss": 0.0001, "reward": 1.03125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 341 }, { "clip_ratio": 0.0, "completion_length": 761.25, "epoch": 0.3986013986013986, "grad_norm": 0.8117239475250244, "kl": 0.124755859375, "learning_rate": 8.67132867132867e-07, "loss": 0.0001, "reward": 0.90625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 342 }, { "clip_ratio": 0.0, "completion_length": 797.9375, "epoch": 0.3997668997668998, "grad_norm": 1.9287629127502441, "kl": 0.130126953125, "learning_rate": 8.667443667443667e-07, "loss": 0.0001, "reward": 1.0, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 343 }, { "clip_ratio": 0.0, "completion_length": 977.96875, "epoch": 0.40093240093240096, "grad_norm": 1.7311441898345947, "kl": 0.1153564453125, "learning_rate": 8.663558663558664e-07, "loss": 0.0001, "reward": 1.34375, "reward_std": 0.4355708882212639, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "step": 344 }, { "clip_ratio": 0.0, "completion_length": 710.9375, "epoch": 0.4020979020979021, "grad_norm": 1.530247688293457, "kl": 0.137939453125, "learning_rate": 8.65967365967366e-07, "loss": 0.0001, "reward": 1.5, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 907.84375, "epoch": 0.40326340326340326, "grad_norm": 19.040048599243164, "kl": 0.109375, "learning_rate": 8.655788655788655e-07, "loss": 0.0001, "reward": 1.34375, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 346 }, { "clip_ratio": 0.0, "completion_length": 720.1875, "epoch": 0.40442890442890445, "grad_norm": 0.7886906862258911, "kl": 0.118896484375, "learning_rate": 8.651903651903651e-07, "loss": 0.0001, "reward": 1.3125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 347 }, { "clip_ratio": 0.0, "completion_length": 776.78125, "epoch": 0.40559440559440557, "grad_norm": 0.717483639717102, "kl": 0.100830078125, "learning_rate": 8.648018648018648e-07, "loss": 0.0001, "reward": 1.40625, "reward_std": 0.18600594997406006, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.3333333333333333, "step": 348 }, { "clip_ratio": 0.0, "completion_length": 656.28125, "epoch": 0.40675990675990675, "grad_norm": 2.9993555545806885, "kl": 0.1575927734375, "learning_rate": 8.644133644133643e-07, "loss": 0.0002, "reward": 1.375, "reward_std": 0.43112075328826904, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.34375, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 349 }, { "clip_ratio": 0.0, "completion_length": 615.84375, "epoch": 0.40792540792540793, "grad_norm": 3.9150185585021973, "kl": 0.1627197265625, "learning_rate": 8.64024864024864e-07, "loss": 0.0002, "reward": 1.15625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.40625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 844.75, "epoch": 0.4090909090909091, "grad_norm": 1.3224632740020752, "kl": 0.1396484375, "learning_rate": 8.636363636363636e-07, "loss": 0.0001, "reward": 0.84375, "reward_std": 0.18600594997406006, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.71875, "step": 351 }, { "clip_ratio": 0.0, "completion_length": 689.40625, "epoch": 0.41025641025641024, "grad_norm": 1.1940010786056519, "kl": 0.1370849609375, "learning_rate": 8.632478632478633e-07, "loss": 0.0001, "reward": 1.40625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 352 }, { "clip_ratio": 0.0, "completion_length": 753.0625, "epoch": 0.4114219114219114, "grad_norm": 0.7212651371955872, "kl": 0.137451171875, "learning_rate": 8.628593628593628e-07, "loss": 0.0001, "reward": 1.09375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 353 }, { "clip_ratio": 0.0, "completion_length": 635.09375, "epoch": 0.4125874125874126, "grad_norm": 11.82548999786377, "kl": 0.143310546875, "learning_rate": 8.624708624708624e-07, "loss": 0.0001, "reward": 1.6875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 354 }, { "clip_ratio": 0.0, "completion_length": 878.5625, "epoch": 0.41375291375291373, "grad_norm": 1.6259218454360962, "kl": 0.135986328125, "learning_rate": 8.62082362082362e-07, "loss": 0.0001, "reward": 1.1875, "reward_std": 0.3471825420856476, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 791.8125, "epoch": 0.4149184149184149, "grad_norm": 1.6451737880706787, "kl": 0.1229248046875, "learning_rate": 8.616938616938616e-07, "loss": 0.0001, "reward": 1.5, "reward_std": 0.26726123690605164, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 356 }, { "clip_ratio": 0.0, "completion_length": 735.40625, "epoch": 0.4160839160839161, "grad_norm": 1.6540080308914185, "kl": 0.1275634765625, "learning_rate": 8.613053613053613e-07, "loss": 0.0001, "reward": 1.09375, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 357 }, { "clip_ratio": 0.0, "completion_length": 794.34375, "epoch": 0.4172494172494173, "grad_norm": 7.60220193862915, "kl": 0.136474609375, "learning_rate": 8.609168609168609e-07, "loss": 0.0001, "reward": 1.28125, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 358 }, { "clip_ratio": 0.0, "completion_length": 564.78125, "epoch": 0.4184149184149184, "grad_norm": 3.1565353870391846, "kl": 0.1669921875, "learning_rate": 8.605283605283605e-07, "loss": 0.0002, "reward": 1.625, "reward_std": 0.3104073107242584, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 359 }, { "clip_ratio": 0.0, "completion_length": 860.53125, "epoch": 0.4195804195804196, "grad_norm": 1.675962209701538, "kl": 0.1129150390625, "learning_rate": 8.601398601398601e-07, "loss": 0.0001, "reward": 0.9375, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 759.25, "epoch": 0.42074592074592077, "grad_norm": 2.261608839035034, "kl": 0.1143798828125, "learning_rate": 8.597513597513597e-07, "loss": 0.0001, "reward": 1.0, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 361 }, { "clip_ratio": 0.0, "completion_length": 695.1875, "epoch": 0.4219114219114219, "grad_norm": 8.048891067504883, "kl": 0.1490478515625, "learning_rate": 8.593628593628593e-07, "loss": 0.0001, "reward": 1.625, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 0.96875, "rewards/score_task": 2.0, "step": 362 }, { "clip_ratio": 0.0, "completion_length": 731.90625, "epoch": 0.4230769230769231, "grad_norm": 1.7958781719207764, "kl": 0.1103515625, "learning_rate": 8.589743589743588e-07, "loss": 0.0001, "reward": 1.40625, "reward_std": 0.3987956568598747, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 363 }, { "clip_ratio": 0.0, "completion_length": 697.09375, "epoch": 0.42424242424242425, "grad_norm": 0.7771614193916321, "kl": 0.1048583984375, "learning_rate": 8.585858585858586e-07, "loss": 0.0001, "reward": 0.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 364 }, { "clip_ratio": 0.0, "completion_length": 663.34375, "epoch": 0.4254079254079254, "grad_norm": 0.00754990940913558, "kl": 0.109619140625, "learning_rate": 8.581973581973582e-07, "loss": 0.0001, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.3333333333333333, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 745.03125, "epoch": 0.42657342657342656, "grad_norm": 1.6991863250732422, "kl": 0.1468505859375, "learning_rate": 8.578088578088578e-07, "loss": 0.0001, "reward": 1.0, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 366 }, { "clip_ratio": 0.0, "completion_length": 444.5, "epoch": 0.42773892773892774, "grad_norm": 4.470001220703125, "kl": 0.217529296875, "learning_rate": 8.574203574203573e-07, "loss": 0.0002, "reward": 1.625, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.375, "rewards/comparison_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 367 }, { "clip_ratio": 0.0, "completion_length": 658.875, "epoch": 0.4289044289044289, "grad_norm": 3.65251088142395, "kl": 0.17431640625, "learning_rate": 8.57031857031857e-07, "loss": 0.0002, "reward": 1.09375, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.34375, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 368 }, { "clip_ratio": 0.0, "completion_length": 584.0, "epoch": 0.43006993006993005, "grad_norm": 3.5328567028045654, "kl": 0.1400146484375, "learning_rate": 8.566433566433566e-07, "loss": 0.0001, "reward": 1.4375, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 369 }, { "clip_ratio": 0.0, "completion_length": 854.15625, "epoch": 0.43123543123543123, "grad_norm": 3.8627572059631348, "kl": 0.12548828125, "learning_rate": 8.562548562548561e-07, "loss": 0.0001, "reward": 1.6875, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 817.21875, "epoch": 0.4324009324009324, "grad_norm": 1.3044227361679077, "kl": 0.1485595703125, "learning_rate": 8.558663558663558e-07, "loss": 0.0001, "reward": 1.15625, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 371 }, { "clip_ratio": 0.0, "completion_length": 660.6875, "epoch": 0.43356643356643354, "grad_norm": 2.1386237144470215, "kl": 0.1375732421875, "learning_rate": 8.554778554778555e-07, "loss": 0.0001, "reward": 1.25, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 372 }, { "clip_ratio": 0.0, "completion_length": 787.875, "epoch": 0.4347319347319347, "grad_norm": 2.2214221954345703, "kl": 0.1380615234375, "learning_rate": 8.550893550893551e-07, "loss": 0.0001, "reward": 1.4375, "reward_std": 0.4261348247528076, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 373 }, { "clip_ratio": 0.0, "completion_length": 873.65625, "epoch": 0.4358974358974359, "grad_norm": 0.790764570236206, "kl": 0.1112060546875, "learning_rate": 8.547008547008546e-07, "loss": 0.0001, "reward": 0.8125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 374 }, { "clip_ratio": 0.0, "completion_length": 813.21875, "epoch": 0.4370629370629371, "grad_norm": 0.5710849761962891, "kl": 0.119384765625, "learning_rate": 8.543123543123542e-07, "loss": 0.0001, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 611.9375, "epoch": 0.4382284382284382, "grad_norm": 7.723477363586426, "kl": 0.1767578125, "learning_rate": 8.539238539238539e-07, "loss": 0.0002, "reward": 1.65625, "reward_std": 0.3198433741927147, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.375, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "step": 376 }, { "clip_ratio": 0.0, "completion_length": 668.71875, "epoch": 0.4393939393939394, "grad_norm": 2.188457727432251, "kl": 0.17529296875, "learning_rate": 8.535353535353534e-07, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 377 }, { "clip_ratio": 0.0, "completion_length": 704.53125, "epoch": 0.4405594405594406, "grad_norm": 3.059499979019165, "kl": 0.146728515625, "learning_rate": 8.531468531468531e-07, "loss": 0.0001, "reward": 1.375, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 378 }, { "clip_ratio": 0.0, "completion_length": 947.0, "epoch": 0.4417249417249417, "grad_norm": 2.0695106983184814, "kl": 0.1156005859375, "learning_rate": 8.527583527583528e-07, "loss": 0.0001, "reward": 1.46875, "reward_std": 0.4397946000099182, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.25, "rewards/format_reward": 1.0, "step": 379 }, { "clip_ratio": 0.0, "completion_length": 752.40625, "epoch": 0.4428904428904429, "grad_norm": 3.6421706676483154, "kl": 0.112060546875, "learning_rate": 8.523698523698524e-07, "loss": 0.0001, "reward": 1.34375, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 595.21875, "epoch": 0.44405594405594406, "grad_norm": 7.274472713470459, "kl": 0.1505126953125, "learning_rate": 8.519813519813519e-07, "loss": 0.0002, "reward": 1.3125, "reward_std": 0.3745020925998688, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 381 }, { "clip_ratio": 0.0, "completion_length": 930.8125, "epoch": 0.44522144522144524, "grad_norm": 1.2528934478759766, "kl": 0.1121826171875, "learning_rate": 8.515928515928515e-07, "loss": 0.0001, "reward": 1.21875, "reward_std": 0.3377464786171913, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 382 }, { "clip_ratio": 0.0, "completion_length": 830.9375, "epoch": 0.44638694638694637, "grad_norm": 2.2833731174468994, "kl": 0.1290283203125, "learning_rate": 8.512043512043512e-07, "loss": 0.0001, "reward": 1.3125, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "step": 383 }, { "clip_ratio": 0.0, "completion_length": 708.03125, "epoch": 0.44755244755244755, "grad_norm": 4.375660419464111, "kl": 0.1553955078125, "learning_rate": 8.508158508158507e-07, "loss": 0.0002, "reward": 1.28125, "reward_std": 0.3808925524353981, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 384 }, { "clip_ratio": 0.0, "completion_length": 900.875, "epoch": 0.44871794871794873, "grad_norm": 1.1461491584777832, "kl": 0.11767578125, "learning_rate": 8.504273504273504e-07, "loss": 0.0001, "reward": 0.9375, "reward_std": 0.2587745785713196, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.75, "rewards/format_reward": 0.75, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 571.78125, "epoch": 0.44988344988344986, "grad_norm": 3.541147232055664, "kl": 0.1856689453125, "learning_rate": 8.5003885003885e-07, "loss": 0.0002, "reward": 1.625, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.40625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 386 }, { "clip_ratio": 0.0, "completion_length": 674.125, "epoch": 0.45104895104895104, "grad_norm": 3.2181789875030518, "kl": 0.1568603515625, "learning_rate": 8.496503496503497e-07, "loss": 0.0002, "reward": 1.625, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 387 }, { "clip_ratio": 0.0, "completion_length": 800.84375, "epoch": 0.4522144522144522, "grad_norm": 7.698366165161133, "kl": 0.1513671875, "learning_rate": 8.492618492618492e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.2587745785713196, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 388 }, { "clip_ratio": 0.0, "completion_length": 794.03125, "epoch": 0.4533799533799534, "grad_norm": 2.163395643234253, "kl": 0.147216796875, "learning_rate": 8.488733488733488e-07, "loss": 0.0001, "reward": 1.46875, "reward_std": 0.3787454217672348, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "step": 389 }, { "clip_ratio": 0.0, "completion_length": 748.125, "epoch": 0.45454545454545453, "grad_norm": 1.9166104793548584, "kl": 0.13818359375, "learning_rate": 8.484848484848484e-07, "loss": 0.0001, "reward": 1.71875, "reward_std": 0.4397946000099182, "rewards/accuracy_reward": 0.53125, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 0.96875, "rewards/score_task": 2.0, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 806.5, "epoch": 0.4557109557109557, "grad_norm": 5.625556945800781, "kl": 0.1473388671875, "learning_rate": 8.480963480963482e-07, "loss": 0.0001, "reward": 1.125, "reward_std": 0.3745020925998688, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 391 }, { "clip_ratio": 0.0, "completion_length": 744.40625, "epoch": 0.4568764568764569, "grad_norm": 1.5799556970596313, "kl": 0.1248779296875, "learning_rate": 8.477078477078477e-07, "loss": 0.0001, "reward": 1.28125, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 392 }, { "clip_ratio": 0.0, "completion_length": 785.15625, "epoch": 0.458041958041958, "grad_norm": 0.7427676916122437, "kl": 0.107421875, "learning_rate": 8.473193473193473e-07, "loss": 0.0001, "reward": 1.03125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 393 }, { "clip_ratio": 0.0, "completion_length": 654.6875, "epoch": 0.4592074592074592, "grad_norm": 4.764172077178955, "kl": 0.15087890625, "learning_rate": 8.469308469308469e-07, "loss": 0.0002, "reward": 1.40625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 394 }, { "clip_ratio": 0.0, "completion_length": 618.4375, "epoch": 0.4603729603729604, "grad_norm": 4.055548191070557, "kl": 0.1766357421875, "learning_rate": 8.465423465423465e-07, "loss": 0.0002, "reward": 1.8125, "reward_std": 0.408231720328331, "rewards/accuracy_reward": 0.375, "rewards/comparison_reward": 0.4375, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "step": 395 }, { "clip_ratio": 0.0, "completion_length": 665.4375, "epoch": 0.46153846153846156, "grad_norm": 1.201546311378479, "kl": 0.1234130859375, "learning_rate": 8.461538461538461e-07, "loss": 0.0001, "reward": 1.1875, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 396 }, { "clip_ratio": 0.0, "completion_length": 883.5, "epoch": 0.4627039627039627, "grad_norm": 2.041278600692749, "kl": 0.12646484375, "learning_rate": 8.457653457653457e-07, "loss": 0.0001, "reward": 1.1875, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 397 }, { "clip_ratio": 0.0, "completion_length": 568.4375, "epoch": 0.46386946386946387, "grad_norm": 2.912287712097168, "kl": 0.1524658203125, "learning_rate": 8.453768453768454e-07, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 398 }, { "clip_ratio": 0.0, "completion_length": 815.28125, "epoch": 0.46503496503496505, "grad_norm": 2.1345911026000977, "kl": 0.11279296875, "learning_rate": 8.44988344988345e-07, "loss": 0.0001, "reward": 1.3125, "reward_std": 0.3924051970243454, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 399 }, { "clip_ratio": 0.0, "completion_length": 747.1875, "epoch": 0.4662004662004662, "grad_norm": 0.6871504187583923, "kl": 0.110595703125, "learning_rate": 8.445998445998446e-07, "loss": 0.0001, "reward": 0.84375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 746.0, "epoch": 0.46736596736596736, "grad_norm": 9.267769813537598, "kl": 0.1416015625, "learning_rate": 8.442113442113442e-07, "loss": 0.0001, "reward": 1.4375, "reward_std": 0.4261348247528076, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 401 }, { "clip_ratio": 0.0, "completion_length": 924.78125, "epoch": 0.46853146853146854, "grad_norm": 1.5156805515289307, "kl": 0.1278076171875, "learning_rate": 8.438228438228437e-07, "loss": 0.0001, "reward": 1.28125, "reward_std": 0.35564958304166794, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "step": 402 }, { "clip_ratio": 0.0, "completion_length": 605.4375, "epoch": 0.4696969696969697, "grad_norm": 1.6157333850860596, "kl": 0.1265869140625, "learning_rate": 8.434343434343434e-07, "loss": 0.0001, "reward": 1.21875, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.25, "step": 403 }, { "clip_ratio": 0.0, "completion_length": 789.65625, "epoch": 0.47086247086247085, "grad_norm": 2.9461214542388916, "kl": 0.1134033203125, "learning_rate": 8.43045843045843e-07, "loss": 0.0001, "reward": 1.03125, "reward_std": 0.3198433741927147, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 404 }, { "clip_ratio": 0.0, "completion_length": 880.5, "epoch": 0.47202797202797203, "grad_norm": 2.7524468898773193, "kl": 0.11376953125, "learning_rate": 8.426573426573427e-07, "loss": 0.0001, "reward": 1.21875, "reward_std": 0.35564958304166794, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 947.375, "epoch": 0.4731934731934732, "grad_norm": 2.3911845684051514, "kl": 0.1090087890625, "learning_rate": 8.422688422688422e-07, "loss": 0.0001, "reward": 1.46875, "reward_std": 0.5038893818855286, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 406 }, { "clip_ratio": 0.0, "completion_length": 781.46875, "epoch": 0.47435897435897434, "grad_norm": 3.2014145851135254, "kl": 0.162841796875, "learning_rate": 8.418803418803419e-07, "loss": 0.0002, "reward": 1.28125, "reward_std": 0.3608423173427582, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "step": 407 }, { "clip_ratio": 0.0, "completion_length": 642.53125, "epoch": 0.4755244755244755, "grad_norm": 8.713068008422852, "kl": 0.2125244140625, "learning_rate": 8.414918414918415e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.4375, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "step": 408 }, { "clip_ratio": 0.0, "completion_length": 610.6875, "epoch": 0.4766899766899767, "grad_norm": 2.27583384513855, "kl": 0.166015625, "learning_rate": 8.41103341103341e-07, "loss": 0.0002, "reward": 1.28125, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 409 }, { "clip_ratio": 0.0, "completion_length": 750.03125, "epoch": 0.47785547785547783, "grad_norm": 1.337859034538269, "kl": 0.13134765625, "learning_rate": 8.407148407148406e-07, "loss": 0.0001, "reward": 1.1875, "reward_std": 0.249358132481575, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 833.5, "epoch": 0.479020979020979, "grad_norm": 1.3864781856536865, "kl": 0.1142578125, "learning_rate": 8.403263403263403e-07, "loss": 0.0001, "reward": 1.28125, "reward_std": 0.2630179077386856, "rewards/accuracy_reward": 0.53125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 411 }, { "clip_ratio": 0.0, "completion_length": 483.53125, "epoch": 0.4801864801864802, "grad_norm": 21.36132049560547, "kl": 0.302490234375, "learning_rate": 8.3993783993784e-07, "loss": 0.0003, "reward": 1.6875, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "step": 412 }, { "clip_ratio": 0.0, "completion_length": 700.59375, "epoch": 0.4813519813519814, "grad_norm": 0.12803255021572113, "kl": 0.2978515625, "learning_rate": 8.395493395493395e-07, "loss": 0.0003, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 413 }, { "clip_ratio": 0.0, "completion_length": 638.375, "epoch": 0.4825174825174825, "grad_norm": 1.7832533121109009, "kl": 0.1363525390625, "learning_rate": 8.391608391608391e-07, "loss": 0.0001, "reward": 1.5625, "reward_std": 0.3924051970243454, "rewards/accuracy_reward": 0.5625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.6666666666666667, "step": 414 }, { "clip_ratio": 0.0, "completion_length": 655.875, "epoch": 0.4836829836829837, "grad_norm": 2.9388411045074463, "kl": 0.21533203125, "learning_rate": 8.387723387723388e-07, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.46875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 921.21875, "epoch": 0.48484848484848486, "grad_norm": 1.3081485033035278, "kl": 0.1192626953125, "learning_rate": 8.383838383838383e-07, "loss": 0.0001, "reward": 0.71875, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.5, "rewards/score_task": 2.0, "step": 416 }, { "clip_ratio": 0.0, "completion_length": 755.0, "epoch": 0.486013986013986, "grad_norm": 0.7335625886917114, "kl": 0.1165771484375, "learning_rate": 8.379953379953379e-07, "loss": 0.0001, "reward": 0.78125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 417 }, { "clip_ratio": 0.0, "completion_length": 731.96875, "epoch": 0.48717948717948717, "grad_norm": 0.8384415507316589, "kl": 0.1170654296875, "learning_rate": 8.376068376068375e-07, "loss": 0.0001, "reward": 1.125, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 418 }, { "clip_ratio": 0.0, "completion_length": 734.21875, "epoch": 0.48834498834498835, "grad_norm": 4.455101490020752, "kl": 0.173095703125, "learning_rate": 8.372183372183373e-07, "loss": 0.0002, "reward": 1.375, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 419 }, { "clip_ratio": 0.0, "completion_length": 876.03125, "epoch": 0.48951048951048953, "grad_norm": 1.2502459287643433, "kl": 0.1121826171875, "learning_rate": 8.368298368298368e-07, "loss": 0.0001, "reward": 0.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.5, "rewards/score_task": 2.0, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 884.9375, "epoch": 0.49067599067599066, "grad_norm": 5.212028503417969, "kl": 0.1796875, "learning_rate": 8.364413364413364e-07, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.3514062538743019, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 421 }, { "clip_ratio": 0.0, "completion_length": 694.71875, "epoch": 0.49184149184149184, "grad_norm": 1.9139832258224487, "kl": 0.216552734375, "learning_rate": 8.36052836052836e-07, "loss": 0.0002, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.46875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "step": 422 }, { "clip_ratio": 0.0, "completion_length": 891.34375, "epoch": 0.493006993006993, "grad_norm": 0.6843790411949158, "kl": 0.193115234375, "learning_rate": 8.356643356643356e-07, "loss": 0.0002, "reward": 0.875, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.5, "step": 423 }, { "clip_ratio": 0.0, "completion_length": 849.59375, "epoch": 0.49417249417249415, "grad_norm": 0.010363442823290825, "kl": 0.1898193359375, "learning_rate": 8.352758352758352e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "step": 424 }, { "clip_ratio": 0.0, "completion_length": 610.0, "epoch": 0.49533799533799533, "grad_norm": 11.937487602233887, "kl": 0.1771240234375, "learning_rate": 8.348873348873348e-07, "loss": 0.0002, "reward": 1.53125, "reward_std": 0.2630179077386856, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.6666666666666667, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 774.65625, "epoch": 0.4965034965034965, "grad_norm": 2.07997465133667, "kl": 0.219970703125, "learning_rate": 8.344988344988346e-07, "loss": 0.0002, "reward": 1.125, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 426 }, { "clip_ratio": 0.0, "completion_length": 1060.625, "epoch": 0.4976689976689977, "grad_norm": 17.414432525634766, "kl": 0.12744140625, "learning_rate": 8.341103341103341e-07, "loss": 0.0001, "reward": 1.25, "reward_std": 0.3945523276925087, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 427 }, { "clip_ratio": 0.0, "completion_length": 529.0, "epoch": 0.4988344988344988, "grad_norm": 14.554543495178223, "kl": 0.190185546875, "learning_rate": 8.337218337218337e-07, "loss": 0.0002, "reward": 1.34375, "reward_std": 0.47137709707021713, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.3333333333333333, "step": 428 }, { "clip_ratio": 0.0, "completion_length": 1174.75, "epoch": 0.5, "grad_norm": 1.0349880456924438, "kl": 0.124755859375, "learning_rate": 8.333333333333333e-07, "loss": 0.0001, "reward": 0.90625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.5, "step": 429 }, { "clip_ratio": 0.0, "completion_length": 760.375, "epoch": 0.5011655011655012, "grad_norm": 5.084792137145996, "kl": 0.242919921875, "learning_rate": 8.329448329448329e-07, "loss": 0.0002, "reward": 1.15625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.4375, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.71875, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 890.5, "epoch": 0.5023310023310024, "grad_norm": 0.6649951934814453, "kl": 0.119140625, "learning_rate": 8.325563325563325e-07, "loss": 0.0001, "reward": 1.15625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 431 }, { "clip_ratio": 0.0, "completion_length": 886.03125, "epoch": 0.5034965034965035, "grad_norm": 10.201469421386719, "kl": 0.144775390625, "learning_rate": 8.321678321678321e-07, "loss": 0.0001, "reward": 1.375, "reward_std": 0.3650856465101242, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 432 }, { "clip_ratio": 0.0, "completion_length": 870.625, "epoch": 0.5046620046620046, "grad_norm": 0.7406434416770935, "kl": 0.1258544921875, "learning_rate": 8.317793317793318e-07, "loss": 0.0001, "reward": 0.9375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 433 }, { "clip_ratio": 0.0, "completion_length": 474.5, "epoch": 0.5058275058275058, "grad_norm": 6.302752494812012, "kl": 0.2119140625, "learning_rate": 8.313908313908314e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.4375, "rewards/comparison_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 434 }, { "clip_ratio": 0.0, "completion_length": 1076.15625, "epoch": 0.506993006993007, "grad_norm": 1.0616064071655273, "kl": 0.1236572265625, "learning_rate": 8.31002331002331e-07, "loss": 0.0001, "reward": 1.15625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 671.09375, "epoch": 0.5081585081585082, "grad_norm": 0.9370782971382141, "kl": 0.1820068359375, "learning_rate": 8.306138306138306e-07, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 436 }, { "clip_ratio": 0.0, "completion_length": 520.40625, "epoch": 0.5093240093240093, "grad_norm": 16.23700714111328, "kl": 0.3092041015625, "learning_rate": 8.302253302253301e-07, "loss": 0.0003, "reward": 1.3125, "reward_std": 0.249358132481575, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.5625, "rewards/comparison_task": 1.6666666666666667, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "step": 437 }, { "clip_ratio": 0.0, "completion_length": 793.78125, "epoch": 0.5104895104895105, "grad_norm": 0.7512879967689514, "kl": 0.1961669921875, "learning_rate": 8.298368298368298e-07, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 438 }, { "clip_ratio": 0.0, "completion_length": 835.78125, "epoch": 0.5116550116550117, "grad_norm": 3.6679623126983643, "kl": 0.1712646484375, "learning_rate": 8.294483294483294e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.3514062538743019, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 439 }, { "clip_ratio": 0.0, "completion_length": 720.5, "epoch": 0.5128205128205128, "grad_norm": 3.103271961212158, "kl": 0.2510986328125, "learning_rate": 8.290598290598291e-07, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.4375, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 864.59375, "epoch": 0.513986013986014, "grad_norm": 1.1184890270233154, "kl": 0.14794921875, "learning_rate": 8.286713286713286e-07, "loss": 0.0001, "reward": 1.0625, "reward_std": 0.249358132481575, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 441 }, { "clip_ratio": 0.0, "completion_length": 850.625, "epoch": 0.5151515151515151, "grad_norm": 1.9922239780426025, "kl": 0.12548828125, "learning_rate": 8.282828282828283e-07, "loss": 0.0001, "reward": 1.6875, "reward_std": 0.38298875093460083, "rewards/accuracy_reward": 0.6875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 442 }, { "clip_ratio": 0.0, "completion_length": 999.375, "epoch": 0.5163170163170163, "grad_norm": 2.877669334411621, "kl": 0.1282958984375, "learning_rate": 8.278943278943279e-07, "loss": 0.0001, "reward": 1.3125, "reward_std": 0.2587745785713196, "rewards/accuracy_reward": 0.5625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 443 }, { "clip_ratio": 0.0, "completion_length": 793.03125, "epoch": 0.5174825174825175, "grad_norm": 1.136614441871643, "kl": 0.1676025390625, "learning_rate": 8.275058275058274e-07, "loss": 0.0002, "reward": 1.28125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 444 }, { "clip_ratio": 0.0, "completion_length": 909.0, "epoch": 0.5186480186480187, "grad_norm": 2.6212611198425293, "kl": 0.1175537109375, "learning_rate": 8.27117327117327e-07, "loss": 0.0001, "reward": 1.1875, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 917.96875, "epoch": 0.5198135198135199, "grad_norm": 1.165771722793579, "kl": 0.13916015625, "learning_rate": 8.267288267288267e-07, "loss": 0.0001, "reward": 0.84375, "reward_std": 0.22201896458864212, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.5, "rewards/score_task": 2.0, "step": 446 }, { "clip_ratio": 0.0, "completion_length": 972.3125, "epoch": 0.5209790209790209, "grad_norm": 0.7724711298942566, "kl": 0.131591796875, "learning_rate": 8.263403263403264e-07, "loss": 0.0001, "reward": 1.59375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.59375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 447 }, { "clip_ratio": 0.0, "completion_length": 903.3125, "epoch": 0.5221445221445221, "grad_norm": 0.006941840052604675, "kl": 0.125244140625, "learning_rate": 8.259518259518259e-07, "loss": 0.0001, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 448 }, { "clip_ratio": 0.0, "completion_length": 1053.5, "epoch": 0.5233100233100233, "grad_norm": 8.70131778717041, "kl": 0.13525390625, "learning_rate": 8.255633255633255e-07, "loss": 0.0001, "reward": 1.03125, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 449 }, { "clip_ratio": 0.0, "completion_length": 693.03125, "epoch": 0.5244755244755245, "grad_norm": 0.007931019179522991, "kl": 0.1923828125, "learning_rate": 8.251748251748252e-07, "loss": 0.0002, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 955.34375, "epoch": 0.5256410256410257, "grad_norm": 0.005900613032281399, "kl": 0.1298828125, "learning_rate": 8.247863247863247e-07, "loss": 0.0001, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.5, "step": 451 }, { "clip_ratio": 0.0, "completion_length": 627.15625, "epoch": 0.5268065268065268, "grad_norm": 3.4403765201568604, "kl": 0.2109375, "learning_rate": 8.243978243978243e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 452 }, { "clip_ratio": 0.0, "completion_length": 807.3125, "epoch": 0.527972027972028, "grad_norm": 0.6302827000617981, "kl": 0.12744140625, "learning_rate": 8.240093240093239e-07, "loss": 0.0001, "reward": 1.5625, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.5625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 453 }, { "clip_ratio": 0.0, "completion_length": 938.15625, "epoch": 0.5291375291375291, "grad_norm": 0.9920759797096252, "kl": 0.1378173828125, "learning_rate": 8.236208236208237e-07, "loss": 0.0001, "reward": 0.65625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 454 }, { "clip_ratio": 0.0, "completion_length": 892.84375, "epoch": 0.5303030303030303, "grad_norm": 2.288360118865967, "kl": 0.1307373046875, "learning_rate": 8.232323232323232e-07, "loss": 0.0001, "reward": 1.03125, "reward_std": 0.2630179077386856, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 744.53125, "epoch": 0.5314685314685315, "grad_norm": 12.81175422668457, "kl": 0.206298828125, "learning_rate": 8.228438228438228e-07, "loss": 0.0002, "reward": 0.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.5, "step": 456 }, { "clip_ratio": 0.0, "completion_length": 690.875, "epoch": 0.5326340326340326, "grad_norm": 1.8180357217788696, "kl": 0.17333984375, "learning_rate": 8.224553224553224e-07, "loss": 0.0002, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 457 }, { "clip_ratio": 0.0, "completion_length": 790.03125, "epoch": 0.5337995337995338, "grad_norm": 1.1499828100204468, "kl": 0.245849609375, "learning_rate": 8.22066822066822e-07, "loss": 0.0002, "reward": 1.65625, "reward_std": 0.22201896458864212, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "step": 458 }, { "clip_ratio": 0.0, "completion_length": 949.5625, "epoch": 0.534965034965035, "grad_norm": 0.9527376294136047, "kl": 0.141357421875, "learning_rate": 8.216783216783216e-07, "loss": 0.0001, "reward": 1.125, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.25, "rewards/format_reward": 0.75, "step": 459 }, { "clip_ratio": 0.0, "completion_length": 938.875, "epoch": 0.5361305361305362, "grad_norm": 1.1264575719833374, "kl": 0.13427734375, "learning_rate": 8.212898212898212e-07, "loss": 0.0001, "reward": 1.0625, "reward_std": 0.249358132481575, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.75, "rewards/format_reward": 0.75, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 839.5, "epoch": 0.5372960372960373, "grad_norm": 0.005617175716906786, "kl": 0.127197265625, "learning_rate": 8.20901320901321e-07, "loss": 0.0001, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 461 }, { "clip_ratio": 0.0, "completion_length": 694.1875, "epoch": 0.5384615384615384, "grad_norm": 13.17356014251709, "kl": 0.196044921875, "learning_rate": 8.205128205128205e-07, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 462 }, { "clip_ratio": 0.0, "completion_length": 762.65625, "epoch": 0.5396270396270396, "grad_norm": 8.398935317993164, "kl": 0.197509765625, "learning_rate": 8.201243201243201e-07, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.5, "step": 463 }, { "clip_ratio": 0.0, "completion_length": 796.40625, "epoch": 0.5407925407925408, "grad_norm": 2.9575586318969727, "kl": 0.224853515625, "learning_rate": 8.197358197358197e-07, "loss": 0.0002, "reward": 1.34375, "reward_std": 0.3061639815568924, "rewards/accuracy_reward": 0.375, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 464 }, { "clip_ratio": 0.0, "completion_length": 726.5, "epoch": 0.541958041958042, "grad_norm": 2.5497167110443115, "kl": 0.203125, "learning_rate": 8.193473193473192e-07, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 727.625, "epoch": 0.5431235431235432, "grad_norm": 1.3912338018417358, "kl": 0.213134765625, "learning_rate": 8.189588189588189e-07, "loss": 0.0002, "reward": 1.8125, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.5625, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "step": 466 }, { "clip_ratio": 0.0, "completion_length": 521.28125, "epoch": 0.5442890442890443, "grad_norm": 2.025505542755127, "kl": 0.2462158203125, "learning_rate": 8.185703185703186e-07, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.46875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 467 }, { "clip_ratio": 0.0, "completion_length": 845.9375, "epoch": 0.5454545454545454, "grad_norm": 1.7023087739944458, "kl": 0.130859375, "learning_rate": 8.181818181818182e-07, "loss": 0.0001, "reward": 1.46875, "reward_std": 0.4628904387354851, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 468 }, { "clip_ratio": 0.0, "completion_length": 476.125, "epoch": 0.5466200466200466, "grad_norm": 3.5877444744110107, "kl": 0.2047119140625, "learning_rate": 8.177933177933178e-07, "loss": 0.0002, "reward": 1.15625, "reward_std": 0.22201896458864212, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 469 }, { "clip_ratio": 0.0, "completion_length": 673.84375, "epoch": 0.5477855477855478, "grad_norm": 9.94251823425293, "kl": 0.225341796875, "learning_rate": 8.174048174048174e-07, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 727.90625, "epoch": 0.548951048951049, "grad_norm": 49.3470458984375, "kl": 0.2054443359375, "learning_rate": 8.17016317016317e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.4261348247528076, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 471 }, { "clip_ratio": 0.0, "completion_length": 875.53125, "epoch": 0.5501165501165501, "grad_norm": 0.005146713927388191, "kl": 0.1240234375, "learning_rate": 8.166278166278165e-07, "loss": 0.0001, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 472 }, { "clip_ratio": 0.0, "completion_length": 851.0625, "epoch": 0.5512820512820513, "grad_norm": 0.6207136511802673, "kl": 0.1790771484375, "learning_rate": 8.162393162393162e-07, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.71875, "step": 473 }, { "clip_ratio": 0.0, "completion_length": 565.46875, "epoch": 0.5524475524475524, "grad_norm": 3.965127944946289, "kl": 0.292724609375, "learning_rate": 8.158508158508159e-07, "loss": 0.0003, "reward": 1.65625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.40625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 474 }, { "clip_ratio": 0.0, "completion_length": 489.53125, "epoch": 0.5536130536130536, "grad_norm": 1.500765323638916, "kl": 0.314208984375, "learning_rate": 8.154623154623155e-07, "loss": 0.0003, "reward": 1.28125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.5, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 736.875, "epoch": 0.5547785547785548, "grad_norm": 0.9015250205993652, "kl": 0.13671875, "learning_rate": 8.15073815073815e-07, "loss": 0.0001, "reward": 0.65625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 476 }, { "clip_ratio": 0.0, "completion_length": 731.1875, "epoch": 0.5559440559440559, "grad_norm": 2.6188156604766846, "kl": 0.209716796875, "learning_rate": 8.146853146853147e-07, "loss": 0.0002, "reward": 1.625, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "step": 477 }, { "clip_ratio": 0.0, "completion_length": 776.125, "epoch": 0.5571095571095571, "grad_norm": 1.5987331867218018, "kl": 0.131103515625, "learning_rate": 8.142968142968143e-07, "loss": 0.0001, "reward": 1.125, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 478 }, { "clip_ratio": 0.0, "completion_length": 890.8125, "epoch": 0.5582750582750583, "grad_norm": 0.006465711630880833, "kl": 0.1317138671875, "learning_rate": 8.139083139083138e-07, "loss": 0.0001, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 479 }, { "clip_ratio": 0.0, "completion_length": 1095.25, "epoch": 0.5594405594405595, "grad_norm": 2.79910945892334, "kl": 0.125, "learning_rate": 8.135198135198134e-07, "loss": 0.0001, "reward": 1.34375, "reward_std": 0.22201896458864212, "rewards/accuracy_reward": 0.59375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.25, "rewards/format_reward": 0.75, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 925.96875, "epoch": 0.5606060606060606, "grad_norm": 1.4351104497909546, "kl": 0.1357421875, "learning_rate": 8.131313131313132e-07, "loss": 0.0001, "reward": 1.21875, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 481 }, { "clip_ratio": 0.0, "completion_length": 747.15625, "epoch": 0.5617715617715617, "grad_norm": 2.9541876316070557, "kl": 0.2017822265625, "learning_rate": 8.127428127428128e-07, "loss": 0.0002, "reward": 1.65625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 482 }, { "clip_ratio": 0.0, "completion_length": 898.90625, "epoch": 0.5629370629370629, "grad_norm": 4.0103654861450195, "kl": 0.14013671875, "learning_rate": 8.123543123543123e-07, "loss": 0.0001, "reward": 1.15625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 483 }, { "clip_ratio": 0.0, "completion_length": 605.09375, "epoch": 0.5641025641025641, "grad_norm": 5.096508026123047, "kl": 0.2154541015625, "learning_rate": 8.119658119658119e-07, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 484 }, { "clip_ratio": 0.0, "completion_length": 669.3125, "epoch": 0.5652680652680653, "grad_norm": 0.010297026485204697, "kl": 0.190185546875, "learning_rate": 8.115773115773116e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 932.90625, "epoch": 0.5664335664335665, "grad_norm": 0.8793935775756836, "kl": 0.12939453125, "learning_rate": 8.111888111888111e-07, "loss": 0.0001, "reward": 1.09375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 486 }, { "clip_ratio": 0.0, "completion_length": 857.15625, "epoch": 0.5675990675990676, "grad_norm": 3.229492664337158, "kl": 0.134521484375, "learning_rate": 8.108003108003107e-07, "loss": 0.0001, "reward": 1.3125, "reward_std": 0.3514062538743019, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 487 }, { "clip_ratio": 0.0, "completion_length": 943.71875, "epoch": 0.5687645687645687, "grad_norm": 1.0932947397232056, "kl": 0.132568359375, "learning_rate": 8.104118104118104e-07, "loss": 0.0001, "reward": 1.375, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 488 }, { "clip_ratio": 0.0, "completion_length": 590.96875, "epoch": 0.5699300699300699, "grad_norm": 1.727843165397644, "kl": 0.273193359375, "learning_rate": 8.100233100233101e-07, "loss": 0.0003, "reward": 1.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.46875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 489 }, { "clip_ratio": 0.0, "completion_length": 989.4375, "epoch": 0.5710955710955711, "grad_norm": 1.0461091995239258, "kl": 0.1275634765625, "learning_rate": 8.096348096348096e-07, "loss": 0.0001, "reward": 1.59375, "reward_std": 0.22201896458864212, "rewards/accuracy_reward": 0.59375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 911.6875, "epoch": 0.5722610722610723, "grad_norm": 0.9787514209747314, "kl": 0.1376953125, "learning_rate": 8.092463092463092e-07, "loss": 0.0001, "reward": 0.9375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.5, "rewards/score_task": 2.0, "step": 491 }, { "clip_ratio": 0.0, "completion_length": 907.59375, "epoch": 0.5734265734265734, "grad_norm": 1.3085023164749146, "kl": 0.1444091796875, "learning_rate": 8.088578088578088e-07, "loss": 0.0001, "reward": 0.90625, "reward_std": 0.22201896458864212, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 492 }, { "clip_ratio": 0.0, "completion_length": 687.9375, "epoch": 0.5745920745920746, "grad_norm": 4.679673671722412, "kl": 0.244140625, "learning_rate": 8.084693084693084e-07, "loss": 0.0002, "reward": 0.9375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 493 }, { "clip_ratio": 0.0, "completion_length": 963.875, "epoch": 0.5757575757575758, "grad_norm": 2.1537208557128906, "kl": 0.14501953125, "learning_rate": 8.08080808080808e-07, "loss": 0.0001, "reward": 1.46875, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 494 }, { "clip_ratio": 0.0, "completion_length": 745.875, "epoch": 0.5769230769230769, "grad_norm": 1.1613861322402954, "kl": 0.15087890625, "learning_rate": 8.076923076923077e-07, "loss": 0.0002, "reward": 0.75, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.71875, "rewards/score_task": 1.0, "step": 495 }, { "clip_ratio": 0.0, "completion_length": 715.59375, "epoch": 0.578088578088578, "grad_norm": 2.177772045135498, "kl": 0.18994140625, "learning_rate": 8.073038073038073e-07, "loss": 0.0002, "reward": 1.9375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.75, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 496 }, { "clip_ratio": 0.0, "completion_length": 845.375, "epoch": 0.5792540792540792, "grad_norm": 16.282629013061523, "kl": 0.22900390625, "learning_rate": 8.069153069153069e-07, "loss": 0.0002, "reward": 1.5625, "reward_std": 0.249358132481575, "rewards/accuracy_reward": 0.375, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "step": 497 }, { "clip_ratio": 0.0, "completion_length": 870.84375, "epoch": 0.5804195804195804, "grad_norm": 0.7765024304389954, "kl": 0.1494140625, "learning_rate": 8.065268065268065e-07, "loss": 0.0001, "reward": 0.6875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.5, "rewards/score_task": 1.5, "step": 498 }, { "clip_ratio": 0.0, "completion_length": 985.84375, "epoch": 0.5815850815850816, "grad_norm": 0.7569090723991394, "kl": 0.148193359375, "learning_rate": 8.061383061383061e-07, "loss": 0.0001, "reward": 0.9375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 499 }, { "clip_ratio": 0.0, "completion_length": 545.96875, "epoch": 0.5827505827505828, "grad_norm": 18.709705352783203, "kl": 0.2413330078125, "learning_rate": 8.057498057498056e-07, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 500 }, { "clip_ratio": 0.0, "completion_length": 819.5625, "epoch": 0.583916083916084, "grad_norm": 1.3821516036987305, "kl": 0.266357421875, "learning_rate": 8.053613053613053e-07, "loss": 0.0003, "reward": 1.40625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "step": 501 }, { "clip_ratio": 0.0, "completion_length": 861.65625, "epoch": 0.585081585081585, "grad_norm": 3.9634902477264404, "kl": 0.231201171875, "learning_rate": 8.04972804972805e-07, "loss": 0.0002, "reward": 0.90625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "step": 502 }, { "clip_ratio": 0.0, "completion_length": 730.6875, "epoch": 0.5862470862470862, "grad_norm": 0.7899097800254822, "kl": 0.15185546875, "learning_rate": 8.045843045843046e-07, "loss": 0.0002, "reward": 1.40625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 503 }, { "clip_ratio": 0.0, "completion_length": 653.25, "epoch": 0.5874125874125874, "grad_norm": 21.5330867767334, "kl": 0.245361328125, "learning_rate": 8.041958041958041e-07, "loss": 0.0002, "reward": 1.4375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.4375, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 504 }, { "clip_ratio": 0.0, "completion_length": 1042.90625, "epoch": 0.5885780885780886, "grad_norm": 0.006581469904631376, "kl": 0.14013671875, "learning_rate": 8.038073038073038e-07, "loss": 0.0001, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.75, "rewards/format_reward": 0.5, "step": 505 }, { "clip_ratio": 0.0, "completion_length": 997.90625, "epoch": 0.5897435897435898, "grad_norm": 1.0014102458953857, "kl": 0.146240234375, "learning_rate": 8.034188034188034e-07, "loss": 0.0001, "reward": 1.375, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 506 }, { "clip_ratio": 0.0, "completion_length": 863.3125, "epoch": 0.5909090909090909, "grad_norm": 0.9787000417709351, "kl": 0.1494140625, "learning_rate": 8.030303030303029e-07, "loss": 0.0001, "reward": 1.6875, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.6875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.3333333333333333, "step": 507 }, { "clip_ratio": 0.0, "completion_length": 1063.6875, "epoch": 0.5920745920745921, "grad_norm": 0.6609181761741638, "kl": 0.12939453125, "learning_rate": 8.026418026418025e-07, "loss": 0.0001, "reward": 1.125, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 508 }, { "clip_ratio": 0.0, "completion_length": 912.03125, "epoch": 0.5932400932400932, "grad_norm": 7.026159763336182, "kl": 0.15087890625, "learning_rate": 8.022533022533023e-07, "loss": 0.0002, "reward": 1.4375, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 509 }, { "clip_ratio": 0.0, "completion_length": 844.4375, "epoch": 0.5944055944055944, "grad_norm": 0.9700106382369995, "kl": 0.229736328125, "learning_rate": 8.018648018648019e-07, "loss": 0.0002, "reward": 0.9375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.5, "step": 510 }, { "clip_ratio": 0.0, "completion_length": 660.15625, "epoch": 0.5955710955710956, "grad_norm": 22.417972564697266, "kl": 0.25048828125, "learning_rate": 8.014763014763014e-07, "loss": 0.0003, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 511 }, { "clip_ratio": 0.0, "completion_length": 521.4375, "epoch": 0.5967365967365967, "grad_norm": 4.730589866638184, "kl": 0.3544921875, "learning_rate": 8.01087801087801e-07, "loss": 0.0004, "reward": 1.875, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.40625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 512 }, { "clip_ratio": 0.0, "completion_length": 1079.6875, "epoch": 0.5979020979020979, "grad_norm": 2.1530652046203613, "kl": 0.16162109375, "learning_rate": 8.006993006993007e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 513 }, { "clip_ratio": 0.0, "completion_length": 674.15625, "epoch": 0.5990675990675991, "grad_norm": 5.262468338012695, "kl": 0.306396484375, "learning_rate": 8.003108003108002e-07, "loss": 0.0003, "reward": 1.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.4375, "rewards/comparison_task": 1.5, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 514 }, { "clip_ratio": 0.0, "completion_length": 1081.09375, "epoch": 0.6002331002331003, "grad_norm": 0.00979061983525753, "kl": 0.15185546875, "learning_rate": 7.999222999222998e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 515 }, { "clip_ratio": 0.0, "completion_length": 976.15625, "epoch": 0.6013986013986014, "grad_norm": 0.6070682406425476, "kl": 0.160888671875, "learning_rate": 7.995337995337996e-07, "loss": 0.0002, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 516 }, { "clip_ratio": 0.0, "completion_length": 929.5, "epoch": 0.6025641025641025, "grad_norm": 0.8508919477462769, "kl": 0.15087890625, "learning_rate": 7.991452991452992e-07, "loss": 0.0002, "reward": 1.65625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.65625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 517 }, { "clip_ratio": 0.0, "completion_length": 473.46875, "epoch": 0.6037296037296037, "grad_norm": 3.5401549339294434, "kl": 0.267822265625, "learning_rate": 7.987567987567987e-07, "loss": 0.0003, "reward": 1.90625, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.4375, "rewards/comparison_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 518 }, { "clip_ratio": 0.0, "completion_length": 887.84375, "epoch": 0.6048951048951049, "grad_norm": 2.8800384998321533, "kl": 0.205078125, "learning_rate": 7.983682983682983e-07, "loss": 0.0002, "reward": 1.375, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 519 }, { "clip_ratio": 0.0, "completion_length": 996.625, "epoch": 0.6060606060606061, "grad_norm": 0.5787278413772583, "kl": 0.171142578125, "learning_rate": 7.97979797979798e-07, "loss": 0.0002, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.71875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 520 }, { "clip_ratio": 0.0, "completion_length": 910.75, "epoch": 0.6072261072261073, "grad_norm": 0.7896198630332947, "kl": 0.19677734375, "learning_rate": 7.975912975912975e-07, "loss": 0.0002, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "step": 521 }, { "clip_ratio": 0.0, "completion_length": 958.34375, "epoch": 0.6083916083916084, "grad_norm": 0.009917091578245163, "kl": 0.160400390625, "learning_rate": 7.972027972027971e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 522 }, { "clip_ratio": 0.0, "completion_length": 747.0, "epoch": 0.6095571095571095, "grad_norm": 2.7684991359710693, "kl": 0.246826171875, "learning_rate": 7.968142968142968e-07, "loss": 0.0002, "reward": 1.53125, "reward_std": 0.3377464786171913, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 523 }, { "clip_ratio": 0.0, "completion_length": 1054.78125, "epoch": 0.6107226107226107, "grad_norm": 1.3051972389221191, "kl": 0.159423828125, "learning_rate": 7.964257964257965e-07, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.25, "rewards/format_reward": 0.71875, "step": 524 }, { "clip_ratio": 0.0, "completion_length": 1030.1875, "epoch": 0.6118881118881119, "grad_norm": 0.5977635979652405, "kl": 0.158203125, "learning_rate": 7.96037296037296e-07, "loss": 0.0002, "reward": 1.65625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.6875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 0.96875, "step": 525 }, { "clip_ratio": 0.0, "completion_length": 1019.40625, "epoch": 0.6130536130536131, "grad_norm": 1.2664049863815308, "kl": 0.169921875, "learning_rate": 7.956487956487956e-07, "loss": 0.0002, "reward": 1.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.6875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 526 }, { "clip_ratio": 0.0, "completion_length": 928.53125, "epoch": 0.6142191142191142, "grad_norm": 0.8407391905784607, "kl": 0.16357421875, "learning_rate": 7.952602952602952e-07, "loss": 0.0002, "reward": 1.28125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 527 }, { "clip_ratio": 0.0, "completion_length": 671.34375, "epoch": 0.6153846153846154, "grad_norm": 0.04718818515539169, "kl": 0.288330078125, "learning_rate": 7.948717948717948e-07, "loss": 0.0003, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.5, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "step": 528 }, { "clip_ratio": 0.0, "completion_length": 889.34375, "epoch": 0.6165501165501166, "grad_norm": 0.6027591228485107, "kl": 0.16943359375, "learning_rate": 7.944832944832944e-07, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 529 }, { "clip_ratio": 0.0, "completion_length": 1095.3125, "epoch": 0.6177156177156177, "grad_norm": 0.6514151096343994, "kl": 0.160888671875, "learning_rate": 7.940947940947941e-07, "loss": 0.0002, "reward": 1.65625, "reward_std": 0.18600594997406006, "rewards/accuracy_reward": 0.6875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 530 }, { "clip_ratio": 0.0, "completion_length": 853.3125, "epoch": 0.6188811188811189, "grad_norm": 5.958168983459473, "kl": 0.236083984375, "learning_rate": 7.937062937062937e-07, "loss": 0.0002, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "step": 531 }, { "clip_ratio": 0.0, "completion_length": 858.25, "epoch": 0.62004662004662, "grad_norm": 0.7513402104377747, "kl": 0.179443359375, "learning_rate": 7.933177933177933e-07, "loss": 0.0002, "reward": 1.28125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.53125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 532 }, { "clip_ratio": 0.0, "completion_length": 641.34375, "epoch": 0.6212121212121212, "grad_norm": 3.277092933654785, "kl": 0.234130859375, "learning_rate": 7.929292929292929e-07, "loss": 0.0002, "reward": 1.5625, "reward_std": 0.3104073107242584, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.6666666666666667, "step": 533 }, { "clip_ratio": 0.0, "completion_length": 1010.84375, "epoch": 0.6223776223776224, "grad_norm": 0.010684143751859665, "kl": 0.16162109375, "learning_rate": 7.925407925407925e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 534 }, { "clip_ratio": 0.0, "completion_length": 1078.15625, "epoch": 0.6235431235431236, "grad_norm": 60.14698791503906, "kl": 0.146728515625, "learning_rate": 7.92152292152292e-07, "loss": 0.0001, "reward": 1.59375, "reward_std": 0.22201896458864212, "rewards/accuracy_reward": 0.59375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 535 }, { "clip_ratio": 0.0, "completion_length": 1027.21875, "epoch": 0.6247086247086248, "grad_norm": 1.513410210609436, "kl": 0.15234375, "learning_rate": 7.917637917637917e-07, "loss": 0.0002, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.71875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 536 }, { "clip_ratio": 0.0, "completion_length": 1025.09375, "epoch": 0.6258741258741258, "grad_norm": 0.009823227301239967, "kl": 0.15185546875, "learning_rate": 7.913752913752914e-07, "loss": 0.0002, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 537 }, { "clip_ratio": 0.0, "completion_length": 738.75, "epoch": 0.627039627039627, "grad_norm": 4.842891693115234, "kl": 0.23486328125, "learning_rate": 7.90986790986791e-07, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 538 }, { "clip_ratio": 0.0, "completion_length": 1205.4375, "epoch": 0.6282051282051282, "grad_norm": 0.7836298942565918, "kl": 0.150634765625, "learning_rate": 7.905982905982905e-07, "loss": 0.0002, "reward": 1.6875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.6875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.75, "rewards/format_reward": 1.0, "step": 539 }, { "clip_ratio": 0.0, "completion_length": 991.5625, "epoch": 0.6293706293706294, "grad_norm": 0.8075949549674988, "kl": 0.18212890625, "learning_rate": 7.902097902097902e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 540 }, { "clip_ratio": 0.0, "completion_length": 470.75, "epoch": 0.6305361305361306, "grad_norm": 1.460820198059082, "kl": 0.29833984375, "learning_rate": 7.898212898212898e-07, "loss": 0.0003, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.71875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "step": 541 }, { "clip_ratio": 0.0, "completion_length": 1018.0625, "epoch": 0.6317016317016317, "grad_norm": 0.007929584011435509, "kl": 0.149658203125, "learning_rate": 7.894327894327893e-07, "loss": 0.0001, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 542 }, { "clip_ratio": 0.0, "completion_length": 657.84375, "epoch": 0.6328671328671329, "grad_norm": 3.4406723976135254, "kl": 0.266845703125, "learning_rate": 7.890442890442889e-07, "loss": 0.0003, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.4375, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "step": 543 }, { "clip_ratio": 0.0, "completion_length": 948.5625, "epoch": 0.634032634032634, "grad_norm": 0.8129713535308838, "kl": 0.19091796875, "learning_rate": 7.886557886557887e-07, "loss": 0.0002, "reward": 1.65625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "step": 544 }, { "clip_ratio": 0.0, "completion_length": 916.90625, "epoch": 0.6351981351981352, "grad_norm": 1.9235788583755493, "kl": 0.24462890625, "learning_rate": 7.882672882672883e-07, "loss": 0.0002, "reward": 1.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "step": 545 }, { "clip_ratio": 0.0, "completion_length": 654.21875, "epoch": 0.6363636363636364, "grad_norm": 3.5536673069000244, "kl": 0.2763671875, "learning_rate": 7.878787878787878e-07, "loss": 0.0003, "reward": 1.15625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 546 }, { "clip_ratio": 0.0, "completion_length": 1078.21875, "epoch": 0.6375291375291375, "grad_norm": 0.7203482389450073, "kl": 0.16162109375, "learning_rate": 7.874902874902874e-07, "loss": 0.0002, "reward": 1.6875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.6875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 547 }, { "clip_ratio": 0.0, "completion_length": 1093.875, "epoch": 0.6386946386946387, "grad_norm": 0.9199702143669128, "kl": 0.1668701171875, "learning_rate": 7.871017871017871e-07, "loss": 0.0002, "reward": 1.46875, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 548 }, { "clip_ratio": 0.0, "completion_length": 918.15625, "epoch": 0.6398601398601399, "grad_norm": 0.04324110224843025, "kl": 0.228271484375, "learning_rate": 7.867132867132866e-07, "loss": 0.0002, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 549 }, { "clip_ratio": 0.0, "completion_length": 1200.15625, "epoch": 0.6410256410256411, "grad_norm": 4.385340690612793, "kl": 0.1558837890625, "learning_rate": 7.863247863247862e-07, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 550 }, { "clip_ratio": 0.0, "completion_length": 1129.125, "epoch": 0.6421911421911422, "grad_norm": 0.9475212097167969, "kl": 0.1630859375, "learning_rate": 7.85936285936286e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 551 }, { "clip_ratio": 0.0, "completion_length": 1135.34375, "epoch": 0.6433566433566433, "grad_norm": 0.502124011516571, "kl": 0.1474609375, "learning_rate": 7.855477855477856e-07, "loss": 0.0001, "reward": 0.375, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.25, "step": 552 }, { "clip_ratio": 0.0, "completion_length": 1299.8125, "epoch": 0.6445221445221445, "grad_norm": 0.729947566986084, "kl": 0.1494140625, "learning_rate": 7.851592851592851e-07, "loss": 0.0001, "reward": 1.375, "reward_std": 0.3535533845424652, "rewards/accuracy_reward": 0.6875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 0.6875, "step": 553 }, { "clip_ratio": 0.0, "completion_length": 1160.15625, "epoch": 0.6456876456876457, "grad_norm": 0.009568476118147373, "kl": 0.153076171875, "learning_rate": 7.847707847707847e-07, "loss": 0.0002, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.75, "rewards/format_reward": 0.5, "step": 554 }, { "clip_ratio": 0.0, "completion_length": 596.34375, "epoch": 0.6468531468531469, "grad_norm": 2.3137471675872803, "kl": 0.29833984375, "learning_rate": 7.843822843822844e-07, "loss": 0.0003, "reward": 1.6875, "reward_std": 0.3514062538743019, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.46875, "rewards/comparison_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 555 }, { "clip_ratio": 0.0, "completion_length": 1254.125, "epoch": 0.6480186480186481, "grad_norm": 0.7167354226112366, "kl": 0.146484375, "learning_rate": 7.839937839937839e-07, "loss": 0.0001, "reward": 0.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.75, "rewards/format_reward": 0.5, "step": 556 }, { "clip_ratio": 0.0, "completion_length": 1271.0625, "epoch": 0.6491841491841492, "grad_norm": 0.8784035444259644, "kl": 0.140625, "learning_rate": 7.836052836052835e-07, "loss": 0.0001, "reward": 1.5625, "reward_std": 0.3471825420856476, "rewards/accuracy_reward": 0.5625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.75, "rewards/format_reward": 1.0, "step": 557 }, { "clip_ratio": 0.0, "completion_length": 895.8125, "epoch": 0.6503496503496503, "grad_norm": 2.1142184734344482, "kl": 0.229248046875, "learning_rate": 7.832167832167832e-07, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.71875, "rewards/score_task": 1.0, "step": 558 }, { "clip_ratio": 0.0, "completion_length": 849.90625, "epoch": 0.6515151515151515, "grad_norm": 5.608709812164307, "kl": 0.233154296875, "learning_rate": 7.828282828282829e-07, "loss": 0.0002, "reward": 1.0625, "reward_std": 0.31539323925971985, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.71875, "rewards/score_task": 1.0, "step": 559 }, { "clip_ratio": 0.0, "completion_length": 832.4375, "epoch": 0.6526806526806527, "grad_norm": 3.757068157196045, "kl": 0.2314453125, "learning_rate": 7.824397824397824e-07, "loss": 0.0002, "reward": 1.78125, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.5625, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 560 }, { "clip_ratio": 0.0, "completion_length": 844.03125, "epoch": 0.6538461538461539, "grad_norm": 2.252671718597412, "kl": 0.18994140625, "learning_rate": 7.82051282051282e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.2587745785713196, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.3333333333333333, "step": 561 }, { "clip_ratio": 0.0, "completion_length": 1047.28125, "epoch": 0.655011655011655, "grad_norm": 1.8573347330093384, "kl": 0.1904296875, "learning_rate": 7.816627816627816e-07, "loss": 0.0002, "reward": 1.4375, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 562 }, { "clip_ratio": 0.0, "completion_length": 1135.34375, "epoch": 0.6561771561771562, "grad_norm": 0.7356560826301575, "kl": 0.155029296875, "learning_rate": 7.812742812742812e-07, "loss": 0.0002, "reward": 1.15625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 563 }, { "clip_ratio": 0.0, "completion_length": 586.3125, "epoch": 0.6573426573426573, "grad_norm": 12.044687271118164, "kl": 0.344482421875, "learning_rate": 7.808857808857809e-07, "loss": 0.0003, "reward": 1.875, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.4375, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 564 }, { "clip_ratio": 0.0, "completion_length": 713.8125, "epoch": 0.6585081585081585, "grad_norm": 0.45428067445755005, "kl": 0.283935546875, "learning_rate": 7.804972804972805e-07, "loss": 0.0003, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 565 }, { "clip_ratio": 0.0, "completion_length": 1280.03125, "epoch": 0.6596736596736597, "grad_norm": 0.6723991632461548, "kl": 0.146240234375, "learning_rate": 7.801087801087801e-07, "loss": 0.0001, "reward": 0.4375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.25, "step": 566 }, { "clip_ratio": 0.0, "completion_length": 877.5625, "epoch": 0.6608391608391608, "grad_norm": 0.9988238215446472, "kl": 0.1976318359375, "learning_rate": 7.797202797202797e-07, "loss": 0.0002, "reward": 1.59375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.59375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.3333333333333333, "step": 567 }, { "clip_ratio": 0.0, "completion_length": 1152.0, "epoch": 0.662004662004662, "grad_norm": 0.013030829839408398, "kl": 0.136962890625, "learning_rate": 7.793317793317793e-07, "loss": 0.0001, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 568 }, { "clip_ratio": 0.0, "completion_length": 601.6875, "epoch": 0.6631701631701632, "grad_norm": 0.030429678037762642, "kl": 0.3330078125, "learning_rate": 7.789432789432789e-07, "loss": 0.0003, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.5, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 569 }, { "clip_ratio": 0.0, "completion_length": 965.84375, "epoch": 0.6643356643356644, "grad_norm": 1.75020170211792, "kl": 0.2178955078125, "learning_rate": 7.785547785547784e-07, "loss": 0.0002, "reward": 1.6875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 570 }, { "clip_ratio": 0.0, "completion_length": 679.75, "epoch": 0.6655011655011654, "grad_norm": 0.7702856063842773, "kl": 0.30419921875, "learning_rate": 7.781662781662782e-07, "loss": 0.0003, "reward": 1.03125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 571 }, { "clip_ratio": 0.0, "completion_length": 682.375, "epoch": 0.6666666666666666, "grad_norm": 5.100589275360107, "kl": 0.2373046875, "learning_rate": 7.777777777777778e-07, "loss": 0.0002, "reward": 1.0625, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.25, "step": 572 }, { "clip_ratio": 0.0, "completion_length": 906.46875, "epoch": 0.6678321678321678, "grad_norm": 0.7339485883712769, "kl": 0.190185546875, "learning_rate": 7.773892773892774e-07, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 573 }, { "clip_ratio": 0.0, "completion_length": 971.09375, "epoch": 0.668997668997669, "grad_norm": 52.588199615478516, "kl": 0.2080078125, "learning_rate": 7.770007770007769e-07, "loss": 0.0002, "reward": 1.40625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "step": 574 }, { "clip_ratio": 0.0, "completion_length": 1178.875, "epoch": 0.6701631701631702, "grad_norm": 3.7292251586914062, "kl": 0.177734375, "learning_rate": 7.766122766122766e-07, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 575 }, { "clip_ratio": 0.0, "completion_length": 1073.34375, "epoch": 0.6713286713286714, "grad_norm": 0.3448844850063324, "kl": 0.1785888671875, "learning_rate": 7.762237762237762e-07, "loss": 0.0002, "reward": 1.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 576 }, { "clip_ratio": 0.0, "completion_length": 1127.53125, "epoch": 0.6724941724941725, "grad_norm": 0.8948142528533936, "kl": 0.3583984375, "learning_rate": 7.758352758352757e-07, "loss": 0.0004, "reward": 1.25, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 577 }, { "clip_ratio": 0.0, "completion_length": 524.28125, "epoch": 0.6736596736596736, "grad_norm": 28.04563331604004, "kl": 0.335205078125, "learning_rate": 7.754467754467754e-07, "loss": 0.0003, "reward": 1.5, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.40625, "rewards/comparison_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 578 }, { "clip_ratio": 0.0, "completion_length": 1136.9375, "epoch": 0.6748251748251748, "grad_norm": 0.7669429779052734, "kl": 0.1875, "learning_rate": 7.750582750582751e-07, "loss": 0.0002, "reward": 1.03125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 579 }, { "clip_ratio": 0.0, "completion_length": 1215.34375, "epoch": 0.675990675990676, "grad_norm": 0.5905019640922546, "kl": 0.151123046875, "learning_rate": 7.746697746697747e-07, "loss": 0.0002, "reward": 0.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.5, "step": 580 }, { "clip_ratio": 0.0, "completion_length": 1275.71875, "epoch": 0.6771561771561772, "grad_norm": 0.01109304465353489, "kl": 0.144775390625, "learning_rate": 7.742812742812742e-07, "loss": 0.0001, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.25, "rewards/format_reward": 0.75, "step": 581 }, { "clip_ratio": 0.0, "completion_length": 1288.1875, "epoch": 0.6783216783216783, "grad_norm": 0.5769696235656738, "kl": 0.1455078125, "learning_rate": 7.738927738927738e-07, "loss": 0.0001, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 582 }, { "clip_ratio": 0.0, "completion_length": 818.9375, "epoch": 0.6794871794871795, "grad_norm": 0.013860102742910385, "kl": 0.2470703125, "learning_rate": 7.735042735042735e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 583 }, { "clip_ratio": 0.0, "completion_length": 734.625, "epoch": 0.6806526806526807, "grad_norm": 2.630621910095215, "kl": 0.2562255859375, "learning_rate": 7.73115773115773e-07, "loss": 0.0003, "reward": 1.8125, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.59375, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 584 }, { "clip_ratio": 0.0, "completion_length": 642.875, "epoch": 0.6818181818181818, "grad_norm": 6.29322624206543, "kl": 0.25634765625, "learning_rate": 7.727272727272727e-07, "loss": 0.0003, "reward": 1.8125, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.40625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 585 }, { "clip_ratio": 0.0, "completion_length": 426.84375, "epoch": 0.682983682983683, "grad_norm": 2.214247703552246, "kl": 0.33642578125, "learning_rate": 7.723387723387723e-07, "loss": 0.0003, "reward": 1.65625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.46875, "rewards/comparison_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 586 }, { "clip_ratio": 0.0, "completion_length": 848.46875, "epoch": 0.6841491841491841, "grad_norm": 12.034211158752441, "kl": 0.2359619140625, "learning_rate": 7.71950271950272e-07, "loss": 0.0002, "reward": 1.125, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 587 }, { "clip_ratio": 0.0, "completion_length": 1013.9375, "epoch": 0.6853146853146853, "grad_norm": 1.121408462524414, "kl": 0.18359375, "learning_rate": 7.715617715617715e-07, "loss": 0.0002, "reward": 0.875, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 588 }, { "clip_ratio": 0.0, "completion_length": 1056.59375, "epoch": 0.6864801864801865, "grad_norm": 0.5731378197669983, "kl": 0.173095703125, "learning_rate": 7.711732711732711e-07, "loss": 0.0002, "reward": 0.6875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 589 }, { "clip_ratio": 0.0, "completion_length": 967.6875, "epoch": 0.6876456876456877, "grad_norm": 0.01123163104057312, "kl": 0.1771240234375, "learning_rate": 7.707847707847707e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 590 }, { "clip_ratio": 0.0, "completion_length": 778.125, "epoch": 0.6888111888111889, "grad_norm": 3.333902359008789, "kl": 0.2373046875, "learning_rate": 7.703962703962703e-07, "loss": 0.0002, "reward": 1.03125, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 591 }, { "clip_ratio": 0.0, "completion_length": 1175.46875, "epoch": 0.6899766899766899, "grad_norm": 0.006293183658272028, "kl": 0.14404296875, "learning_rate": 7.7000777000777e-07, "loss": 0.0001, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 592 }, { "clip_ratio": 0.0, "completion_length": 1007.34375, "epoch": 0.6911421911421911, "grad_norm": 0.9260781407356262, "kl": 0.1900634765625, "learning_rate": 7.696192696192696e-07, "loss": 0.0002, "reward": 1.28125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 593 }, { "clip_ratio": 0.0, "completion_length": 876.6875, "epoch": 0.6923076923076923, "grad_norm": 4.440556526184082, "kl": 0.247802734375, "learning_rate": 7.692307692307693e-07, "loss": 0.0002, "reward": 1.46875, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 594 }, { "clip_ratio": 0.0, "completion_length": 856.9375, "epoch": 0.6934731934731935, "grad_norm": 1.3199858665466309, "kl": 0.173095703125, "learning_rate": 7.688422688422688e-07, "loss": 0.0002, "reward": 1.15625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.6666666666666667, "step": 595 }, { "clip_ratio": 0.0, "completion_length": 870.0625, "epoch": 0.6946386946386947, "grad_norm": 1.2469093799591064, "kl": 0.2060546875, "learning_rate": 7.684537684537684e-07, "loss": 0.0002, "reward": 1.46875, "reward_std": 0.3198433741927147, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.3333333333333333, "step": 596 }, { "clip_ratio": 0.0, "completion_length": 1127.0, "epoch": 0.6958041958041958, "grad_norm": 1.0692098140716553, "kl": 0.1434326171875, "learning_rate": 7.68065268065268e-07, "loss": 0.0001, "reward": 1.34375, "reward_std": 0.3061639815568924, "rewards/accuracy_reward": 0.59375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 597 }, { "clip_ratio": 0.0, "completion_length": 1221.59375, "epoch": 0.696969696969697, "grad_norm": 0.9245480895042419, "kl": 0.1361083984375, "learning_rate": 7.676767676767675e-07, "loss": 0.0001, "reward": 1.875, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.90625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 0.96875, "rewards/score_task": 2.0, "step": 598 }, { "clip_ratio": 0.0, "completion_length": 688.25, "epoch": 0.6981351981351981, "grad_norm": 6.174561023712158, "kl": 0.2939453125, "learning_rate": 7.672882672882673e-07, "loss": 0.0003, "reward": 1.34375, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 599 }, { "clip_ratio": 0.0, "completion_length": 1058.625, "epoch": 0.6993006993006993, "grad_norm": 1.1914880275726318, "kl": 0.160400390625, "learning_rate": 7.668997668997669e-07, "loss": 0.0002, "reward": 1.0625, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 600 }, { "clip_ratio": 0.0, "completion_length": 1053.25, "epoch": 0.7004662004662005, "grad_norm": 0.7481443285942078, "kl": 0.166748046875, "learning_rate": 7.665112665112665e-07, "loss": 0.0002, "reward": 1.0625, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 601 }, { "clip_ratio": 0.0, "completion_length": 774.84375, "epoch": 0.7016317016317016, "grad_norm": 3.333040475845337, "kl": 0.255615234375, "learning_rate": 7.66122766122766e-07, "loss": 0.0003, "reward": 1.5, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 602 }, { "clip_ratio": 0.0, "completion_length": 1094.125, "epoch": 0.7027972027972028, "grad_norm": 1.7898131608963013, "kl": 0.14990234375, "learning_rate": 7.657342657342657e-07, "loss": 0.0001, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.25, "rewards/format_reward": 0.75, "step": 603 }, { "clip_ratio": 0.0, "completion_length": 1077.71875, "epoch": 0.703962703962704, "grad_norm": 0.009560990147292614, "kl": 0.1513671875, "learning_rate": 7.653457653457653e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 604 }, { "clip_ratio": 0.0, "completion_length": 1232.71875, "epoch": 0.7051282051282052, "grad_norm": 0.00639670854434371, "kl": 0.14013671875, "learning_rate": 7.649572649572648e-07, "loss": 0.0001, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.25, "rewards/format_reward": 0.75, "step": 605 }, { "clip_ratio": 0.0, "completion_length": 1054.5625, "epoch": 0.7062937062937062, "grad_norm": 1.1477910280227661, "kl": 0.156005859375, "learning_rate": 7.645687645687646e-07, "loss": 0.0002, "reward": 1.59375, "reward_std": 0.3061639815568924, "rewards/accuracy_reward": 0.59375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 606 }, { "clip_ratio": 0.0, "completion_length": 1225.4375, "epoch": 0.7074592074592074, "grad_norm": 0.014238639734685421, "kl": 0.1444091796875, "learning_rate": 7.641802641802642e-07, "loss": 0.0001, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 0.75, "step": 607 }, { "clip_ratio": 0.0, "completion_length": 984.03125, "epoch": 0.7086247086247086, "grad_norm": 1.0076392889022827, "kl": 0.15625, "learning_rate": 7.637917637917638e-07, "loss": 0.0002, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.71875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 608 }, { "clip_ratio": 0.0, "completion_length": 1108.5625, "epoch": 0.7097902097902098, "grad_norm": 1.2032830715179443, "kl": 0.14794921875, "learning_rate": 7.634032634032633e-07, "loss": 0.0001, "reward": 0.90625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 609 }, { "clip_ratio": 0.0, "completion_length": 983.21875, "epoch": 0.710955710955711, "grad_norm": 0.8670274615287781, "kl": 0.15625, "learning_rate": 7.63014763014763e-07, "loss": 0.0002, "reward": 1.59375, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.59375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 610 }, { "clip_ratio": 0.0, "completion_length": 902.53125, "epoch": 0.7121212121212122, "grad_norm": 0.7938196063041687, "kl": 0.214111328125, "learning_rate": 7.626262626262626e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "step": 611 }, { "clip_ratio": 0.0, "completion_length": 1094.25, "epoch": 0.7132867132867133, "grad_norm": 0.7618253231048584, "kl": 0.156494140625, "learning_rate": 7.622377622377621e-07, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.25, "rewards/format_reward": 0.75, "step": 612 }, { "clip_ratio": 0.0, "completion_length": 868.3125, "epoch": 0.7144522144522144, "grad_norm": 7.783769130706787, "kl": 0.2197265625, "learning_rate": 7.618492618492618e-07, "loss": 0.0002, "reward": 1.4375, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 613 }, { "clip_ratio": 0.0, "completion_length": 802.25, "epoch": 0.7156177156177156, "grad_norm": 10.875487327575684, "kl": 0.252197265625, "learning_rate": 7.614607614607615e-07, "loss": 0.0003, "reward": 1.28125, "reward_std": 0.4218914955854416, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.71875, "rewards/score_task": 2.0, "step": 614 }, { "clip_ratio": 0.0, "completion_length": 906.15625, "epoch": 0.7167832167832168, "grad_norm": 6.612956523895264, "kl": 0.2578125, "learning_rate": 7.610722610722611e-07, "loss": 0.0003, "reward": 1.34375, "reward_std": 0.3198433741927147, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "step": 615 }, { "clip_ratio": 0.0, "completion_length": 695.09375, "epoch": 0.717948717948718, "grad_norm": 8.516605377197266, "kl": 0.260986328125, "learning_rate": 7.606837606837606e-07, "loss": 0.0003, "reward": 1.03125, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 616 }, { "clip_ratio": 0.0, "completion_length": 583.25, "epoch": 0.7191142191142191, "grad_norm": 4.162206649780273, "kl": 0.424072265625, "learning_rate": 7.602952602952602e-07, "loss": 0.0004, "reward": 1.875, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.40625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "step": 617 }, { "clip_ratio": 0.0, "completion_length": 1165.0625, "epoch": 0.7202797202797203, "grad_norm": 0.011927187442779541, "kl": 0.13525390625, "learning_rate": 7.599067599067599e-07, "loss": 0.0001, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.25, "step": 618 }, { "clip_ratio": 0.0, "completion_length": 835.25, "epoch": 0.7214452214452215, "grad_norm": 1.8526803255081177, "kl": 0.15966796875, "learning_rate": 7.595182595182594e-07, "loss": 0.0002, "reward": 1.375, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 619 }, { "clip_ratio": 0.0, "completion_length": 874.8125, "epoch": 0.7226107226107226, "grad_norm": 0.00842986349016428, "kl": 0.14453125, "learning_rate": 7.591297591297591e-07, "loss": 0.0001, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.3333333333333333, "step": 620 }, { "clip_ratio": 0.0, "completion_length": 779.03125, "epoch": 0.7237762237762237, "grad_norm": 1.4484702348709106, "kl": 0.27197265625, "learning_rate": 7.587412587412587e-07, "loss": 0.0003, "reward": 1.34375, "reward_std": 0.22201896458864212, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 621 }, { "clip_ratio": 0.0, "completion_length": 863.40625, "epoch": 0.7249417249417249, "grad_norm": 0.04689738526940346, "kl": 0.277587890625, "learning_rate": 7.583527583527584e-07, "loss": 0.0003, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "step": 622 }, { "clip_ratio": 0.0, "completion_length": 1047.53125, "epoch": 0.7261072261072261, "grad_norm": 0.7426894307136536, "kl": 0.1552734375, "learning_rate": 7.579642579642579e-07, "loss": 0.0002, "reward": 0.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 623 }, { "clip_ratio": 0.0, "completion_length": 806.375, "epoch": 0.7272727272727273, "grad_norm": 4.125370025634766, "kl": 0.237548828125, "learning_rate": 7.575757575757575e-07, "loss": 0.0002, "reward": 1.4375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 624 }, { "clip_ratio": 0.0, "completion_length": 776.625, "epoch": 0.7284382284382285, "grad_norm": 1.1329137086868286, "kl": 0.279296875, "learning_rate": 7.571872571872571e-07, "loss": 0.0003, "reward": 1.3125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 625 }, { "clip_ratio": 0.0, "completion_length": 863.03125, "epoch": 0.7296037296037297, "grad_norm": 0.010929190553724766, "kl": 0.255615234375, "learning_rate": 7.567987567987567e-07, "loss": 0.0003, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.5, "step": 626 }, { "clip_ratio": 0.0, "completion_length": 778.4375, "epoch": 0.7307692307692307, "grad_norm": 0.9117987155914307, "kl": 0.228271484375, "learning_rate": 7.564102564102564e-07, "loss": 0.0002, "reward": 1.3125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 627 }, { "clip_ratio": 0.0, "completion_length": 749.59375, "epoch": 0.7319347319347319, "grad_norm": 5.2688679695129395, "kl": 0.31005859375, "learning_rate": 7.56021756021756e-07, "loss": 0.0003, "reward": 1.71875, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 628 }, { "clip_ratio": 0.0, "completion_length": 1261.625, "epoch": 0.7331002331002331, "grad_norm": 1.1351507902145386, "kl": 0.136962890625, "learning_rate": 7.556332556332556e-07, "loss": 0.0001, "reward": 1.625, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "step": 629 }, { "clip_ratio": 0.0, "completion_length": 1034.375, "epoch": 0.7342657342657343, "grad_norm": 1.324363112449646, "kl": 0.14892578125, "learning_rate": 7.552447552447552e-07, "loss": 0.0001, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 630 }, { "clip_ratio": 0.0, "completion_length": 1205.125, "epoch": 0.7354312354312355, "grad_norm": 0.0056030042469501495, "kl": 0.140869140625, "learning_rate": 7.548562548562548e-07, "loss": 0.0001, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 631 }, { "clip_ratio": 0.0, "completion_length": 1058.40625, "epoch": 0.7365967365967366, "grad_norm": 1.1737146377563477, "kl": 0.14208984375, "learning_rate": 7.544677544677544e-07, "loss": 0.0001, "reward": 1.125, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 632 }, { "clip_ratio": 0.0, "completion_length": 800.46875, "epoch": 0.7377622377622378, "grad_norm": 6.966175079345703, "kl": 0.28369140625, "learning_rate": 7.540792540792539e-07, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 633 }, { "clip_ratio": 0.0, "completion_length": 864.1875, "epoch": 0.7389277389277389, "grad_norm": 0.696052074432373, "kl": 0.159912109375, "learning_rate": 7.536907536907537e-07, "loss": 0.0002, "reward": 0.9375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 634 }, { "clip_ratio": 0.0, "completion_length": 866.34375, "epoch": 0.7400932400932401, "grad_norm": 1.5207942724227905, "kl": 0.169921875, "learning_rate": 7.533022533022533e-07, "loss": 0.0002, "reward": 1.125, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 635 }, { "clip_ratio": 0.0, "completion_length": 626.5625, "epoch": 0.7412587412587412, "grad_norm": 1.6932504177093506, "kl": 0.33740234375, "learning_rate": 7.529137529137529e-07, "loss": 0.0003, "reward": 1.6875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 636 }, { "clip_ratio": 0.0, "completion_length": 709.53125, "epoch": 0.7424242424242424, "grad_norm": 1.319443941116333, "kl": 0.291259765625, "learning_rate": 7.525252525252524e-07, "loss": 0.0003, "reward": 1.40625, "reward_std": 0.22201896458864212, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 637 }, { "clip_ratio": 0.0, "completion_length": 638.8125, "epoch": 0.7435897435897436, "grad_norm": 0.014507940970361233, "kl": 0.3466796875, "learning_rate": 7.521367521367521e-07, "loss": 0.0003, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 638 }, { "clip_ratio": 0.0, "completion_length": 987.71875, "epoch": 0.7447552447552448, "grad_norm": 0.006720430217683315, "kl": 0.15625, "learning_rate": 7.517482517482517e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 639 }, { "clip_ratio": 0.0, "completion_length": 841.375, "epoch": 0.745920745920746, "grad_norm": 0.8048001527786255, "kl": 0.3564453125, "learning_rate": 7.513597513597512e-07, "loss": 0.0004, "reward": 1.15625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "step": 640 }, { "clip_ratio": 0.0, "completion_length": 951.96875, "epoch": 0.747086247086247, "grad_norm": 1.3636887073516846, "kl": 0.146728515625, "learning_rate": 7.50971250971251e-07, "loss": 0.0001, "reward": 1.46875, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 641 }, { "clip_ratio": 0.0, "completion_length": 922.0, "epoch": 0.7482517482517482, "grad_norm": 0.8551910519599915, "kl": 0.152587890625, "learning_rate": 7.505827505827506e-07, "loss": 0.0002, "reward": 1.8125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.8125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 642 }, { "clip_ratio": 0.0, "completion_length": 896.09375, "epoch": 0.7494172494172494, "grad_norm": 1.3925578594207764, "kl": 0.164794921875, "learning_rate": 7.501942501942502e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.26726123690605164, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 643 }, { "clip_ratio": 0.0, "completion_length": 918.125, "epoch": 0.7505827505827506, "grad_norm": 0.6583772897720337, "kl": 0.167236328125, "learning_rate": 7.498057498057497e-07, "loss": 0.0002, "reward": 0.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 644 }, { "clip_ratio": 0.0, "completion_length": 903.28125, "epoch": 0.7517482517482518, "grad_norm": 1.4157148599624634, "kl": 0.154052734375, "learning_rate": 7.494172494172494e-07, "loss": 0.0002, "reward": 1.375, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 645 }, { "clip_ratio": 0.0, "completion_length": 630.625, "epoch": 0.752913752913753, "grad_norm": 9.642718315124512, "kl": 0.40966796875, "learning_rate": 7.49028749028749e-07, "loss": 0.0004, "reward": 1.59375, "reward_std": 0.2773705795407295, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.375, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 0.96875, "step": 646 }, { "clip_ratio": 0.0, "completion_length": 1031.53125, "epoch": 0.754079254079254, "grad_norm": 1.3938920497894287, "kl": 0.151123046875, "learning_rate": 7.486402486402487e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 647 }, { "clip_ratio": 0.0, "completion_length": 944.0625, "epoch": 0.7552447552447552, "grad_norm": 1.9484246969223022, "kl": 0.164794921875, "learning_rate": 7.482517482517482e-07, "loss": 0.0002, "reward": 1.59375, "reward_std": 0.22201896458864212, "rewards/accuracy_reward": 0.59375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 648 }, { "clip_ratio": 0.0, "completion_length": 1162.75, "epoch": 0.7564102564102564, "grad_norm": 1.2813862562179565, "kl": 0.1484375, "learning_rate": 7.478632478632479e-07, "loss": 0.0001, "reward": 1.40625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.65625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 0.75, "step": 649 }, { "clip_ratio": 0.0, "completion_length": 898.0, "epoch": 0.7575757575757576, "grad_norm": 0.6539657711982727, "kl": 0.315673828125, "learning_rate": 7.474747474747475e-07, "loss": 0.0003, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "step": 650 }, { "clip_ratio": 0.0, "completion_length": 718.46875, "epoch": 0.7587412587412588, "grad_norm": 4.287980079650879, "kl": 0.2705078125, "learning_rate": 7.47086247086247e-07, "loss": 0.0003, "reward": 1.71875, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 651 }, { "clip_ratio": 0.0, "completion_length": 1030.96875, "epoch": 0.7599067599067599, "grad_norm": 0.008999990299344063, "kl": 0.159423828125, "learning_rate": 7.466977466977466e-07, "loss": 0.0002, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 652 }, { "clip_ratio": 0.0, "completion_length": 908.84375, "epoch": 0.7610722610722611, "grad_norm": 5.179237365722656, "kl": 0.328857421875, "learning_rate": 7.463092463092463e-07, "loss": 0.0003, "reward": 1.875, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.6875, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "step": 653 }, { "clip_ratio": 0.0, "completion_length": 1131.96875, "epoch": 0.7622377622377622, "grad_norm": 2.231149435043335, "kl": 0.1375732421875, "learning_rate": 7.45920745920746e-07, "loss": 0.0001, "reward": 0.96875, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 654 }, { "clip_ratio": 0.0, "completion_length": 901.15625, "epoch": 0.7634032634032634, "grad_norm": 0.015412844717502594, "kl": 0.2796630859375, "learning_rate": 7.455322455322455e-07, "loss": 0.0003, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "step": 655 }, { "clip_ratio": 0.0, "completion_length": 585.1875, "epoch": 0.7645687645687645, "grad_norm": 5.196659564971924, "kl": 0.5029296875, "learning_rate": 7.451437451437451e-07, "loss": 0.0005, "reward": 1.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.4375, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 656 }, { "clip_ratio": 0.0, "completion_length": 939.75, "epoch": 0.7657342657342657, "grad_norm": 0.012748613022267818, "kl": 0.238037109375, "learning_rate": 7.447552447552448e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "step": 657 }, { "clip_ratio": 0.0, "completion_length": 1039.25, "epoch": 0.7668997668997669, "grad_norm": 0.6527419686317444, "kl": 0.164794921875, "learning_rate": 7.443667443667443e-07, "loss": 0.0002, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.71875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 658 }, { "clip_ratio": 0.0, "completion_length": 1015.4375, "epoch": 0.7680652680652681, "grad_norm": 1.2534804344177246, "kl": 0.167236328125, "learning_rate": 7.439782439782439e-07, "loss": 0.0002, "reward": 1.53125, "reward_std": 0.3196365684270859, "rewards/accuracy_reward": 0.5625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 659 }, { "clip_ratio": 0.0, "completion_length": 1170.3125, "epoch": 0.7692307692307693, "grad_norm": 5.844786643981934, "kl": 0.152099609375, "learning_rate": 7.435897435897435e-07, "loss": 0.0002, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.71875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 0.75, "step": 660 }, { "clip_ratio": 0.0, "completion_length": 711.78125, "epoch": 0.7703962703962703, "grad_norm": 2.9558358192443848, "kl": 0.2236328125, "learning_rate": 7.432012432012433e-07, "loss": 0.0002, "reward": 1.40625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 661 }, { "clip_ratio": 0.0, "completion_length": 817.75, "epoch": 0.7715617715617715, "grad_norm": 0.02926255203783512, "kl": 0.1787109375, "learning_rate": 7.428127428127428e-07, "loss": 0.0002, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 662 }, { "clip_ratio": 0.0, "completion_length": 907.59375, "epoch": 0.7727272727272727, "grad_norm": 3.2081515789031982, "kl": 0.16259765625, "learning_rate": 7.424242424242424e-07, "loss": 0.0002, "reward": 1.15625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 663 }, { "clip_ratio": 0.0, "completion_length": 786.65625, "epoch": 0.7738927738927739, "grad_norm": 0.7195518612861633, "kl": 0.22607421875, "learning_rate": 7.42035742035742e-07, "loss": 0.0002, "reward": 0.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.5, "step": 664 }, { "clip_ratio": 0.0, "completion_length": 960.53125, "epoch": 0.7750582750582751, "grad_norm": 0.0065539684146642685, "kl": 0.153564453125, "learning_rate": 7.416472416472416e-07, "loss": 0.0002, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 665 }, { "clip_ratio": 0.0, "completion_length": 592.96875, "epoch": 0.7762237762237763, "grad_norm": 2.305684804916382, "kl": 0.2548828125, "learning_rate": 7.412587412587412e-07, "loss": 0.0003, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.46875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 666 }, { "clip_ratio": 0.0, "completion_length": 1079.4375, "epoch": 0.7773892773892774, "grad_norm": 0.9634450078010559, "kl": 0.150634765625, "learning_rate": 7.408702408702408e-07, "loss": 0.0002, "reward": 0.90625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 667 }, { "clip_ratio": 0.0, "completion_length": 839.53125, "epoch": 0.7785547785547785, "grad_norm": 5.74575662612915, "kl": 0.224609375, "learning_rate": 7.404817404817405e-07, "loss": 0.0002, "reward": 1.8125, "reward_std": 0.408231720328331, "rewards/accuracy_reward": 0.625, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "step": 668 }, { "clip_ratio": 0.0, "completion_length": 861.90625, "epoch": 0.7797202797202797, "grad_norm": 2.546177625656128, "kl": 0.211669921875, "learning_rate": 7.400932400932401e-07, "loss": 0.0002, "reward": 1.65625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "step": 669 }, { "clip_ratio": 0.0, "completion_length": 770.3125, "epoch": 0.7808857808857809, "grad_norm": 1.210048794746399, "kl": 0.16015625, "learning_rate": 7.397047397047397e-07, "loss": 0.0002, "reward": 1.875, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 670 }, { "clip_ratio": 0.0, "completion_length": 875.78125, "epoch": 0.782051282051282, "grad_norm": 0.9919779300689697, "kl": 0.157470703125, "learning_rate": 7.393162393162393e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 671 }, { "clip_ratio": 0.0, "completion_length": 698.5625, "epoch": 0.7832167832167832, "grad_norm": 0.014537764713168144, "kl": 0.221435546875, "learning_rate": 7.389277389277388e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 672 }, { "clip_ratio": 0.0, "completion_length": 853.65625, "epoch": 0.7843822843822844, "grad_norm": 0.813754677772522, "kl": 0.16455078125, "learning_rate": 7.385392385392385e-07, "loss": 0.0002, "reward": 1.4375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 673 }, { "clip_ratio": 0.0, "completion_length": 602.15625, "epoch": 0.7855477855477856, "grad_norm": 23.613248825073242, "kl": 0.2373046875, "learning_rate": 7.381507381507381e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 674 }, { "clip_ratio": 0.0, "completion_length": 809.65625, "epoch": 0.7867132867132867, "grad_norm": 4.274684429168701, "kl": 0.218017578125, "learning_rate": 7.377622377622378e-07, "loss": 0.0002, "reward": 1.15625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 675 }, { "clip_ratio": 0.0, "completion_length": 584.8125, "epoch": 0.7878787878787878, "grad_norm": 1.630208969116211, "kl": 0.287353515625, "learning_rate": 7.373737373737373e-07, "loss": 0.0003, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 676 }, { "clip_ratio": 0.0, "completion_length": 955.09375, "epoch": 0.789044289044289, "grad_norm": 0.0055786012671887875, "kl": 0.14794921875, "learning_rate": 7.36985236985237e-07, "loss": 0.0001, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 677 }, { "clip_ratio": 0.0, "completion_length": 743.5625, "epoch": 0.7902097902097902, "grad_norm": 0.8336368799209595, "kl": 0.208984375, "learning_rate": 7.365967365967366e-07, "loss": 0.0002, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "step": 678 }, { "clip_ratio": 0.0, "completion_length": 506.34375, "epoch": 0.7913752913752914, "grad_norm": 2.798611640930176, "kl": 0.294189453125, "learning_rate": 7.362082362082361e-07, "loss": 0.0003, "reward": 1.4375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.5, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 679 }, { "clip_ratio": 0.0, "completion_length": 868.5625, "epoch": 0.7925407925407926, "grad_norm": 1.4701074361801147, "kl": 0.156494140625, "learning_rate": 7.358197358197357e-07, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 680 }, { "clip_ratio": 0.0, "completion_length": 723.90625, "epoch": 0.7937062937062938, "grad_norm": 0.0054223500192165375, "kl": 0.158203125, "learning_rate": 7.354312354312354e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 681 }, { "clip_ratio": 0.0, "completion_length": 784.5625, "epoch": 0.7948717948717948, "grad_norm": 0.021164558827877045, "kl": 0.239990234375, "learning_rate": 7.350427350427351e-07, "loss": 0.0002, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.5, "step": 682 }, { "clip_ratio": 0.0, "completion_length": 955.6875, "epoch": 0.796037296037296, "grad_norm": 0.038620855659246445, "kl": 0.201171875, "learning_rate": 7.346542346542346e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 683 }, { "clip_ratio": 0.0, "completion_length": 573.4375, "epoch": 0.7972027972027972, "grad_norm": 1.0636168718338013, "kl": 0.246337890625, "learning_rate": 7.342657342657343e-07, "loss": 0.0002, "reward": 1.6875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 684 }, { "clip_ratio": 0.0, "completion_length": 520.28125, "epoch": 0.7983682983682984, "grad_norm": 0.03629143163561821, "kl": 0.35595703125, "learning_rate": 7.338772338772339e-07, "loss": 0.0004, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.5, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.5, "step": 685 }, { "clip_ratio": 0.0, "completion_length": 735.46875, "epoch": 0.7995337995337995, "grad_norm": 1.3771523237228394, "kl": 0.169189453125, "learning_rate": 7.334887334887334e-07, "loss": 0.0002, "reward": 1.28125, "reward_std": 0.2630179077386856, "rewards/accuracy_reward": 0.53125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 686 }, { "clip_ratio": 0.0, "completion_length": 1059.65625, "epoch": 0.8006993006993007, "grad_norm": 1.2005177736282349, "kl": 0.143310546875, "learning_rate": 7.33100233100233e-07, "loss": 0.0001, "reward": 1.40625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.65625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 0.75, "step": 687 }, { "clip_ratio": 0.0, "completion_length": 694.0625, "epoch": 0.8018648018648019, "grad_norm": 0.007666144985705614, "kl": 0.2294921875, "learning_rate": 7.327117327117327e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.5, "rewards/score_task": 2.0, "step": 688 }, { "clip_ratio": 0.0, "completion_length": 662.4375, "epoch": 0.803030303030303, "grad_norm": 2.867168426513672, "kl": 0.218994140625, "learning_rate": 7.323232323232324e-07, "loss": 0.0002, "reward": 1.65625, "reward_std": 0.3808925524353981, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 689 }, { "clip_ratio": 0.0, "completion_length": 751.9375, "epoch": 0.8041958041958042, "grad_norm": 0.005651172250509262, "kl": 0.1640625, "learning_rate": 7.319347319347319e-07, "loss": 0.0002, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 690 }, { "clip_ratio": 0.0, "completion_length": 947.78125, "epoch": 0.8053613053613053, "grad_norm": 0.6616584062576294, "kl": 0.1689453125, "learning_rate": 7.315462315462315e-07, "loss": 0.0002, "reward": 0.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 691 }, { "clip_ratio": 0.0, "completion_length": 854.21875, "epoch": 0.8065268065268065, "grad_norm": 0.7518628239631653, "kl": 0.165771484375, "learning_rate": 7.311577311577312e-07, "loss": 0.0002, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.71875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 692 }, { "clip_ratio": 0.0, "completion_length": 832.03125, "epoch": 0.8076923076923077, "grad_norm": 2.1839871406555176, "kl": 0.169189453125, "learning_rate": 7.307692307692307e-07, "loss": 0.0002, "reward": 1.09375, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 693 }, { "clip_ratio": 0.0, "completion_length": 779.8125, "epoch": 0.8088578088578089, "grad_norm": 1.4836593866348267, "kl": 0.161376953125, "learning_rate": 7.303807303807303e-07, "loss": 0.0002, "reward": 0.96875, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 694 }, { "clip_ratio": 0.0, "completion_length": 235.9375, "epoch": 0.8100233100233101, "grad_norm": 5.666616439819336, "kl": 0.401611328125, "learning_rate": 7.299922299922299e-07, "loss": 0.0004, "reward": 1.8125, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.71875, "rewards/comparison_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 695 }, { "clip_ratio": 0.0, "completion_length": 813.75, "epoch": 0.8111888111888111, "grad_norm": 1.5051106214523315, "kl": 0.1591796875, "learning_rate": 7.296037296037297e-07, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 696 }, { "clip_ratio": 0.0, "completion_length": 548.34375, "epoch": 0.8123543123543123, "grad_norm": 2.8796939849853516, "kl": 0.25439453125, "learning_rate": 7.292152292152292e-07, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 697 }, { "clip_ratio": 0.0, "completion_length": 552.65625, "epoch": 0.8135198135198135, "grad_norm": 14.56486988067627, "kl": 0.287109375, "learning_rate": 7.288267288267288e-07, "loss": 0.0003, "reward": 1.375, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.4375, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 698 }, { "clip_ratio": 0.0, "completion_length": 360.84375, "epoch": 0.8146853146853147, "grad_norm": 11.257431030273438, "kl": 0.364501953125, "learning_rate": 7.284382284382284e-07, "loss": 0.0004, "reward": 1.5625, "reward_std": 0.3104073107242584, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.4375, "rewards/comparison_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 699 }, { "clip_ratio": 0.0, "completion_length": 756.15625, "epoch": 0.8158508158508159, "grad_norm": 0.004763707518577576, "kl": 0.16455078125, "learning_rate": 7.28049728049728e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 700 }, { "clip_ratio": 0.0, "completion_length": 662.25, "epoch": 0.817016317016317, "grad_norm": 3.6943492889404297, "kl": 0.237548828125, "learning_rate": 7.276612276612276e-07, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 701 }, { "clip_ratio": 0.0, "completion_length": 853.28125, "epoch": 0.8181818181818182, "grad_norm": 1.2634373903274536, "kl": 0.1630859375, "learning_rate": 7.272727272727272e-07, "loss": 0.0002, "reward": 1.3125, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.5625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 702 }, { "clip_ratio": 0.0, "completion_length": 716.375, "epoch": 0.8193473193473193, "grad_norm": 2.0205159187316895, "kl": 0.175048828125, "learning_rate": 7.268842268842269e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 703 }, { "clip_ratio": 0.0, "completion_length": 861.375, "epoch": 0.8205128205128205, "grad_norm": 0.010954114608466625, "kl": 0.15576171875, "learning_rate": 7.264957264957265e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 704 }, { "clip_ratio": 0.0, "completion_length": 627.46875, "epoch": 0.8216783216783217, "grad_norm": 1.4775036573410034, "kl": 0.212646484375, "learning_rate": 7.261072261072261e-07, "loss": 0.0002, "reward": 1.3125, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.5625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 705 }, { "clip_ratio": 0.0, "completion_length": 668.96875, "epoch": 0.8228438228438228, "grad_norm": 3.3256027698516846, "kl": 0.245849609375, "learning_rate": 7.257187257187257e-07, "loss": 0.0002, "reward": 1.3125, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 706 }, { "clip_ratio": 0.0, "completion_length": 662.78125, "epoch": 0.824009324009324, "grad_norm": 0.015195311047136784, "kl": 0.172607421875, "learning_rate": 7.253302253302252e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 707 }, { "clip_ratio": 0.0, "completion_length": 694.4375, "epoch": 0.8251748251748252, "grad_norm": 1.525161623954773, "kl": 0.2431640625, "learning_rate": 7.249417249417249e-07, "loss": 0.0002, "reward": 1.40625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "step": 708 }, { "clip_ratio": 0.0, "completion_length": 660.625, "epoch": 0.8263403263403264, "grad_norm": 1.7830888032913208, "kl": 0.174072265625, "learning_rate": 7.245532245532245e-07, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 709 }, { "clip_ratio": 0.0, "completion_length": 867.84375, "epoch": 0.8275058275058275, "grad_norm": 0.7535641193389893, "kl": 0.171630859375, "learning_rate": 7.241647241647242e-07, "loss": 0.0002, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.75, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.75, "rewards/format_reward": 0.96875, "step": 710 }, { "clip_ratio": 0.0, "completion_length": 936.1875, "epoch": 0.8286713286713286, "grad_norm": 1.091657280921936, "kl": 0.158203125, "learning_rate": 7.237762237762237e-07, "loss": 0.0002, "reward": 1.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "step": 711 }, { "clip_ratio": 0.0, "completion_length": 870.125, "epoch": 0.8298368298368298, "grad_norm": 1.1088162660598755, "kl": 0.165283203125, "learning_rate": 7.233877233877234e-07, "loss": 0.0002, "reward": 1.90625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.90625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 712 }, { "clip_ratio": 0.0, "completion_length": 903.28125, "epoch": 0.831002331002331, "grad_norm": 1.0778120756149292, "kl": 0.169189453125, "learning_rate": 7.22999222999223e-07, "loss": 0.0002, "reward": 1.65625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.65625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.75, "rewards/format_reward": 1.0, "step": 713 }, { "clip_ratio": 0.0, "completion_length": 523.03125, "epoch": 0.8321678321678322, "grad_norm": 0.006309511139988899, "kl": 0.188232421875, "learning_rate": 7.226107226107225e-07, "loss": 0.0002, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/comparison_reward": 0.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.75, "step": 714 }, { "clip_ratio": 0.0, "completion_length": 766.09375, "epoch": 0.8333333333333334, "grad_norm": 0.005810405593365431, "kl": 0.1708984375, "learning_rate": 7.222222222222221e-07, "loss": 0.0002, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 715 }, { "clip_ratio": 0.0, "completion_length": 903.21875, "epoch": 0.8344988344988346, "grad_norm": 0.9241288900375366, "kl": 0.164306640625, "learning_rate": 7.218337218337218e-07, "loss": 0.0002, "reward": 0.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.25, "step": 716 }, { "clip_ratio": 0.0, "completion_length": 726.71875, "epoch": 0.8356643356643356, "grad_norm": 0.8134835362434387, "kl": 0.296630859375, "learning_rate": 7.214452214452215e-07, "loss": 0.0003, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "step": 717 }, { "clip_ratio": 0.0, "completion_length": 870.8125, "epoch": 0.8368298368298368, "grad_norm": 0.00596932927146554, "kl": 0.17578125, "learning_rate": 7.21056721056721e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.5, "step": 718 }, { "clip_ratio": 0.0, "completion_length": 599.0, "epoch": 0.837995337995338, "grad_norm": 0.0080425338819623, "kl": 0.2626953125, "learning_rate": 7.206682206682206e-07, "loss": 0.0003, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 719 }, { "clip_ratio": 0.0, "completion_length": 742.3125, "epoch": 0.8391608391608392, "grad_norm": 1.3470560312271118, "kl": 0.1630859375, "learning_rate": 7.202797202797203e-07, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 720 }, { "clip_ratio": 0.0, "completion_length": 776.03125, "epoch": 0.8403263403263403, "grad_norm": 2.010561227798462, "kl": 0.17919921875, "learning_rate": 7.198912198912198e-07, "loss": 0.0002, "reward": 1.3125, "reward_std": 0.3335031494498253, "rewards/accuracy_reward": 0.5625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 721 }, { "clip_ratio": 0.0, "completion_length": 408.40625, "epoch": 0.8414918414918415, "grad_norm": 4.042581558227539, "kl": 0.3642578125, "learning_rate": 7.195027195027194e-07, "loss": 0.0004, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.46875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 722 }, { "clip_ratio": 0.0, "completion_length": 554.90625, "epoch": 0.8426573426573427, "grad_norm": 0.008478960953652859, "kl": 0.259033203125, "learning_rate": 7.19114219114219e-07, "loss": 0.0003, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 723 }, { "clip_ratio": 0.0, "completion_length": 528.6875, "epoch": 0.8438228438228438, "grad_norm": 12.484458923339844, "kl": 0.2783203125, "learning_rate": 7.187257187257188e-07, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 724 }, { "clip_ratio": 0.0, "completion_length": 508.59375, "epoch": 0.844988344988345, "grad_norm": 7.639255523681641, "kl": 0.314697265625, "learning_rate": 7.183372183372183e-07, "loss": 0.0003, "reward": 1.40625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.15625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 725 }, { "clip_ratio": 0.0, "completion_length": 712.71875, "epoch": 0.8461538461538461, "grad_norm": 3.3755671977996826, "kl": 2.701904296875, "learning_rate": 7.179487179487179e-07, "loss": 0.0027, "reward": 1.59375, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.59375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 726 }, { "clip_ratio": 0.0, "completion_length": 590.59375, "epoch": 0.8473193473193473, "grad_norm": 0.03442908078432083, "kl": 0.2841796875, "learning_rate": 7.175602175602176e-07, "loss": 0.0003, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 727 }, { "clip_ratio": 0.0, "completion_length": 833.65625, "epoch": 0.8484848484848485, "grad_norm": 0.8703358769416809, "kl": 0.1748046875, "learning_rate": 7.171717171717171e-07, "loss": 0.0002, "reward": 0.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 728 }, { "clip_ratio": 0.0, "completion_length": 780.96875, "epoch": 0.8496503496503497, "grad_norm": 1.854462742805481, "kl": 0.1669921875, "learning_rate": 7.167832167832167e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.3650856465101242, "rewards/accuracy_reward": 0.625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.625, "rewards/score_task": 2.0, "step": 729 }, { "clip_ratio": 0.0, "completion_length": 498.53125, "epoch": 0.8508158508158508, "grad_norm": 1.0333352088928223, "kl": 0.28271484375, "learning_rate": 7.163947163947164e-07, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.71875, "rewards/score_task": 2.0, "step": 730 }, { "clip_ratio": 0.0, "completion_length": 808.875, "epoch": 0.8519813519813519, "grad_norm": 0.009211122058331966, "kl": 0.161865234375, "learning_rate": 7.160062160062161e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 731 }, { "clip_ratio": 0.0, "completion_length": 835.1875, "epoch": 0.8531468531468531, "grad_norm": 1.7293682098388672, "kl": 0.16650390625, "learning_rate": 7.156177156177156e-07, "loss": 0.0002, "reward": 1.28125, "reward_std": 0.3061639815568924, "rewards/accuracy_reward": 0.53125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 732 }, { "clip_ratio": 0.0, "completion_length": 530.9375, "epoch": 0.8543123543123543, "grad_norm": 0.010255655273795128, "kl": 0.2607421875, "learning_rate": 7.152292152292152e-07, "loss": 0.0003, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 733 }, { "clip_ratio": 0.0, "completion_length": 860.65625, "epoch": 0.8554778554778555, "grad_norm": 1.469641089439392, "kl": 0.160888671875, "learning_rate": 7.148407148407148e-07, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 734 }, { "clip_ratio": 0.0, "completion_length": 727.71875, "epoch": 0.8566433566433567, "grad_norm": 0.7590431571006775, "kl": 0.169677734375, "learning_rate": 7.144522144522144e-07, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 735 }, { "clip_ratio": 0.0, "completion_length": 799.21875, "epoch": 0.8578088578088578, "grad_norm": 3.636434316635132, "kl": 0.17041015625, "learning_rate": 7.14063714063714e-07, "loss": 0.0002, "reward": 1.40625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 736 }, { "clip_ratio": 0.0, "completion_length": 467.78125, "epoch": 0.8589743589743589, "grad_norm": 7.931349277496338, "kl": 0.26953125, "learning_rate": 7.136752136752137e-07, "loss": 0.0003, "reward": 1.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 737 }, { "clip_ratio": 0.0, "completion_length": 427.5625, "epoch": 0.8601398601398601, "grad_norm": 2.9012033939361572, "kl": 0.24462890625, "learning_rate": 7.132867132867133e-07, "loss": 0.0002, "reward": 1.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 738 }, { "clip_ratio": 0.0, "completion_length": 828.53125, "epoch": 0.8613053613053613, "grad_norm": 1.1279710531234741, "kl": 0.171875, "learning_rate": 7.128982128982129e-07, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 739 }, { "clip_ratio": 0.0, "completion_length": 711.25, "epoch": 0.8624708624708625, "grad_norm": 0.005367544014006853, "kl": 0.1640625, "learning_rate": 7.125097125097125e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 740 }, { "clip_ratio": 0.0, "completion_length": 414.15625, "epoch": 0.8636363636363636, "grad_norm": 12.037847518920898, "kl": 0.344970703125, "learning_rate": 7.121212121212121e-07, "loss": 0.0003, "reward": 1.65625, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.4375, "rewards/comparison_task": 1.5, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 741 }, { "clip_ratio": 0.0, "completion_length": 847.0625, "epoch": 0.8648018648018648, "grad_norm": 0.010493535548448563, "kl": 0.177978515625, "learning_rate": 7.117327117327116e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.25, "rewards/format_reward": 0.75, "step": 742 }, { "clip_ratio": 0.0, "completion_length": 864.78125, "epoch": 0.865967365967366, "grad_norm": 0.005247770342975855, "kl": 0.154541015625, "learning_rate": 7.113442113442113e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 743 }, { "clip_ratio": 0.0, "completion_length": 718.96875, "epoch": 0.8671328671328671, "grad_norm": 3.1964099407196045, "kl": 0.281005859375, "learning_rate": 7.10955710955711e-07, "loss": 0.0003, "reward": 1.625, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.125, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "step": 744 }, { "clip_ratio": 0.0, "completion_length": 589.6875, "epoch": 0.8682983682983683, "grad_norm": 1.5372133255004883, "kl": 0.165771484375, "learning_rate": 7.105672105672106e-07, "loss": 0.0002, "reward": 1.9375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.9375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 745 }, { "clip_ratio": 0.0, "completion_length": 764.65625, "epoch": 0.8694638694638694, "grad_norm": 0.7307614684104919, "kl": 0.16455078125, "learning_rate": 7.101787101787101e-07, "loss": 0.0002, "reward": 0.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 746 }, { "clip_ratio": 0.0, "completion_length": 719.84375, "epoch": 0.8706293706293706, "grad_norm": 0.008959976024925709, "kl": 0.173095703125, "learning_rate": 7.097902097902098e-07, "loss": 0.0002, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 747 }, { "clip_ratio": 0.0, "completion_length": 621.65625, "epoch": 0.8717948717948718, "grad_norm": 6.0944437980651855, "kl": 0.492919921875, "learning_rate": 7.094017094017094e-07, "loss": 0.0005, "reward": 0.9375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 748 }, { "clip_ratio": 0.0, "completion_length": 825.1875, "epoch": 0.872960372960373, "grad_norm": 9.635035514831543, "kl": 0.159423828125, "learning_rate": 7.090132090132089e-07, "loss": 0.0002, "reward": 1.0625, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 749 }, { "clip_ratio": 0.0, "completion_length": 563.25, "epoch": 0.8741258741258742, "grad_norm": 0.0256084855645895, "kl": 0.22509765625, "learning_rate": 7.086247086247085e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.3333333333333333, "step": 750 }, { "clip_ratio": 0.0, "completion_length": 800.6875, "epoch": 0.8752913752913752, "grad_norm": 10.811359405517578, "kl": 0.169189453125, "learning_rate": 7.082362082362083e-07, "loss": 0.0002, "reward": 1.78125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.78125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 751 }, { "clip_ratio": 0.0, "completion_length": 605.1875, "epoch": 0.8764568764568764, "grad_norm": 16.251405715942383, "kl": 0.283203125, "learning_rate": 7.078477078477079e-07, "loss": 0.0003, "reward": 1.65625, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 752 }, { "clip_ratio": 0.0, "completion_length": 753.96875, "epoch": 0.8776223776223776, "grad_norm": 2.7918710708618164, "kl": 0.160888671875, "learning_rate": 7.074592074592074e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 753 }, { "clip_ratio": 0.0, "completion_length": 828.84375, "epoch": 0.8787878787878788, "grad_norm": 0.005217334721237421, "kl": 0.166259765625, "learning_rate": 7.07070707070707e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 754 }, { "clip_ratio": 0.0, "completion_length": 845.3125, "epoch": 0.87995337995338, "grad_norm": 1.1683642864227295, "kl": 0.161376953125, "learning_rate": 7.066822066822067e-07, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 755 }, { "clip_ratio": 0.0, "completion_length": 252.625, "epoch": 0.8811188811188811, "grad_norm": 4.557985782623291, "kl": 0.4423828125, "learning_rate": 7.062937062937062e-07, "loss": 0.0004, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.46875, "rewards/comparison_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 756 }, { "clip_ratio": 0.0, "completion_length": 771.59375, "epoch": 0.8822843822843823, "grad_norm": 1.1379913091659546, "kl": 0.155517578125, "learning_rate": 7.059052059052058e-07, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 757 }, { "clip_ratio": 0.0, "completion_length": 680.8125, "epoch": 0.8834498834498834, "grad_norm": 2.7860896587371826, "kl": 0.16064453125, "learning_rate": 7.055167055167055e-07, "loss": 0.0002, "reward": 1.53125, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.53125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 758 }, { "clip_ratio": 0.0, "completion_length": 460.28125, "epoch": 0.8846153846153846, "grad_norm": 6.383632183074951, "kl": 0.271240234375, "learning_rate": 7.051282051282052e-07, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 759 }, { "clip_ratio": 0.0, "completion_length": 643.78125, "epoch": 0.8857808857808858, "grad_norm": 4.491060256958008, "kl": 0.2939453125, "learning_rate": 7.047397047397047e-07, "loss": 0.0003, "reward": 1.1875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 760 }, { "clip_ratio": 0.0, "completion_length": 484.90625, "epoch": 0.8869463869463869, "grad_norm": 1.5185580253601074, "kl": 0.2734375, "learning_rate": 7.043512043512043e-07, "loss": 0.0003, "reward": 1.6875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 761 }, { "clip_ratio": 0.0, "completion_length": 742.125, "epoch": 0.8881118881118881, "grad_norm": 1.2458381652832031, "kl": 0.1806640625, "learning_rate": 7.03962703962704e-07, "loss": 0.0002, "reward": 0.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 762 }, { "clip_ratio": 0.0, "completion_length": 691.65625, "epoch": 0.8892773892773893, "grad_norm": 0.011595387011766434, "kl": 0.184814453125, "learning_rate": 7.035742035742035e-07, "loss": 0.0002, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 763 }, { "clip_ratio": 0.0, "completion_length": 727.09375, "epoch": 0.8904428904428905, "grad_norm": 1.1865665912628174, "kl": 0.170166015625, "learning_rate": 7.031857031857031e-07, "loss": 0.0002, "reward": 1.59375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.59375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 764 }, { "clip_ratio": 0.0, "completion_length": 524.71875, "epoch": 0.8916083916083916, "grad_norm": 1.2102693319320679, "kl": 0.32470703125, "learning_rate": 7.027972027972028e-07, "loss": 0.0003, "reward": 1.28125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.5, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 765 }, { "clip_ratio": 0.0, "completion_length": 479.5, "epoch": 0.8927738927738927, "grad_norm": 8.259383201599121, "kl": 0.262451171875, "learning_rate": 7.024087024087025e-07, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 766 }, { "clip_ratio": 0.0, "completion_length": 348.53125, "epoch": 0.8939393939393939, "grad_norm": 10.202848434448242, "kl": 0.333984375, "learning_rate": 7.02020202020202e-07, "loss": 0.0003, "reward": 1.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.4375, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 767 }, { "clip_ratio": 0.0, "completion_length": 524.46875, "epoch": 0.8951048951048951, "grad_norm": 5.8818230628967285, "kl": 0.396728515625, "learning_rate": 7.016317016317016e-07, "loss": 0.0004, "reward": 1.40625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.40625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 768 }, { "clip_ratio": 0.0, "completion_length": 598.0, "epoch": 0.8962703962703963, "grad_norm": 5.011071681976318, "kl": 0.258544921875, "learning_rate": 7.012432012432012e-07, "loss": 0.0003, "reward": 1.6875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 769 }, { "clip_ratio": 0.0, "completion_length": 657.59375, "epoch": 0.8974358974358975, "grad_norm": 0.007739373482763767, "kl": 0.170166015625, "learning_rate": 7.008547008547007e-07, "loss": 0.0002, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 770 }, { "clip_ratio": 0.0, "completion_length": 796.21875, "epoch": 0.8986013986013986, "grad_norm": 1.1717584133148193, "kl": 0.160400390625, "learning_rate": 7.004662004662004e-07, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 771 }, { "clip_ratio": 0.0, "completion_length": 566.65625, "epoch": 0.8997668997668997, "grad_norm": 0.01675744540989399, "kl": 0.2919921875, "learning_rate": 7.000777000777001e-07, "loss": 0.0003, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 772 }, { "clip_ratio": 0.0, "completion_length": 684.0, "epoch": 0.9009324009324009, "grad_norm": 0.006083598360419273, "kl": 0.18212890625, "learning_rate": 6.996891996891997e-07, "loss": 0.0002, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 773 }, { "clip_ratio": 0.0, "completion_length": 342.65625, "epoch": 0.9020979020979021, "grad_norm": 3.6522042751312256, "kl": 0.331298828125, "learning_rate": 6.993006993006993e-07, "loss": 0.0003, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.46875, "rewards/comparison_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 774 }, { "clip_ratio": 0.0, "completion_length": 521.375, "epoch": 0.9032634032634033, "grad_norm": 3.8508963584899902, "kl": 0.25341796875, "learning_rate": 6.989121989121989e-07, "loss": 0.0003, "reward": 1.25, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 775 }, { "clip_ratio": 0.0, "completion_length": 584.34375, "epoch": 0.9044289044289044, "grad_norm": 3.781874656677246, "kl": 0.255615234375, "learning_rate": 6.985236985236985e-07, "loss": 0.0003, "reward": 1.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 776 }, { "clip_ratio": 0.0, "completion_length": 422.90625, "epoch": 0.9055944055944056, "grad_norm": 1.357017159461975, "kl": 0.24951171875, "learning_rate": 6.98135198135198e-07, "loss": 0.0002, "reward": 1.0625, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 777 }, { "clip_ratio": 0.0, "completion_length": 468.5625, "epoch": 0.9067599067599068, "grad_norm": 13.684246063232422, "kl": 0.216796875, "learning_rate": 6.977466977466977e-07, "loss": 0.0002, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 778 }, { "clip_ratio": 0.0, "completion_length": 606.09375, "epoch": 0.9079254079254079, "grad_norm": 0.7557269334793091, "kl": 0.234619140625, "learning_rate": 6.973581973581974e-07, "loss": 0.0002, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 779 }, { "clip_ratio": 0.0, "completion_length": 606.53125, "epoch": 0.9090909090909091, "grad_norm": 0.013890265487134457, "kl": 0.25634765625, "learning_rate": 6.96969696969697e-07, "loss": 0.0003, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 780 }, { "clip_ratio": 0.0, "completion_length": 692.90625, "epoch": 0.9102564102564102, "grad_norm": 1.0518147945404053, "kl": 0.17138671875, "learning_rate": 6.965811965811965e-07, "loss": 0.0002, "reward": 1.6875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.6875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 781 }, { "clip_ratio": 0.0, "completion_length": 672.96875, "epoch": 0.9114219114219114, "grad_norm": 0.014808615669608116, "kl": 0.248291015625, "learning_rate": 6.961926961926962e-07, "loss": 0.0002, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "step": 782 }, { "clip_ratio": 0.0, "completion_length": 897.84375, "epoch": 0.9125874125874126, "grad_norm": 0.8383765816688538, "kl": 0.16259765625, "learning_rate": 6.958041958041958e-07, "loss": 0.0002, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "step": 783 }, { "clip_ratio": 0.0, "completion_length": 472.1875, "epoch": 0.9137529137529138, "grad_norm": 2.6433277130126953, "kl": 0.22265625, "learning_rate": 6.954156954156953e-07, "loss": 0.0002, "reward": 1.59375, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 784 }, { "clip_ratio": 0.0, "completion_length": 435.34375, "epoch": 0.914918414918415, "grad_norm": 0.0167995635420084, "kl": 0.2998046875, "learning_rate": 6.950271950271949e-07, "loss": 0.0003, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.5, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 785 }, { "clip_ratio": 0.0, "completion_length": 597.96875, "epoch": 0.916083916083916, "grad_norm": 3.810410261154175, "kl": 0.222900390625, "learning_rate": 6.946386946386947e-07, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 786 }, { "clip_ratio": 0.0, "completion_length": 759.09375, "epoch": 0.9172494172494172, "grad_norm": 0.009358086623251438, "kl": 0.19921875, "learning_rate": 6.942501942501943e-07, "loss": 0.0002, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "step": 787 }, { "clip_ratio": 0.0, "completion_length": 514.875, "epoch": 0.9184149184149184, "grad_norm": 0.009556882083415985, "kl": 0.230712890625, "learning_rate": 6.938616938616938e-07, "loss": 0.0002, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 788 }, { "clip_ratio": 0.0, "completion_length": 836.21875, "epoch": 0.9195804195804196, "grad_norm": 0.021438289433717728, "kl": 0.16748046875, "learning_rate": 6.934731934731934e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 789 }, { "clip_ratio": 0.0, "completion_length": 958.3125, "epoch": 0.9207459207459208, "grad_norm": 0.693794846534729, "kl": 0.1630859375, "learning_rate": 6.930846930846931e-07, "loss": 0.0002, "reward": 0.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.5, "step": 790 }, { "clip_ratio": 0.0, "completion_length": 408.34375, "epoch": 0.921911421911422, "grad_norm": 0.008301430381834507, "kl": 0.20458984375, "learning_rate": 6.926961926961926e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.3333333333333333, "step": 791 }, { "clip_ratio": 0.0, "completion_length": 887.78125, "epoch": 0.9230769230769231, "grad_norm": 0.0065753282979130745, "kl": 0.167236328125, "learning_rate": 6.923076923076922e-07, "loss": 0.0002, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.5, "step": 792 }, { "clip_ratio": 0.0, "completion_length": 586.53125, "epoch": 0.9242424242424242, "grad_norm": 5.645848274230957, "kl": 0.206298828125, "learning_rate": 6.919191919191919e-07, "loss": 0.0002, "reward": 1.65625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 793 }, { "clip_ratio": 0.0, "completion_length": 931.96875, "epoch": 0.9254079254079254, "grad_norm": 0.006847698707133532, "kl": 0.1669921875, "learning_rate": 6.915306915306916e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 794 }, { "clip_ratio": 0.0, "completion_length": 543.09375, "epoch": 0.9265734265734266, "grad_norm": 0.008308821357786655, "kl": 0.210205078125, "learning_rate": 6.911421911421911e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 795 }, { "clip_ratio": 0.0, "completion_length": 789.78125, "epoch": 0.9277389277389277, "grad_norm": 0.007064209319651127, "kl": 0.171630859375, "learning_rate": 6.907536907536907e-07, "loss": 0.0002, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 796 }, { "clip_ratio": 0.0, "completion_length": 769.21875, "epoch": 0.9289044289044289, "grad_norm": 0.9207088351249695, "kl": 0.166259765625, "learning_rate": 6.903651903651903e-07, "loss": 0.0002, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.96875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 797 }, { "clip_ratio": 0.0, "completion_length": 646.96875, "epoch": 0.9300699300699301, "grad_norm": 7.271271705627441, "kl": 0.21142578125, "learning_rate": 6.899766899766899e-07, "loss": 0.0002, "reward": 1.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 798 }, { "clip_ratio": 0.0, "completion_length": 635.21875, "epoch": 0.9312354312354313, "grad_norm": 0.00532436091452837, "kl": 0.172119140625, "learning_rate": 6.895881895881895e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.3333333333333333, "step": 799 }, { "clip_ratio": 0.0, "completion_length": 769.46875, "epoch": 0.9324009324009324, "grad_norm": 0.00820898823440075, "kl": 0.16943359375, "learning_rate": 6.891996891996892e-07, "loss": 0.0002, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 800 }, { "clip_ratio": 0.0, "completion_length": 748.1875, "epoch": 0.9335664335664335, "grad_norm": 1.2030190229415894, "kl": 0.189697265625, "learning_rate": 6.888111888111888e-07, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.71875, "rewards/score_task": 1.0, "step": 801 }, { "clip_ratio": 0.0, "completion_length": 716.28125, "epoch": 0.9347319347319347, "grad_norm": 59.555484771728516, "kl": 0.19921875, "learning_rate": 6.884226884226884e-07, "loss": 0.0002, "reward": 1.6875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "step": 802 }, { "clip_ratio": 0.0, "completion_length": 512.5625, "epoch": 0.9358974358974359, "grad_norm": 0.005791075993329287, "kl": 0.182373046875, "learning_rate": 6.88034188034188e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 803 }, { "clip_ratio": 0.0, "completion_length": 840.40625, "epoch": 0.9370629370629371, "grad_norm": 0.006476235575973988, "kl": 0.1611328125, "learning_rate": 6.876456876456876e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 804 }, { "clip_ratio": 0.0, "completion_length": 547.09375, "epoch": 0.9382284382284383, "grad_norm": 3.506363868713379, "kl": 0.252685546875, "learning_rate": 6.872571872571871e-07, "loss": 0.0003, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.46875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "step": 805 }, { "clip_ratio": 0.0, "completion_length": 786.875, "epoch": 0.9393939393939394, "grad_norm": 10.108421325683594, "kl": 0.21923828125, "learning_rate": 6.868686868686868e-07, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 806 }, { "clip_ratio": 0.0, "completion_length": 907.78125, "epoch": 0.9405594405594405, "grad_norm": 0.006205583456903696, "kl": 0.164306640625, "learning_rate": 6.864801864801865e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 0.75, "step": 807 }, { "clip_ratio": 0.0, "completion_length": 847.40625, "epoch": 0.9417249417249417, "grad_norm": 0.005169635638594627, "kl": 0.16650390625, "learning_rate": 6.860916860916861e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 808 }, { "clip_ratio": 0.0, "completion_length": 697.09375, "epoch": 0.9428904428904429, "grad_norm": 0.024622434750199318, "kl": 0.2216796875, "learning_rate": 6.857031857031856e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "step": 809 }, { "clip_ratio": 0.0, "completion_length": 726.59375, "epoch": 0.9440559440559441, "grad_norm": 3.133176326751709, "kl": 0.19580078125, "learning_rate": 6.853146853146853e-07, "loss": 0.0002, "reward": 1.1875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 810 }, { "clip_ratio": 0.0, "completion_length": 832.875, "epoch": 0.9452214452214452, "grad_norm": 0.005108645651489496, "kl": 0.16796875, "learning_rate": 6.849261849261849e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.5, "rewards/score_task": 2.0, "step": 811 }, { "clip_ratio": 0.0, "completion_length": 823.375, "epoch": 0.9463869463869464, "grad_norm": 0.005941313691437244, "kl": 0.187255859375, "learning_rate": 6.845376845376844e-07, "loss": 0.0002, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 812 }, { "clip_ratio": 0.0, "completion_length": 713.5, "epoch": 0.9475524475524476, "grad_norm": 0.007223431020975113, "kl": 0.165283203125, "learning_rate": 6.84149184149184e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 813 }, { "clip_ratio": 0.0, "completion_length": 727.15625, "epoch": 0.9487179487179487, "grad_norm": 1.2502658367156982, "kl": 0.171142578125, "learning_rate": 6.837606837606838e-07, "loss": 0.0002, "reward": 1.28125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 814 }, { "clip_ratio": 0.0, "completion_length": 512.875, "epoch": 0.9498834498834499, "grad_norm": 4.894864082336426, "kl": 0.33740234375, "learning_rate": 6.833721833721834e-07, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.46875, "rewards/comparison_task": 1.5, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.5, "step": 815 }, { "clip_ratio": 0.0, "completion_length": 568.59375, "epoch": 0.951048951048951, "grad_norm": 0.005667886696755886, "kl": 0.201416015625, "learning_rate": 6.829836829836829e-07, "loss": 0.0002, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.6666666666666667, "step": 816 }, { "clip_ratio": 0.0, "completion_length": 787.375, "epoch": 0.9522144522144522, "grad_norm": 0.0048684109933674335, "kl": 0.1650390625, "learning_rate": 6.825951825951826e-07, "loss": 0.0002, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 817 }, { "clip_ratio": 0.0, "completion_length": 795.4375, "epoch": 0.9533799533799534, "grad_norm": 0.006169602274894714, "kl": 0.166015625, "learning_rate": 6.822066822066822e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 818 }, { "clip_ratio": 0.0, "completion_length": 460.625, "epoch": 0.9545454545454546, "grad_norm": 8.352587699890137, "kl": 0.305908203125, "learning_rate": 6.818181818181817e-07, "loss": 0.0003, "reward": 1.53125, "reward_std": 0.3061639815568924, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 819 }, { "clip_ratio": 0.0, "completion_length": 842.875, "epoch": 0.9557109557109557, "grad_norm": 8.160935401916504, "kl": 0.157958984375, "learning_rate": 6.814296814296813e-07, "loss": 0.0002, "reward": 1.15625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 820 }, { "clip_ratio": 0.0, "completion_length": 683.90625, "epoch": 0.9568764568764568, "grad_norm": 7.15080451965332, "kl": 0.201904296875, "learning_rate": 6.810411810411811e-07, "loss": 0.0002, "reward": 0.84375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 821 }, { "clip_ratio": 0.0, "completion_length": 637.84375, "epoch": 0.958041958041958, "grad_norm": 0.005965921096503735, "kl": 0.233154296875, "learning_rate": 6.806526806526807e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 822 }, { "clip_ratio": 0.0, "completion_length": 435.71875, "epoch": 0.9592074592074592, "grad_norm": 2.250840425491333, "kl": 0.3408203125, "learning_rate": 6.802641802641802e-07, "loss": 0.0003, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 823 }, { "clip_ratio": 0.0, "completion_length": 515.15625, "epoch": 0.9603729603729604, "grad_norm": 0.010080293752253056, "kl": 0.4052734375, "learning_rate": 6.798756798756798e-07, "loss": 0.0004, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 824 }, { "clip_ratio": 0.0, "completion_length": 509.4375, "epoch": 0.9615384615384616, "grad_norm": 2.200310468673706, "kl": 0.361083984375, "learning_rate": 6.794871794871795e-07, "loss": 0.0004, "reward": 1.5625, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 825 }, { "clip_ratio": 0.0, "completion_length": 420.875, "epoch": 0.9627039627039627, "grad_norm": 4.590648651123047, "kl": 0.35400390625, "learning_rate": 6.79098679098679e-07, "loss": 0.0004, "reward": 1.3125, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 826 }, { "clip_ratio": 0.0, "completion_length": 311.78125, "epoch": 0.9638694638694638, "grad_norm": 5.2217206954956055, "kl": 0.489501953125, "learning_rate": 6.787101787101787e-07, "loss": 0.0005, "reward": 1.53125, "reward_std": 0.3787454217672348, "rewards/accuracy_reward": 0.53125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 827 }, { "clip_ratio": 0.0, "completion_length": 654.8125, "epoch": 0.965034965034965, "grad_norm": 0.00804218277335167, "kl": 0.238037109375, "learning_rate": 6.783216783216783e-07, "loss": 0.0002, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.5, "step": 828 }, { "clip_ratio": 0.0, "completion_length": 815.3125, "epoch": 0.9662004662004662, "grad_norm": 0.012289268895983696, "kl": 0.187744140625, "learning_rate": 6.77933177933178e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "step": 829 }, { "clip_ratio": 0.0, "completion_length": 798.3125, "epoch": 0.9673659673659674, "grad_norm": 0.005305759608745575, "kl": 0.169189453125, "learning_rate": 6.775446775446775e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.25, "rewards/format_reward": 0.75, "step": 830 }, { "clip_ratio": 0.0, "completion_length": 864.375, "epoch": 0.9685314685314685, "grad_norm": 0.8696191310882568, "kl": 0.16845703125, "learning_rate": 6.771561771561771e-07, "loss": 0.0002, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.75, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.75, "rewards/format_reward": 0.96875, "step": 831 }, { "clip_ratio": 0.0, "completion_length": 427.03125, "epoch": 0.9696969696969697, "grad_norm": 5.399799823760986, "kl": 0.54541015625, "learning_rate": 6.767676767676767e-07, "loss": 0.0005, "reward": 1.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 0.96875, "rewards/score_task": 1.5, "step": 832 }, { "clip_ratio": 0.0, "completion_length": 244.15625, "epoch": 0.9708624708624709, "grad_norm": 9.987430572509766, "kl": 0.968017578125, "learning_rate": 6.763791763791763e-07, "loss": 0.001, "reward": 1.28125, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 833 }, { "clip_ratio": 0.0, "completion_length": 200.3125, "epoch": 0.972027972027972, "grad_norm": 0.0034983730874955654, "kl": 0.70849609375, "learning_rate": 6.75990675990676e-07, "loss": 0.0007, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 834 }, { "clip_ratio": 0.0, "completion_length": 434.71875, "epoch": 0.9731934731934732, "grad_norm": 2.0292999744415283, "kl": 1.364990234375, "learning_rate": 6.756021756021756e-07, "loss": 0.0014, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 835 }, { "clip_ratio": 0.0, "completion_length": 415.625, "epoch": 0.9743589743589743, "grad_norm": 6.86417293548584, "kl": 0.45849609375, "learning_rate": 6.752136752136752e-07, "loss": 0.0005, "reward": 1.375, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 836 }, { "clip_ratio": 0.0, "completion_length": 195.15625, "epoch": 0.9755244755244755, "grad_norm": 0.009829702787101269, "kl": 0.7392578125, "learning_rate": 6.748251748251748e-07, "loss": 0.0007, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.3333333333333333, "step": 837 }, { "clip_ratio": 0.0, "completion_length": 599.21875, "epoch": 0.9766899766899767, "grad_norm": 0.7744724154472351, "kl": 0.284912109375, "learning_rate": 6.744366744366744e-07, "loss": 0.0003, "reward": 0.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.46875, "rewards/score_task": 2.0, "step": 838 }, { "clip_ratio": 0.0, "completion_length": 202.625, "epoch": 0.9778554778554779, "grad_norm": 31.438093185424805, "kl": 0.720947265625, "learning_rate": 6.74048174048174e-07, "loss": 0.0007, "reward": 0.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.71875, "rewards/score_task": 1.0, "step": 839 }, { "clip_ratio": 0.0, "completion_length": 405.21875, "epoch": 0.9790209790209791, "grad_norm": 2.14853572845459, "kl": 0.42236328125, "learning_rate": 6.736596736596735e-07, "loss": 0.0004, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 840 }, { "clip_ratio": 0.0, "completion_length": 587.5625, "epoch": 0.9801864801864801, "grad_norm": 0.005395143758505583, "kl": 0.325927734375, "learning_rate": 6.732711732711733e-07, "loss": 0.0003, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 841 }, { "clip_ratio": 0.0, "completion_length": 577.78125, "epoch": 0.9813519813519813, "grad_norm": 0.061672963201999664, "kl": 0.443359375, "learning_rate": 6.728826728826729e-07, "loss": 0.0004, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.5, "rewards/score_task": 2.0, "step": 842 }, { "clip_ratio": 0.0, "completion_length": 754.5, "epoch": 0.9825174825174825, "grad_norm": 0.0092972656711936, "kl": 0.18310546875, "learning_rate": 6.724941724941725e-07, "loss": 0.0002, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.75, "rewards/format_reward": 0.5, "step": 843 }, { "clip_ratio": 0.0, "completion_length": 405.34375, "epoch": 0.9836829836829837, "grad_norm": 4.4594221115112305, "kl": 0.314208984375, "learning_rate": 6.72105672105672e-07, "loss": 0.0003, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.46875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 844 }, { "clip_ratio": 0.0, "completion_length": 379.9375, "epoch": 0.9848484848484849, "grad_norm": 0.005609810817986727, "kl": 0.424072265625, "learning_rate": 6.717171717171717e-07, "loss": 0.0004, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 845 }, { "clip_ratio": 0.0, "completion_length": 376.25, "epoch": 0.986013986013986, "grad_norm": 0.9091343283653259, "kl": 0.54296875, "learning_rate": 6.713286713286713e-07, "loss": 0.0005, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 846 }, { "clip_ratio": 0.0, "completion_length": 607.90625, "epoch": 0.9871794871794872, "grad_norm": 0.00467204675078392, "kl": 0.22998046875, "learning_rate": 6.709401709401708e-07, "loss": 0.0002, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "step": 847 }, { "clip_ratio": 0.0, "completion_length": 581.15625, "epoch": 0.9883449883449883, "grad_norm": 0.023282932117581367, "kl": 0.383056640625, "learning_rate": 6.705516705516705e-07, "loss": 0.0004, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 848 }, { "clip_ratio": 0.0, "completion_length": 214.96875, "epoch": 0.9895104895104895, "grad_norm": 19.087047576904297, "kl": 0.5986328125, "learning_rate": 6.701631701631702e-07, "loss": 0.0006, "reward": 1.375, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.9375, "rewards/score_task": 1.5, "step": 849 }, { "clip_ratio": 0.0, "completion_length": 397.3125, "epoch": 0.9906759906759907, "grad_norm": 5.339278697967529, "kl": 0.466064453125, "learning_rate": 6.697746697746698e-07, "loss": 0.0005, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 850 }, { "clip_ratio": 0.0, "completion_length": 382.65625, "epoch": 0.9918414918414918, "grad_norm": 2.272014856338501, "kl": 0.47216796875, "learning_rate": 6.693861693861693e-07, "loss": 0.0005, "reward": 1.6875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 851 }, { "clip_ratio": 0.0, "completion_length": 608.875, "epoch": 0.993006993006993, "grad_norm": 0.019766485318541527, "kl": 0.38134765625, "learning_rate": 6.68997668997669e-07, "loss": 0.0004, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 852 }, { "clip_ratio": 0.0, "completion_length": 799.34375, "epoch": 0.9941724941724942, "grad_norm": 0.006263336166739464, "kl": 0.179443359375, "learning_rate": 6.686091686091686e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.25, "rewards/format_reward": 0.75, "step": 853 }, { "clip_ratio": 0.0, "completion_length": 378.53125, "epoch": 0.9953379953379954, "grad_norm": 4.875955104827881, "kl": 0.626220703125, "learning_rate": 6.682206682206681e-07, "loss": 0.0006, "reward": 1.0625, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 854 }, { "clip_ratio": 0.0, "completion_length": 381.75, "epoch": 0.9965034965034965, "grad_norm": 0.006670533679425716, "kl": 0.569091796875, "learning_rate": 6.678321678321678e-07, "loss": 0.0006, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 855 }, { "clip_ratio": 0.0, "completion_length": 428.25, "epoch": 0.9976689976689976, "grad_norm": 0.024344634264707565, "kl": 0.470458984375, "learning_rate": 6.674436674436675e-07, "loss": 0.0005, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 856 }, { "clip_ratio": 0.0, "completion_length": 421.3125, "epoch": 0.9988344988344988, "grad_norm": 0.004566722083836794, "kl": 0.415283203125, "learning_rate": 6.670551670551671e-07, "loss": 0.0004, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 857 }, { "clip_ratio": 0.0, "completion_length": 548.9375, "epoch": 1.0, "grad_norm": 0.0070753092877566814, "kl": 0.417724609375, "learning_rate": 6.666666666666666e-07, "loss": 0.0004, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 858 }, { "clip_ratio": 0.0, "completion_length": 381.34375, "epoch": 1.0011655011655012, "grad_norm": 11.550578117370605, "kl": 0.3037109375, "learning_rate": 6.662781662781662e-07, "loss": 0.0003, "reward": 1.40625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.40625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 859 }, { "clip_ratio": 0.0, "completion_length": 622.9375, "epoch": 1.0023310023310024, "grad_norm": 4.015313625335693, "kl": 0.441650390625, "learning_rate": 6.658896658896659e-07, "loss": 0.0004, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.9375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 860 }, { "clip_ratio": 0.0, "completion_length": 390.53125, "epoch": 1.0034965034965035, "grad_norm": 1.463830590248108, "kl": 0.49853515625, "learning_rate": 6.655011655011654e-07, "loss": 0.0005, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 861 }, { "clip_ratio": 0.0, "completion_length": 626.15625, "epoch": 1.0046620046620047, "grad_norm": 0.005566823296248913, "kl": 0.231689453125, "learning_rate": 6.651126651126651e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 862 }, { "clip_ratio": 0.0, "completion_length": 227.6875, "epoch": 1.005827505827506, "grad_norm": 0.3042296767234802, "kl": 0.789306640625, "learning_rate": 6.647241647241647e-07, "loss": 0.0008, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.46875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 863 }, { "clip_ratio": 0.0, "completion_length": 230.4375, "epoch": 1.006993006993007, "grad_norm": 10.160773277282715, "kl": 0.506103515625, "learning_rate": 6.643356643356644e-07, "loss": 0.0005, "reward": 1.59375, "reward_std": 0.22201896458864212, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.34375, "rewards/comparison_task": 1.5, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 864 }, { "clip_ratio": 0.0, "completion_length": 598.03125, "epoch": 1.008158508158508, "grad_norm": 0.004759290721267462, "kl": 0.3916015625, "learning_rate": 6.639471639471639e-07, "loss": 0.0004, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 865 }, { "clip_ratio": 0.0, "completion_length": 802.34375, "epoch": 1.0093240093240092, "grad_norm": 0.006891284603625536, "kl": 0.1630859375, "learning_rate": 6.635586635586635e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 0.75, "step": 866 }, { "clip_ratio": 0.0, "completion_length": 204.59375, "epoch": 1.0104895104895104, "grad_norm": 0.5874224305152893, "kl": 0.5380859375, "learning_rate": 6.631701631701631e-07, "loss": 0.0005, "reward": 1.15625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.40625, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 867 }, { "clip_ratio": 0.0, "completion_length": 190.5625, "epoch": 1.0116550116550116, "grad_norm": 2.9364917278289795, "kl": 0.612548828125, "learning_rate": 6.627816627816627e-07, "loss": 0.0006, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.46875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 868 }, { "clip_ratio": 0.0, "completion_length": 615.75, "epoch": 1.0128205128205128, "grad_norm": 0.03537534922361374, "kl": 0.4921875, "learning_rate": 6.623931623931624e-07, "loss": 0.0005, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 869 }, { "clip_ratio": 0.0, "completion_length": 400.0625, "epoch": 1.013986013986014, "grad_norm": 74.570068359375, "kl": 0.643798828125, "learning_rate": 6.62004662004662e-07, "loss": 0.0006, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 870 }, { "clip_ratio": 0.0, "completion_length": 420.375, "epoch": 1.0151515151515151, "grad_norm": 0.00405954010784626, "kl": 0.690673828125, "learning_rate": 6.616161616161616e-07, "loss": 0.0007, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 871 }, { "clip_ratio": 0.0, "completion_length": 400.9375, "epoch": 1.0163170163170163, "grad_norm": 11.808073043823242, "kl": 0.445556640625, "learning_rate": 6.612276612276612e-07, "loss": 0.0004, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 872 }, { "clip_ratio": 0.0, "completion_length": 748.8125, "epoch": 1.0174825174825175, "grad_norm": 0.005972168408334255, "kl": 0.172607421875, "learning_rate": 6.608391608391608e-07, "loss": 0.0002, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.25, "step": 873 }, { "clip_ratio": 0.0, "completion_length": 604.21875, "epoch": 1.0186480186480187, "grad_norm": 0.01863485760986805, "kl": 0.447021484375, "learning_rate": 6.604506604506604e-07, "loss": 0.0004, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 874 }, { "clip_ratio": 0.0, "completion_length": 382.53125, "epoch": 1.0198135198135199, "grad_norm": 4.891656398773193, "kl": 0.464111328125, "learning_rate": 6.600621600621599e-07, "loss": 0.0005, "reward": 1.9375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.75, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 875 }, { "clip_ratio": 0.0, "completion_length": 390.375, "epoch": 1.020979020979021, "grad_norm": 69.43263244628906, "kl": 0.53125, "learning_rate": 6.596736596736597e-07, "loss": 0.0005, "reward": 1.53125, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 876 }, { "clip_ratio": 0.0, "completion_length": 185.46875, "epoch": 1.0221445221445222, "grad_norm": 0.05875428020954132, "kl": 0.868408203125, "learning_rate": 6.592851592851593e-07, "loss": 0.0009, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 877 }, { "clip_ratio": 0.0, "completion_length": 583.84375, "epoch": 1.0233100233100234, "grad_norm": 0.09531284868717194, "kl": 0.61376953125, "learning_rate": 6.588966588966589e-07, "loss": 0.0006, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.5, "rewards/score_task": 2.0, "step": 878 }, { "clip_ratio": 0.0, "completion_length": 558.6875, "epoch": 1.0244755244755244, "grad_norm": 0.01940552145242691, "kl": 0.457763671875, "learning_rate": 6.585081585081584e-07, "loss": 0.0005, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 879 }, { "clip_ratio": 0.0, "completion_length": 413.375, "epoch": 1.0256410256410255, "grad_norm": 0.6750799417495728, "kl": 0.29541015625, "learning_rate": 6.581196581196581e-07, "loss": 0.0003, "reward": 1.9375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.4375, "rewards/comparison_task": 1.5, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "step": 880 }, { "clip_ratio": 0.0, "completion_length": 743.34375, "epoch": 1.0268065268065267, "grad_norm": 0.00668159406632185, "kl": 0.174560546875, "learning_rate": 6.577311577311577e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.5, "step": 881 }, { "clip_ratio": 0.0, "completion_length": 395.1875, "epoch": 1.027972027972028, "grad_norm": 0.9390003681182861, "kl": 0.51806640625, "learning_rate": 6.573426573426572e-07, "loss": 0.0005, "reward": 1.4375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 882 }, { "clip_ratio": 0.0, "completion_length": 559.53125, "epoch": 1.029137529137529, "grad_norm": 13.634501457214355, "kl": 0.433349609375, "learning_rate": 6.569541569541569e-07, "loss": 0.0004, "reward": 0.625, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.5, "rewards/score_task": 2.0, "step": 883 }, { "clip_ratio": 0.0, "completion_length": 378.96875, "epoch": 1.0303030303030303, "grad_norm": 0.019326770678162575, "kl": 0.449462890625, "learning_rate": 6.565656565656566e-07, "loss": 0.0004, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 884 }, { "clip_ratio": 0.0, "completion_length": 722.6875, "epoch": 1.0314685314685315, "grad_norm": 0.9974822998046875, "kl": 0.17041015625, "learning_rate": 6.561771561771562e-07, "loss": 0.0002, "reward": 1.6875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.6875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.75, "rewards/format_reward": 1.0, "step": 885 }, { "clip_ratio": 0.0, "completion_length": 603.0, "epoch": 1.0326340326340326, "grad_norm": 1.2122408151626587, "kl": 0.466064453125, "learning_rate": 6.557886557886557e-07, "loss": 0.0005, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 886 }, { "clip_ratio": 0.0, "completion_length": 582.59375, "epoch": 1.0337995337995338, "grad_norm": 0.009809213690459728, "kl": 0.464111328125, "learning_rate": 6.554001554001553e-07, "loss": 0.0005, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 887 }, { "clip_ratio": 0.0, "completion_length": 549.9375, "epoch": 1.034965034965035, "grad_norm": 0.0045128390192985535, "kl": 0.239501953125, "learning_rate": 6.55011655011655e-07, "loss": 0.0002, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.3333333333333333, "rewards/format_reward": 0.5, "step": 888 }, { "clip_ratio": 0.0, "completion_length": 726.21875, "epoch": 1.0361305361305362, "grad_norm": 0.007682791445404291, "kl": 0.1923828125, "learning_rate": 6.546231546231545e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.5, "step": 889 }, { "clip_ratio": 0.0, "completion_length": 622.21875, "epoch": 1.0372960372960374, "grad_norm": 1.8195185661315918, "kl": 0.5068359375, "learning_rate": 6.542346542346542e-07, "loss": 0.0005, "reward": 1.65625, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.6875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 890 }, { "clip_ratio": 0.0, "completion_length": 567.71875, "epoch": 1.0384615384615385, "grad_norm": 0.014977667480707169, "kl": 0.50390625, "learning_rate": 6.538461538461538e-07, "loss": 0.0005, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 891 }, { "clip_ratio": 0.0, "completion_length": 387.40625, "epoch": 1.0396270396270397, "grad_norm": 4.649765968322754, "kl": 0.645751953125, "learning_rate": 6.534576534576535e-07, "loss": 0.0006, "reward": 1.78125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.78125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 892 }, { "clip_ratio": 0.0, "completion_length": 205.78125, "epoch": 1.0407925407925407, "grad_norm": 0.022643400356173515, "kl": 0.739501953125, "learning_rate": 6.53069153069153e-07, "loss": 0.0007, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 893 }, { "clip_ratio": 0.0, "completion_length": 227.96875, "epoch": 1.0419580419580419, "grad_norm": 0.9864293336868286, "kl": 0.738037109375, "learning_rate": 6.526806526806526e-07, "loss": 0.0007, "reward": 1.4375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 894 }, { "clip_ratio": 0.0, "completion_length": 556.15625, "epoch": 1.043123543123543, "grad_norm": 5.647745132446289, "kl": 0.394775390625, "learning_rate": 6.522921522921522e-07, "loss": 0.0004, "reward": 0.84375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 895 }, { "clip_ratio": 0.0, "completion_length": 377.1875, "epoch": 1.0442890442890442, "grad_norm": 0.8780983686447144, "kl": 0.5029296875, "learning_rate": 6.519036519036518e-07, "loss": 0.0005, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 896 }, { "clip_ratio": 0.0, "completion_length": 392.21875, "epoch": 1.0454545454545454, "grad_norm": 0.009906746447086334, "kl": 0.556640625, "learning_rate": 6.515151515151515e-07, "loss": 0.0006, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 897 }, { "clip_ratio": 0.0, "completion_length": 566.46875, "epoch": 1.0466200466200466, "grad_norm": 3.1603031158447266, "kl": 0.356689453125, "learning_rate": 6.511266511266511e-07, "loss": 0.0004, "reward": 0.90625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.5, "rewards/score_task": 2.0, "step": 898 }, { "clip_ratio": 0.0, "completion_length": 618.1875, "epoch": 1.0477855477855478, "grad_norm": 0.007131508085876703, "kl": 0.232666015625, "learning_rate": 6.507381507381508e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 899 }, { "clip_ratio": 0.0, "completion_length": 465.28125, "epoch": 1.048951048951049, "grad_norm": 0.008681204169988632, "kl": 0.370361328125, "learning_rate": 6.503496503496503e-07, "loss": 0.0004, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 900 }, { "clip_ratio": 0.0, "completion_length": 768.125, "epoch": 1.0501165501165501, "grad_norm": 0.006051101256161928, "kl": 0.179931640625, "learning_rate": 6.499611499611499e-07, "loss": 0.0002, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.75, "rewards/format_reward": 0.5, "step": 901 }, { "clip_ratio": 0.0, "completion_length": 207.84375, "epoch": 1.0512820512820513, "grad_norm": 0.04643717408180237, "kl": 0.35205078125, "learning_rate": 6.495726495726495e-07, "loss": 0.0004, "reward": 1.9375, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.6875, "rewards/comparison_task": 1.6666666666666667, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "step": 902 }, { "clip_ratio": 0.0, "completion_length": 733.28125, "epoch": 1.0524475524475525, "grad_norm": 2.5218241214752197, "kl": 0.17822265625, "learning_rate": 6.491841491841493e-07, "loss": 0.0002, "reward": 0.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.5, "step": 903 }, { "clip_ratio": 0.0, "completion_length": 699.65625, "epoch": 1.0536130536130537, "grad_norm": 2.1788196563720703, "kl": 0.2021484375, "learning_rate": 6.487956487956488e-07, "loss": 0.0002, "reward": 1.65625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.65625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 904 }, { "clip_ratio": 0.0, "completion_length": 317.875, "epoch": 1.0547785547785549, "grad_norm": 3.0081284046173096, "kl": 0.319580078125, "learning_rate": 6.484071484071484e-07, "loss": 0.0003, "reward": 1.6875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.4375, "rewards/comparison_task": 1.5, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 905 }, { "clip_ratio": 0.0, "completion_length": 806.125, "epoch": 1.055944055944056, "grad_norm": 0.007120463531464338, "kl": 0.185546875, "learning_rate": 6.48018648018648e-07, "loss": 0.0002, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.75, "rewards/format_reward": 0.5, "step": 906 }, { "clip_ratio": 0.0, "completion_length": 315.4375, "epoch": 1.057109557109557, "grad_norm": 0.011895408853888512, "kl": 0.3310546875, "learning_rate": 6.476301476301476e-07, "loss": 0.0003, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 907 }, { "clip_ratio": 0.0, "completion_length": 498.5625, "epoch": 1.0582750582750582, "grad_norm": 0.05924631655216217, "kl": 0.276611328125, "learning_rate": 6.472416472416472e-07, "loss": 0.0003, "reward": 1.4375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.1875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 908 }, { "clip_ratio": 0.0, "completion_length": 358.03125, "epoch": 1.0594405594405594, "grad_norm": 2.4207234382629395, "kl": 0.324462890625, "learning_rate": 6.468531468531468e-07, "loss": 0.0003, "reward": 1.5625, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 909 }, { "clip_ratio": 0.0, "completion_length": 648.15625, "epoch": 1.0606060606060606, "grad_norm": 0.9750411510467529, "kl": 0.20263671875, "learning_rate": 6.464646464646465e-07, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 910 }, { "clip_ratio": 0.0, "completion_length": 436.6875, "epoch": 1.0617715617715617, "grad_norm": 1.4567954540252686, "kl": 0.296875, "learning_rate": 6.460761460761461e-07, "loss": 0.0003, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 911 }, { "clip_ratio": 0.0, "completion_length": 372.0, "epoch": 1.062937062937063, "grad_norm": 15.746253967285156, "kl": 0.329345703125, "learning_rate": 6.456876456876457e-07, "loss": 0.0003, "reward": 1.84375, "reward_std": 0.3061639815568924, "rewards/accuracy_reward": 0.625, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 912 }, { "clip_ratio": 0.0, "completion_length": 667.0, "epoch": 1.064102564102564, "grad_norm": 1.2801166772842407, "kl": 0.2080078125, "learning_rate": 6.452991452991453e-07, "loss": 0.0002, "reward": 1.3125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.5625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 913 }, { "clip_ratio": 0.0, "completion_length": 434.25, "epoch": 1.0652680652680653, "grad_norm": 0.004795829299837351, "kl": 0.31005859375, "learning_rate": 6.449106449106448e-07, "loss": 0.0003, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 914 }, { "clip_ratio": 0.0, "completion_length": 322.53125, "epoch": 1.0664335664335665, "grad_norm": 2.021533489227295, "kl": 0.357421875, "learning_rate": 6.445221445221445e-07, "loss": 0.0004, "reward": 1.65625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 915 }, { "clip_ratio": 0.0, "completion_length": 472.34375, "epoch": 1.0675990675990676, "grad_norm": 0.008589215576648712, "kl": 0.28466796875, "learning_rate": 6.441336441336441e-07, "loss": 0.0003, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 916 }, { "clip_ratio": 0.0, "completion_length": 469.03125, "epoch": 1.0687645687645688, "grad_norm": 0.014281713403761387, "kl": 0.29638671875, "learning_rate": 6.437451437451438e-07, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 917 }, { "clip_ratio": 0.0, "completion_length": 464.4375, "epoch": 1.06993006993007, "grad_norm": 2.3696129322052, "kl": 0.299560546875, "learning_rate": 6.433566433566433e-07, "loss": 0.0003, "reward": 1.71875, "reward_std": 0.2651650384068489, "rewards/accuracy_reward": 0.53125, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 0.96875, "rewards/score_task": 2.0, "step": 918 }, { "clip_ratio": 0.0, "completion_length": 333.875, "epoch": 1.0710955710955712, "grad_norm": 3.820401668548584, "kl": 0.30810546875, "learning_rate": 6.42968142968143e-07, "loss": 0.0003, "reward": 1.5625, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 919 }, { "clip_ratio": 0.0, "completion_length": 644.09375, "epoch": 1.0722610722610724, "grad_norm": 0.004428595770150423, "kl": 0.2314453125, "learning_rate": 6.425796425796426e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 920 }, { "clip_ratio": 0.0, "completion_length": 601.78125, "epoch": 1.0734265734265733, "grad_norm": 0.0064634764567017555, "kl": 0.246826171875, "learning_rate": 6.421911421911421e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 1.0, "step": 921 }, { "clip_ratio": 0.0, "completion_length": 673.5625, "epoch": 1.0745920745920745, "grad_norm": 1.8055092096328735, "kl": 0.220947265625, "learning_rate": 6.418026418026417e-07, "loss": 0.0002, "reward": 1.03125, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 922 }, { "clip_ratio": 0.0, "completion_length": 701.03125, "epoch": 1.0757575757575757, "grad_norm": 0.010107080452144146, "kl": 0.208984375, "learning_rate": 6.414141414141414e-07, "loss": 0.0002, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.75, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 923 }, { "clip_ratio": 0.0, "completion_length": 498.21875, "epoch": 1.0769230769230769, "grad_norm": 1.9517523050308228, "kl": 0.31689453125, "learning_rate": 6.410256410256411e-07, "loss": 0.0003, "reward": 1.0625, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 924 }, { "clip_ratio": 0.0, "completion_length": 331.1875, "epoch": 1.078088578088578, "grad_norm": 0.003751560812816024, "kl": 0.311279296875, "learning_rate": 6.406371406371406e-07, "loss": 0.0003, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.5, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 925 }, { "clip_ratio": 0.0, "completion_length": 411.8125, "epoch": 1.0792540792540792, "grad_norm": 1.4292241334915161, "kl": 0.26318359375, "learning_rate": 6.402486402486402e-07, "loss": 0.0003, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 926 }, { "clip_ratio": 0.0, "completion_length": 726.3125, "epoch": 1.0804195804195804, "grad_norm": 1.2104058265686035, "kl": 0.203857421875, "learning_rate": 6.398601398601399e-07, "loss": 0.0002, "reward": 0.78125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.5, "rewards/score_task": 1.0, "step": 927 }, { "clip_ratio": 0.0, "completion_length": 200.0, "epoch": 1.0815850815850816, "grad_norm": 0.34250298142433167, "kl": 0.36962890625, "learning_rate": 6.394716394716394e-07, "loss": 0.0004, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.71875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "step": 928 }, { "clip_ratio": 0.0, "completion_length": 488.375, "epoch": 1.0827505827505828, "grad_norm": 1.419145107269287, "kl": 0.288818359375, "learning_rate": 6.39083139083139e-07, "loss": 0.0003, "reward": 1.3125, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 929 }, { "clip_ratio": 0.0, "completion_length": 499.25, "epoch": 1.083916083916084, "grad_norm": 2.0992283821105957, "kl": 0.3203125, "learning_rate": 6.386946386946386e-07, "loss": 0.0003, "reward": 1.59375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.59375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.6666666666666667, "step": 930 }, { "clip_ratio": 0.0, "completion_length": 620.84375, "epoch": 1.0850815850815851, "grad_norm": 3.242292881011963, "kl": 0.3720703125, "learning_rate": 6.383061383061384e-07, "loss": 0.0004, "reward": 1.625, "reward_std": 0.3535533845424652, "rewards/accuracy_reward": 0.65625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 931 }, { "clip_ratio": 0.0, "completion_length": 695.59375, "epoch": 1.0862470862470863, "grad_norm": 0.030759625136852264, "kl": 0.3896484375, "learning_rate": 6.379176379176379e-07, "loss": 0.0004, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "step": 932 }, { "clip_ratio": 0.0, "completion_length": 505.375, "epoch": 1.0874125874125875, "grad_norm": 3.23589825630188, "kl": 0.490234375, "learning_rate": 6.375291375291375e-07, "loss": 0.0005, "reward": 1.3125, "reward_std": 0.4355512708425522, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.96875, "rewards/score_task": 0.6666666666666666, "step": 933 }, { "clip_ratio": 0.0, "completion_length": 600.375, "epoch": 1.0885780885780885, "grad_norm": 3.0855751037597656, "kl": 0.4443359375, "learning_rate": 6.371406371406371e-07, "loss": 0.0004, "reward": 1.15625, "reward_std": 0.3377464786171913, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.65625, "rewards/score_task": 1.0, "step": 934 }, { "clip_ratio": 0.0, "completion_length": 338.46875, "epoch": 1.0897435897435896, "grad_norm": 4.701046943664551, "kl": 0.46044921875, "learning_rate": 6.367521367521367e-07, "loss": 0.0005, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.0, "step": 935 }, { "clip_ratio": 0.0, "completion_length": 345.4375, "epoch": 1.0909090909090908, "grad_norm": 0.21977683901786804, "kl": 0.4384765625, "learning_rate": 6.363636363636363e-07, "loss": 0.0004, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.5, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "step": 936 }, { "clip_ratio": 0.0, "completion_length": 549.0625, "epoch": 1.092074592074592, "grad_norm": 2.7376163005828857, "kl": 0.50341796875, "learning_rate": 6.359751359751359e-07, "loss": 0.0005, "reward": 1.03125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 937 }, { "clip_ratio": 0.0, "completion_length": 384.5625, "epoch": 1.0932400932400932, "grad_norm": 9.964888572692871, "kl": 0.3857421875, "learning_rate": 6.355866355866357e-07, "loss": 0.0004, "reward": 1.125, "reward_std": 0.3104073107242584, "rewards/accuracy_reward": 0.15625, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 938 }, { "clip_ratio": 0.0, "completion_length": 292.5, "epoch": 1.0944055944055944, "grad_norm": 27.900022506713867, "kl": 0.35595703125, "learning_rate": 6.351981351981352e-07, "loss": 0.0004, "reward": 1.1875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.4375, "rewards/comparison_task": 1.5, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 939 }, { "clip_ratio": 0.0, "completion_length": 517.375, "epoch": 1.0955710955710956, "grad_norm": 0.04298647493124008, "kl": 0.283935546875, "learning_rate": 6.348096348096348e-07, "loss": 0.0003, "reward": 0.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.25, "step": 940 }, { "clip_ratio": 0.0, "completion_length": 581.34375, "epoch": 1.0967365967365967, "grad_norm": 0.9012244939804077, "kl": 0.27685546875, "learning_rate": 6.344211344211344e-07, "loss": 0.0003, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "step": 941 }, { "clip_ratio": 0.0, "completion_length": 601.1875, "epoch": 1.097902097902098, "grad_norm": 0.806927502155304, "kl": 0.22314453125, "learning_rate": 6.34032634032634e-07, "loss": 0.0002, "reward": 0.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 942 }, { "clip_ratio": 0.0, "completion_length": 658.53125, "epoch": 1.099067599067599, "grad_norm": 2.0078399181365967, "kl": 0.22509765625, "learning_rate": 6.336441336441336e-07, "loss": 0.0002, "reward": 1.125, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 943 }, { "clip_ratio": 0.0, "completion_length": 647.28125, "epoch": 1.1002331002331003, "grad_norm": 2.2370800971984863, "kl": 0.190673828125, "learning_rate": 6.332556332556332e-07, "loss": 0.0002, "reward": 1.3125, "reward_std": 0.2925042062997818, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 944 }, { "clip_ratio": 0.0, "completion_length": 680.375, "epoch": 1.1013986013986015, "grad_norm": 1.2691926956176758, "kl": 0.191650390625, "learning_rate": 6.328671328671329e-07, "loss": 0.0002, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.71875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 945 }, { "clip_ratio": 0.0, "completion_length": 692.71875, "epoch": 1.1025641025641026, "grad_norm": 5.458542823791504, "kl": 0.177490234375, "learning_rate": 6.324786324786325e-07, "loss": 0.0002, "reward": 1.53125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.53125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 946 }, { "clip_ratio": 0.0, "completion_length": 405.28125, "epoch": 1.1037296037296038, "grad_norm": 4.902122974395752, "kl": 0.362548828125, "learning_rate": 6.320901320901321e-07, "loss": 0.0004, "reward": 1.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.46875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "step": 947 }, { "clip_ratio": 0.0, "completion_length": 423.53125, "epoch": 1.104895104895105, "grad_norm": 0.008182195015251637, "kl": 0.3134765625, "learning_rate": 6.317016317016317e-07, "loss": 0.0003, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.5, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 948 }, { "clip_ratio": 0.0, "completion_length": 508.375, "epoch": 1.106060606060606, "grad_norm": 0.0055789947509765625, "kl": 0.184326171875, "learning_rate": 6.313131313131312e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.3333333333333333, "step": 949 }, { "clip_ratio": 0.0, "completion_length": 613.6875, "epoch": 1.1072261072261071, "grad_norm": 1.9083982706069946, "kl": 0.195556640625, "learning_rate": 6.309246309246309e-07, "loss": 0.0002, "reward": 1.75, "reward_std": 0.2314550280570984, "rewards/accuracy_reward": 0.75, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.5, "step": 950 }, { "clip_ratio": 0.0, "completion_length": 578.6875, "epoch": 1.1083916083916083, "grad_norm": 0.005917372647672892, "kl": 0.23876953125, "learning_rate": 6.305361305361305e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.5, "step": 951 }, { "clip_ratio": 0.0, "completion_length": 614.8125, "epoch": 1.1095571095571095, "grad_norm": 2.637587070465088, "kl": 0.191162109375, "learning_rate": 6.301476301476302e-07, "loss": 0.0002, "reward": 0.53125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.03125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.5, "rewards/score_task": 1.5, "step": 952 }, { "clip_ratio": 0.0, "completion_length": 536.71875, "epoch": 1.1107226107226107, "grad_norm": 1.1686315536499023, "kl": 0.20068359375, "learning_rate": 6.297591297591297e-07, "loss": 0.0002, "reward": 0.8125, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.0625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 953 }, { "clip_ratio": 0.0, "completion_length": 579.1875, "epoch": 1.1118881118881119, "grad_norm": 1.6030857563018799, "kl": 0.2529296875, "learning_rate": 6.293706293706294e-07, "loss": 0.0003, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "step": 954 }, { "clip_ratio": 0.0, "completion_length": 771.3125, "epoch": 1.113053613053613, "grad_norm": 1.1797999143600464, "kl": 0.181640625, "learning_rate": 6.28982128982129e-07, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.25, "rewards/format_reward": 0.75, "step": 955 }, { "clip_ratio": 0.0, "completion_length": 510.40625, "epoch": 1.1142191142191142, "grad_norm": 19.999465942382812, "kl": 0.242919921875, "learning_rate": 6.285936285936285e-07, "loss": 0.0002, "reward": 1.53125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 956 }, { "clip_ratio": 0.0, "completion_length": 669.1875, "epoch": 1.1153846153846154, "grad_norm": 1.2463845014572144, "kl": 0.17919921875, "learning_rate": 6.282051282051281e-07, "loss": 0.0002, "reward": 1.28125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.53125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 957 }, { "clip_ratio": 0.0, "completion_length": 655.96875, "epoch": 1.1165501165501166, "grad_norm": 1.222624659538269, "kl": 0.2001953125, "learning_rate": 6.278166278166278e-07, "loss": 0.0002, "reward": 1.03125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 958 }, { "clip_ratio": 0.0, "completion_length": 670.0, "epoch": 1.1177156177156178, "grad_norm": 0.0057969167828559875, "kl": 0.189453125, "learning_rate": 6.274281274281275e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 959 }, { "clip_ratio": 0.0, "completion_length": 667.125, "epoch": 1.118881118881119, "grad_norm": 0.973347008228302, "kl": 0.19384765625, "learning_rate": 6.27039627039627e-07, "loss": 0.0002, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.71875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 960 }, { "clip_ratio": 0.0, "completion_length": 518.65625, "epoch": 1.1200466200466201, "grad_norm": 4.940195083618164, "kl": 0.26513671875, "learning_rate": 6.266511266511266e-07, "loss": 0.0003, "reward": 1.71875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 1.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 961 }, { "clip_ratio": 0.0, "completion_length": 397.53125, "epoch": 1.121212121212121, "grad_norm": 0.8836221098899841, "kl": 0.253173828125, "learning_rate": 6.262626262626263e-07, "loss": 0.0003, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.96875, "rewards/score_task": 1.5, "step": 962 }, { "clip_ratio": 0.0, "completion_length": 626.90625, "epoch": 1.1223776223776223, "grad_norm": 0.00660707289353013, "kl": 0.18896484375, "learning_rate": 6.258741258741258e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 963 }, { "clip_ratio": 0.0, "completion_length": 637.71875, "epoch": 1.1235431235431235, "grad_norm": 1.943556308746338, "kl": 0.177001953125, "learning_rate": 6.254856254856254e-07, "loss": 0.0002, "reward": 1.15625, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.40625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 964 }, { "clip_ratio": 0.0, "completion_length": 738.75, "epoch": 1.1247086247086246, "grad_norm": 1.2333430051803589, "kl": 0.18017578125, "learning_rate": 6.25097125097125e-07, "loss": 0.0002, "reward": 1.75, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.75, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 965 }, { "clip_ratio": 0.0, "completion_length": 712.90625, "epoch": 1.1258741258741258, "grad_norm": 1.7253817319869995, "kl": 0.201171875, "learning_rate": 6.247086247086248e-07, "loss": 0.0002, "reward": 1.59375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.59375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 966 }, { "clip_ratio": 0.0, "completion_length": 534.46875, "epoch": 1.127039627039627, "grad_norm": 0.007585329003632069, "kl": 0.302001953125, "learning_rate": 6.243201243201243e-07, "loss": 0.0003, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 967 }, { "clip_ratio": 0.0, "completion_length": 708.15625, "epoch": 1.1282051282051282, "grad_norm": 0.00919166300445795, "kl": 0.200927734375, "learning_rate": 6.239316239316239e-07, "loss": 0.0002, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 968 }, { "clip_ratio": 0.0, "completion_length": 761.96875, "epoch": 1.1293706293706294, "grad_norm": 0.7882614731788635, "kl": 0.185546875, "learning_rate": 6.235431235431235e-07, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.25, "rewards/format_reward": 0.75, "step": 969 }, { "clip_ratio": 0.0, "completion_length": 549.0, "epoch": 1.1305361305361306, "grad_norm": 0.9832872748374939, "kl": 0.263427734375, "learning_rate": 6.231546231546231e-07, "loss": 0.0003, "reward": 1.84375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.59375, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 970 }, { "clip_ratio": 0.0, "completion_length": 586.21875, "epoch": 1.1317016317016317, "grad_norm": 2.2243194580078125, "kl": 0.2080078125, "learning_rate": 6.227661227661227e-07, "loss": 0.0002, "reward": 1.28125, "reward_std": 0.24511480331420898, "rewards/accuracy_reward": 0.53125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 971 }, { "clip_ratio": 0.0, "completion_length": 698.03125, "epoch": 1.132867132867133, "grad_norm": 1.917562484741211, "kl": 0.191650390625, "learning_rate": 6.223776223776223e-07, "loss": 0.0002, "reward": 1.28125, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.53125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "rewards/score_task": 1.0, "step": 972 }, { "clip_ratio": 0.0, "completion_length": 709.71875, "epoch": 1.134032634032634, "grad_norm": 0.018549103289842606, "kl": 0.204345703125, "learning_rate": 6.21989121989122e-07, "loss": 0.0002, "reward": 0.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.75, "rewards/format_reward": 0.75, "step": 973 }, { "clip_ratio": 0.0, "completion_length": 553.46875, "epoch": 1.1351981351981353, "grad_norm": 23.661945343017578, "kl": 0.260009765625, "learning_rate": 6.216006216006216e-07, "loss": 0.0003, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 974 }, { "clip_ratio": 0.0, "completion_length": 594.53125, "epoch": 1.1363636363636362, "grad_norm": 5.71004056930542, "kl": 0.258056640625, "learning_rate": 6.212121212121212e-07, "loss": 0.0003, "reward": 0.96875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.6666666666666666, "rewards/format_reward": 0.5, "step": 975 }, { "clip_ratio": 0.0, "completion_length": 701.5, "epoch": 1.1375291375291376, "grad_norm": 1.9110360145568848, "kl": 0.195068359375, "learning_rate": 6.208236208236208e-07, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 976 }, { "clip_ratio": 0.0, "completion_length": 385.59375, "epoch": 1.1386946386946386, "grad_norm": 0.01663910783827305, "kl": 0.3310546875, "learning_rate": 6.204351204351203e-07, "loss": 0.0003, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.5, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 977 }, { "clip_ratio": 0.0, "completion_length": 671.84375, "epoch": 1.1398601398601398, "grad_norm": 2.2609541416168213, "kl": 0.197265625, "learning_rate": 6.2004662004662e-07, "loss": 0.0002, "reward": 1.5625, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.5625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 978 }, { "clip_ratio": 0.0, "completion_length": 501.90625, "epoch": 1.141025641025641, "grad_norm": 0.011669727973639965, "kl": 0.266357421875, "learning_rate": 6.196581196581196e-07, "loss": 0.0003, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 979 }, { "clip_ratio": 0.0, "completion_length": 168.15625, "epoch": 1.1421911421911422, "grad_norm": 5.94711971282959, "kl": 0.412353515625, "learning_rate": 6.192696192696193e-07, "loss": 0.0004, "reward": 1.6875, "reward_std": 0.1767766922712326, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.6875, "rewards/comparison_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 980 }, { "clip_ratio": 0.0, "completion_length": 522.59375, "epoch": 1.1433566433566433, "grad_norm": 1.8966286182403564, "kl": 0.264892578125, "learning_rate": 6.188811188811188e-07, "loss": 0.0003, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "step": 981 }, { "clip_ratio": 0.0, "completion_length": 487.40625, "epoch": 1.1445221445221445, "grad_norm": 3.0296053886413574, "kl": 0.19921875, "learning_rate": 6.184926184926185e-07, "loss": 0.0002, "reward": 1.28125, "reward_std": 0.3787454217672348, "rewards/accuracy_reward": 0.28125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.6666666666666667, "step": 982 }, { "clip_ratio": 0.0, "completion_length": 519.53125, "epoch": 1.1456876456876457, "grad_norm": 1.298100471496582, "kl": 0.19580078125, "learning_rate": 6.181041181041181e-07, "loss": 0.0002, "reward": 1.6875, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.6875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 983 }, { "clip_ratio": 0.0, "completion_length": 535.90625, "epoch": 1.1468531468531469, "grad_norm": 1.276336669921875, "kl": 0.236572265625, "learning_rate": 6.177156177156176e-07, "loss": 0.0002, "reward": 1.4375, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.4375, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "step": 984 }, { "clip_ratio": 0.0, "completion_length": 328.75, "epoch": 1.148018648018648, "grad_norm": 1.5016493797302246, "kl": 0.281982421875, "learning_rate": 6.173271173271172e-07, "loss": 0.0003, "reward": 1.34375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.09375, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 985 }, { "clip_ratio": 0.0, "completion_length": 544.375, "epoch": 1.1491841491841492, "grad_norm": 1.995629906654358, "kl": 0.195068359375, "learning_rate": 6.16938616938617e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.2587745785713196, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 986 }, { "clip_ratio": 0.0, "completion_length": 405.0625, "epoch": 1.1503496503496504, "grad_norm": 0.015881147235631943, "kl": 0.301513671875, "learning_rate": 6.165501165501166e-07, "loss": 0.0003, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/comparison_reward": 0.5, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "step": 987 }, { "clip_ratio": 0.0, "completion_length": 467.3125, "epoch": 1.1515151515151516, "grad_norm": 1.625146746635437, "kl": 0.28955078125, "learning_rate": 6.161616161616161e-07, "loss": 0.0003, "reward": 1.34375, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.34375, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 988 }, { "clip_ratio": 0.0, "completion_length": 483.59375, "epoch": 1.1526806526806528, "grad_norm": 0.014811793342232704, "kl": 0.2509765625, "learning_rate": 6.157731157731158e-07, "loss": 0.0003, "reward": 1.75, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 989 }, { "clip_ratio": 0.0, "completion_length": 433.1875, "epoch": 1.1538461538461537, "grad_norm": 26.932743072509766, "kl": 0.232666015625, "learning_rate": 6.153846153846154e-07, "loss": 0.0002, "reward": 1.15625, "reward_std": 0.2041158601641655, "rewards/accuracy_reward": 0.1875, "rewards/comparison_reward": 0.21875, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 0.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 990 }, { "clip_ratio": 0.0, "completion_length": 475.96875, "epoch": 1.155011655011655, "grad_norm": 1.4806623458862305, "kl": 0.203125, "learning_rate": 6.149961149961149e-07, "loss": 0.0002, "reward": 1.375, "reward_std": 0.13363061845302582, "rewards/accuracy_reward": 0.375, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.6666666666666667, "step": 991 }, { "clip_ratio": 0.0, "completion_length": 668.84375, "epoch": 1.156177156177156, "grad_norm": 0.008837471716105938, "kl": 0.20849609375, "learning_rate": 6.146076146076145e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.6666666666666667, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 992 }, { "clip_ratio": 0.0, "completion_length": 752.28125, "epoch": 1.1573426573426573, "grad_norm": 0.012555418536067009, "kl": 0.184814453125, "learning_rate": 6.142191142191143e-07, "loss": 0.0002, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "step": 993 }, { "clip_ratio": 0.0, "completion_length": 520.75, "epoch": 1.1585081585081585, "grad_norm": 1.4958277940750122, "kl": 0.21044921875, "learning_rate": 6.138306138306139e-07, "loss": 0.0002, "reward": 1.0625, "reward_std": 0.1157275140285492, "rewards/accuracy_reward": 0.3125, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 0.5, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 994 }, { "clip_ratio": 0.0, "completion_length": 555.1875, "epoch": 1.1596736596736597, "grad_norm": 1.36172354221344, "kl": 0.211181640625, "learning_rate": 6.134421134421134e-07, "loss": 0.0002, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.46875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 995 }, { "clip_ratio": 0.0, "completion_length": 729.75, "epoch": 1.1608391608391608, "grad_norm": 0.007925576530396938, "kl": 0.20751953125, "learning_rate": 6.13053613053613e-07, "loss": 0.0002, "reward": 1.5, "reward_std": 0.0, "rewards/accuracy_reward": 0.5, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.5, "rewards/format_reward": 1.0, "step": 996 }, { "clip_ratio": 0.0, "completion_length": 367.3125, "epoch": 1.162004662004662, "grad_norm": 1.675963044166565, "kl": 0.30224609375, "learning_rate": 6.126651126651127e-07, "loss": 0.0003, "reward": 1.90625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward": 0.65625, "rewards/comparison_reward": 0.25, "rewards/comparison_task": 2.0, "rewards/deficiency_task": 2.0, "rewards/format_reward": 1.0, "rewards/score_task": 2.0, "step": 997 }, { "clip_ratio": 0.0, "completion_length": 580.875, "epoch": 1.1631701631701632, "grad_norm": 2.0892698764801025, "kl": 0.206298828125, "learning_rate": 6.122766122766122e-07, "loss": 0.0002, "reward": 1.3125, "reward_std": 0.2177756354212761, "rewards/accuracy_reward": 0.5625, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 0.75, "rewards/score_task": 1.5, "step": 998 }, { "clip_ratio": 0.0, "completion_length": 442.4375, "epoch": 1.1643356643356644, "grad_norm": 0.8382810950279236, "kl": 0.196044921875, "learning_rate": 6.118881118881118e-07, "loss": 0.0002, "reward": 1.21875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.21875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.0, "rewards/format_reward": 1.0, "rewards/score_task": 1.0, "step": 999 }, { "clip_ratio": 0.0, "completion_length": 695.09375, "epoch": 1.1655011655011656, "grad_norm": 1.7531179189682007, "kl": 0.195068359375, "learning_rate": 6.114996114996115e-07, "loss": 0.0002, "reward": 1.46875, "reward_std": 0.0883883461356163, "rewards/accuracy_reward": 0.71875, "rewards/comparison_reward": 0.0, "rewards/deficiency_task": 1.3333333333333333, "rewards/format_reward": 0.75, "rewards/score_task": 2.0, "step": 1000 } ], "logging_steps": 1.0, "max_steps": 2574, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }